Skip to main content

Upload Training Data

Upload paired audio and transcript files to train your custom model.

Endpoint

PUT /api/v3/custom_models/{id}/data

Authentication

Requires API key with custom_models:write scope.

Request

Path Parameters

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| id | integer | Yes | Model ID |

Headers

| Header | Value | Required |
|--------|-------|----------|
| Authorization | Bearer YOUR_API_KEY | Yes |
| Content-Type | multipart/form-data | Yes |

Form Data

| Field | Type | Required | Description |
|-------|------|----------|-------------|
| type | string | Yes | Data type: train or test |
| audio_file | file | Yes | Audio file (MP3, WAV, FLAC, M4A, AAC, OGG) |
| transcript_file | file | Yes | Plain text transcript (.txt) |

Supported Audio Formats

  • MP3 (.mp3)
  • WAV (.wav)
  • FLAC (.flac)
  • M4A (.m4a)
  • AAC (.aac)
  • OGG (.ogg)

Audio Requirements

  • Sample rate: 16kHz or higher recommended
  • Bit rate: 128kbps or higher
  • Channels: Mono or stereo
  • Duration: Any length (will be combined with other files)
  • Quality: Clear speech, minimal background noise

Transcript Requirements

  • Format: Plain text (.txt) file
  • Encoding: UTF-8
  • Content: Verbatim transcription matching audio exactly
  • Accuracy: 95% or higher required
  • Punctuation: Correct punctuation and capitalization
  • Completeness: Include all words spoken, no summarization

Response

Success Response

Status Code: 200 OK

{
"message": "Training data uploaded successfully",
"data_type": "train",
"filename": "cardiology_01.mp3",
"transcript_filename": "cardiology_01.txt",
"training_status": 2
}

After uploading sufficient data, training_status changes from 1 (Not Running) to 2 (Ready to Run).

Examples

cURL

# Upload one audio/transcript pair as training data (type=train) to model 123
curl -X PUT https://api.scriptix.io/api/v3/custom_models/123/data \
-H "Authorization: Bearer YOUR_API_KEY" \
-F "type=train" \
-F "audio_file=@cardiology_01.mp3" \
-F "transcript_file=@cardiology_01.txt"

# Upload one audio/transcript pair as test data (type=test) to the same model
curl -X PUT https://api.scriptix.io/api/v3/custom_models/123/data \
-H "Authorization: Bearer YOUR_API_KEY" \
-F "type=test" \
-F "audio_file=@cardiology_test_01.mp3" \
-F "transcript_file=@cardiology_test_01.txt"

Python

import requests
from pathlib import Path

def upload_training_data(model_id, audio_path, transcript_path, data_type='train'):
    """Upload one paired audio/transcript file to a custom model.

    Sends a multipart ``PUT`` request to
    ``/api/v3/custom_models/{model_id}/data``.

    Args:
        model_id: ID of the custom model that receives the data.
        audio_path: Path to the audio file.
        transcript_path: Path to the matching plain-text transcript.
        data_type: ``'train'`` or ``'test'``.

    Returns:
        The decoded JSON body of a successful response.

    Raises:
        OSError: If either file cannot be opened.
        requests.HTTPError: If the API answers with a 4xx/5xx status; the
            error payload stays available on ``exc.response``.
    """
    url = f"https://api.scriptix.io/api/v3/custom_models/{model_id}/data"
    headers = {"Authorization": "Bearer YOUR_API_KEY"}

    # The context manager closes both handles even if the second open() or
    # the request itself raises.
    with open(audio_path, 'rb') as audio_file, \
            open(transcript_path, 'rb') as transcript_file:
        response = requests.put(
            url,
            headers=headers,
            files={'audio_file': audio_file, 'transcript_file': transcript_file},
            data={'type': data_type},
        )

    # Turn error responses into exceptions so callers never mistake an error
    # payload for a successful upload.
    response.raise_for_status()
    return response.json()

# Upload a single audio/transcript pair as training data
file_pair = ('audio/cardiology_01.mp3', 'transcripts/cardiology_01.txt')
result = upload_training_data(123, *file_pair, data_type='train')
print(f"Uploaded: {result['filename']}")

Python - Batch Upload

import os
from pathlib import Path

def batch_upload_training_data(model_id, audio_dir, transcript_dir, data_type='train'):
    """Upload every audio file in a directory together with its transcript.

    A transcript is matched by name: ``<audio stem>.txt`` inside
    ``transcript_dir``. Audio files without a transcript are reported and
    counted as failed. A summary line is printed at the end.

    Args:
        model_id: ID of the custom model that receives the data.
        audio_dir: Directory holding the audio files.
        transcript_dir: Directory holding the ``.txt`` transcripts.
        data_type: ``'train'`` or ``'test'``; passed through to
            ``upload_training_data``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    # Every audio format the endpoint documents, matched case-insensitively
    # so that e.g. "CLIP.WAV" is not silently skipped.
    supported_suffixes = {'.mp3', '.wav', '.flac', '.m4a', '.aac', '.ogg'}
    audio_files = sorted(
        candidate
        for candidate in audio_dir.glob('*')
        if candidate.is_file() and candidate.suffix.lower() in supported_suffixes
    )

    uploaded = 0
    failed = 0

    for audio_file in audio_files:
        # Find matching transcript (same name, .txt extension)
        transcript_file = transcript_dir / f"{audio_file.stem}.txt"

        if not transcript_file.exists():
            print(f"Warning: No transcript found for {audio_file.name}")
            failed += 1
            continue

        try:
            upload_training_data(
                model_id,
                str(audio_file),
                str(transcript_file),
                data_type=data_type,
            )
        except Exception as e:
            # Batch boundary: report the failure and keep going with the
            # remaining files instead of aborting the whole run.
            print(f"✗ Failed: {audio_file.name} - {e}")
            failed += 1
        else:
            print(f"✓ Uploaded: {audio_file.name}")
            uploaded += 1

    print(f"\nUploaded: {uploaded}, Failed: {failed}")

# Usage: upload the training split first, then the test split
upload_jobs = [
    ('data/training/audio', 'data/training/transcripts', 'train'),
    ('data/test/audio', 'data/test/transcripts', 'test'),
]
for job_audio_dir, job_transcript_dir, job_type in upload_jobs:
    batch_upload_training_data(
        123,
        audio_dir=job_audio_dir,
        transcript_dir=job_transcript_dir,
        data_type=job_type,
    )

JavaScript

const FormData = require('form-data');
const fs = require('fs');
const axios = require('axios');

/**
 * Upload one paired audio/transcript file to a custom model.
 *
 * @param {number} modelId - ID of the custom model that receives the data.
 * @param {string} audioPath - Path to the audio file.
 * @param {string} transcriptPath - Path to the matching plain-text transcript.
 * @param {string} [dataType='train'] - Either 'train' or 'test'.
 * @returns {Promise<Object>} The parsed response body.
 */
async function uploadTrainingData(modelId, audioPath, transcriptPath, dataType = 'train') {
  const formData = new FormData();
  formData.append('type', dataType);
  formData.append('audio_file', fs.createReadStream(audioPath));
  formData.append('transcript_file', fs.createReadStream(transcriptPath));

  const response = await axios.put(
    `https://api.scriptix.io/api/v3/custom_models/${modelId}/data`,
    formData,
    {
      headers: {
        'Authorization': 'Bearer YOUR_API_KEY',
        // Adds the multipart Content-Type header including the boundary.
        ...formData.getHeaders()
      },
      // Audio files may be up to 500MB; some axios versions enforce a much
      // smaller default request-body cap, so lift it explicitly.
      maxBodyLength: Infinity
    }
  );

  return response.data;
}

// Usage
uploadTrainingData(
  123,
  'audio/cardiology_01.mp3',
  'transcripts/cardiology_01.txt',
  'train'
)
  .then(result => {
    console.log(`Uploaded: ${result.filename}`);
  })
  .catch(error => {
    // Without a handler, a failed upload surfaces as an unhandled promise rejection.
    console.error(`Upload failed: ${error.message}`);
  });

Error Responses

400 Bad Request - Validation Error

{
"error": "Validation Error",
"message": "Invalid file or parameters",
"details": {
"type": "Must be 'train' or 'test'",
"audio_file": "Unsupported audio format",
"transcript_file": "Transcript file is required"
}
}

Common validation errors:

  • Invalid type value (must be exactly "train" or "test")
  • Unsupported audio format
  • Missing audio or transcript file
  • Empty files
  • Corrupted files

400 Bad Request - Transcript Mismatch

{
"error": "Bad Request",
"message": "Transcript does not match audio duration",
"error_code": "TRANSCRIPT_MISMATCH",
"details": {
"audio_duration": 300,
"transcript_word_count": 50,
"expected_word_count": "approximately 450-600"
}
}

409 Conflict - Model Not Ready

{
"error": "Conflict",
"message": "Cannot upload data: model is training",
"error_code": "MODEL_TRAINING"
}

Solution: Wait for training to complete (status 4 or 5) before uploading more data.

413 Payload Too Large

{
"error": "Payload Too Large",
"message": "Audio file exceeds maximum size",
"details": {
"max_size_mb": 500,
"file_size_mb": 650
}
}

Solution: Split large files into smaller segments.

Data Preparation Guidelines

Audio Quality

✅ Good Audio:

  • Clear speech
  • Minimal background noise
  • Consistent volume
  • Sample rate 16kHz+
  • No clipping or distortion

❌ Poor Audio:

  • Heavy background noise
  • Multiple overlapping speakers
  • Very low volume
  • Severe compression artifacts
  • Audio dropouts

Transcript Accuracy

✅ Good Transcript:

The patient presents with acute myocardial infarction.
Electrocardiogram shows ST-segment elevation in leads
V2 through V4. Recommend immediate cardiac catheterization.

❌ Poor Transcript:

patient has heart attack
ekg shows problem
needs procedure

File Organization

Recommended directory structure:

training_data/
├── train/
│   ├── audio/
│   │   ├── file_001.mp3
│   │   ├── file_002.mp3
│   │   └── ...
│   └── transcripts/
│       ├── file_001.txt
│       ├── file_002.txt
│       └── ...
└── test/
    ├── audio/
    │   ├── test_001.mp3
    │   └── ...
    └── transcripts/
        ├── test_001.txt
        └── ...

Data Split Recommendations

| Total Hours | Training | Test | Validation |
|-------------|----------|------|------------|
| 5-10 hours | 80% | 20% | - |
| 10-30 hours | 80% | 15% | 5% |
| 30+ hours | 80% | 10% | 10% |

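Example for 20 hours total, split across only the two upload types this endpoint accepts (train and test, with no separate validation share):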

  • Training: 16 hours (80%) - Upload with type=train
  • Test: 4 hours (20%) - Upload with type=test

Training vs Test Data

Training Data (type=train)

  • Used to train the model
  • Should represent production use case
  • Include variety of speakers, topics, conditions
  • Minimum 5 hours required

Upload priority: Upload training data first

Test Data (type=test)

  • Used to evaluate model performance
  • Should be separate from training data
  • Represents real-world accuracy
  • Recommended 20% of total data

Upload priority: Upload after training data

⚠️ Important: No Overlap

Never use the same audio in both training and test sets. Overlap leaks training data into the evaluation, so accuracy metrics are inflated and overfitting goes undetected.

Upload Progress Tracking

def upload_with_progress(model_id, audio_dir, transcript_dir):
    """Upload every ``.mp3`` in a directory, printing progress per file.

    Each audio file is paired with ``<same name>.txt`` in ``transcript_dir``
    and sent through ``upload_training_data`` with its default data type.
    Failures are printed and do not stop the run. The final model status is
    printed at the end.

    Relies on ``upload_training_data`` and ``get_model`` being defined
    elsewhere; ``get_model`` is expected to return a mapping with a
    ``'status_message'`` key.

    Args:
        model_id: ID of the custom model that receives the data.
        audio_dir: Directory holding the ``.mp3`` files.
        transcript_dir: Directory holding the ``.txt`` transcripts.
    """
    from glob import glob
    import os
    import time

    audio_files = glob(f"{audio_dir}/*.mp3")
    total = len(audio_files)

    print(f"Found {total} audio files")

    for i, audio_path in enumerate(audio_files, 1):
        filename = os.path.basename(audio_path)
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".mp3" that happens to appear inside the file name.
        stem, _ = os.path.splitext(filename)
        transcript_path = f"{transcript_dir}/{stem}.txt"

        print(f"\n[{i}/{total}] Uploading {filename}...")

        try:
            result = upload_training_data(model_id, audio_path, transcript_path)
            print(f" ✓ Success - Status: {result['training_status']}")

            # Small delay to avoid rate limits
            time.sleep(1)

        except Exception as e:
            # Report and continue so one bad file does not abort the run.
            print(f" ✗ Error: {e}")

    # Check final status
    model = get_model(model_id)
    print(f"\nFinal status: {model['status_message']}")

Validation Before Upload

import wave
import os

def validate_audio_file(audio_path):
    """Validate an audio file locally before uploading it.

    Checks, in order: the path exists, it is a regular file, it is not
    empty, it is at most 500MB, and, for ``.wav`` files only, that the
    header is readable and the sample rate is at least 16kHz.

    Args:
        audio_path: Path to the audio file (``str`` or ``pathlib.Path``).

    Returns:
        A ``(valid, message)`` tuple; ``message`` is ``"Valid"`` on success
        and a human-readable reason otherwise.
    """
    # Check file exists
    if not os.path.exists(audio_path):
        return False, "File not found"
    if not os.path.isfile(audio_path):
        return False, "Not a regular file"

    # Check file size
    size_bytes = os.path.getsize(audio_path)
    if size_bytes == 0:
        return False, "File is empty"
    size_mb = size_bytes / (1024 * 1024)
    if size_mb > 500:
        return False, f"File too large: {size_mb:.1f}MB (max 500MB)"

    # Check audio properties (for WAV files). os.fspath accepts both str and
    # pathlib.Path, and lower() lets ".WAV" match as well.
    path_text = os.fspath(audio_path)
    if path_text.lower().endswith('.wav'):
        try:
            with wave.open(path_text, 'rb') as wav:
                sample_rate = wav.getframerate()
        except Exception as e:
            # Any parse failure means the file is not a usable WAV.
            return False, f"Invalid WAV file: {e}"
        if sample_rate < 16000:
            return False, f"Sample rate too low: {sample_rate}Hz (min 16kHz)"

    return True, "Valid"

def validate_transcript_file(transcript_path):
    """Validate a transcript file locally before uploading it.

    Checks, in order: the path exists, it is a regular file, it is not
    larger than 1000KB, it decodes as UTF-8, it is not blank, and it
    contains at least 10 whitespace-separated words.

    Args:
        transcript_path: Path to the transcript file.

    Returns:
        A ``(valid, message)`` tuple; ``message`` is ``"Valid"`` on success
        and a human-readable reason otherwise.
    """
    if not os.path.exists(transcript_path):
        return False, "File not found"
    # A directory passes exists() but would make open() raise below.
    if not os.path.isfile(transcript_path):
        return False, "Not a regular file"

    # Check file size
    size_kb = os.path.getsize(transcript_path) / 1024
    if size_kb > 1000:
        return False, f"File too large: {size_kb:.1f}KB (max 1MB)"

    # Check content; only the read can fail with a decoding error.
    try:
        with open(transcript_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except UnicodeDecodeError:
        return False, "File not UTF-8 encoded"

    if not content.strip():
        return False, "File is empty"

    word_count = len(content.split())
    if word_count < 10:
        return False, f"Too few words: {word_count} (min 10)"

    return True, "Valid"

# Usage: validate locally and report the reason when the file is rejected
check_result = validate_audio_file('audio.mp3')
valid, message = check_result
if not valid:
    print(f"Invalid audio: {message}")

Best Practices

1. Upload in Batches

Upload files in batches to track progress and handle errors:

# Upload 10 files at a time
batch_size = 10
batch_starts = range(0, len(audio_files), batch_size)
for start in batch_starts:
    upload_batch(audio_files[start:start + batch_size])
    time.sleep(10)  # Rate limiting

2. Verify After Upload

Check model status after all uploads:

model = get_model(model_id)
# training_status 2 means "Ready to Run"
is_ready = model['training_status'] == 2
print("✓ Ready to train!" if is_ready else f"Status: {model['status_message']}")

3. Keep Backups

Always keep original audio and transcripts:

# Before uploading, archive the whole training_data/ directory into a gzip-compressed tarball
tar -czf training_data_backup.tar.gz training_data/

4. Document Data Sources

Track what data was uploaded:

import json

# Create manifest: a record of what was uploaded to which model.
# The file lists are examples; replace them with the files you uploaded.
manifest = {
    "model_id": 123,
    "upload_date": "2025-01-15",
    "training_files": ["file_001.mp3", "file_002.mp3"],
    "test_files": ["test_001.mp3"],
    "total_hours": 20.5,
}

with open('upload_manifest.json', 'w', encoding='utf-8') as f:
    json.dump(manifest, f, indent=2)

Rate Limits

  • Requests: 50 requests/hour
  • Concurrent: 5 concurrent uploads
  • File size: 500MB max per file

Next Steps

After uploading training and test data:

  1. Verify Status: Model should have training_status = 2 (Ready to Run)
  2. Start Training: Train Model
  3. Monitor Progress: Poll status endpoint during training

See Custom Models Overview for complete workflow.