Transcribe audio files with Streaming
This guide shows you how to transcribe WAV audio files with varying sample rates using our Streaming API.
Quickstart
Here is the complete AssemblyAI Python SDK script to transcribe a WAV audio file using the Streaming API.
import sys
from typing import Type

import assemblyai as aai
from assemblyai.streaming.v3 import (
    BeginEvent,
    StreamingClient,
    StreamingClientOptions,
    StreamingError,
    StreamingEvents,
    StreamingParameters,
    TerminationEvent,
    TurnEvent,
)

YOUR_API_KEY = "YOUR_API_KEY"  # Replace with your AssemblyAI API key
AUDIO_FILE = "audio.wav"  # Path to your audio file
SAMPLE_RATE = 48000  # Change to match the sample rate of your audio file
SAVE_TRANSCRIPT_TO_FILE = True  # Set to False to disable saving transcript to file
PLAY_AUDIO = True  # Set to False to disable audio playback

# Track session data for output file
session_data = {
    "session_id": None,
    "parameters": None,
    "audio_file": AUDIO_FILE,
    "audio_duration_seconds": None,
    "turns": [],
}


def on_begin(self: Type[StreamingClient], event: BeginEvent):
    """Called when the streaming connection has been established."""
    session_data["session_id"] = event.id
    print("Session ID:", event.id, "\n")


def on_turn(self: Type[StreamingClient], event: TurnEvent):
    """Called each time a transcript update (partial or final) is received."""
    # Skip empty transcripts
    if not event.transcript:
        return

    # Determine status label
    if not event.end_of_turn:
        status = "[Partial]"
    elif event.turn_is_formatted:
        status = "[Final (formatted)]"
    else:
        status = "[Final (unformatted)]"

    print(f"{status}: {event.transcript}")

    # Track final turns (formatted if formatting is enabled, otherwise just final)
    is_final = event.end_of_turn and (
        not streaming_params.format_turns or event.turn_is_formatted
    )
    if is_final:
        session_data["turns"].append(event.transcript)
        print()  # Add blank line after final formatted turn for cleaner output


def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
    """Called when the session has ended."""
    session_data["audio_duration_seconds"] = event.audio_duration_seconds
    print(
        f"Session terminated: {event.audio_duration_seconds} seconds of audio processed"
    )


def on_error(self: Type[StreamingClient], error: StreamingError):
    """Called when a streaming error occurs."""
    print(f"Error occurred: {error}")


def save_transcript():
    """Save the transcript to a file in the current working directory.

    NOTE: the file is written relative to the CWD, not the script's
    directory (the open() call below uses a relative path).
    """
    from pathlib import Path

    # Get the audio file name (handles both absolute and relative paths)
    audio_name = Path(session_data["audio_file"]).stem

    # Generate filename: {file_name}_{session_id}.txt
    session_id = session_data["session_id"] or "unknown"
    output_file = f"{audio_name}_{session_id}.txt"

    with open(output_file, "w") as f:
        f.write(f"AssemblyAI Session ID: {session_data['session_id']}\n")
        f.write(f"Audio file: {session_data['audio_file']}\n")
        f.write(f"Audio duration: {session_data['audio_duration_seconds']} seconds\n")
        f.write(f"Parameters used: {session_data['parameters']}\n")
        f.write("See all available parameters and defaults at https://www.assemblyai.com/docs/api-reference/streaming-api/streaming-api#request.query\n\n")

        f.write("\nTranscription Output\n")
        for i, turn in enumerate(session_data["turns"], 1):
            f.write(f"[Turn #{i}]: {turn}\n")

    print(f"Transcript saved to {output_file}")


# Create the streaming client
client = StreamingClient(
    StreamingClientOptions(
        api_key=YOUR_API_KEY
    )
)

client.on(StreamingEvents.Begin, on_begin)
client.on(StreamingEvents.Turn, on_turn)
client.on(StreamingEvents.Termination, on_terminated)
client.on(StreamingEvents.Error, on_error)


def validate_audio_file(filepath: str, sample_rate: int):
    """Validate that the file is a readable mono WAV at the expected rate.

    Exits the process with a helpful ffmpeg conversion hint on failure.
    """
    import wave
    from pathlib import Path

    # Check file extension
    file_ext = Path(filepath).suffix.lower()
    if file_ext != ".wav":
        print(f"Error: Only WAV files are supported. Got: {file_ext}", file=sys.stderr)
        print(f"Convert your file to WAV using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
        sys.exit(1)

    try:
        with wave.open(filepath, 'rb') as wav_file:
            if wav_file.getnchannels() != 1:
                print("Error: Only mono audio is supported", file=sys.stderr)
                print(f"Convert your file to mono using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
                sys.exit(1)

            file_sample_rate = wav_file.getframerate()
            if file_sample_rate != sample_rate:
                print(f"Error: File sample rate ({file_sample_rate}) doesn't match expected rate ({sample_rate})", file=sys.stderr)
                print(f"Either update SAMPLE_RATE to {file_sample_rate}, or convert your file using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
                sys.exit(1)
    except (FileNotFoundError, wave.Error, EOFError) as e:
        # A .wav extension doesn't guarantee valid WAV content; fail with a
        # clear message instead of an unhandled traceback.
        print(f"Error: Could not read {filepath} as a WAV file: {e}", file=sys.stderr)
        sys.exit(1)


def stream_file(filepath: str, sample_rate: int, play_audio: bool = False):
    """Yield the audio file in 50ms chunks, optionally playing audio.

    When playback is enabled, the blocking Stream.write() paces the chunks;
    otherwise a sleep keeps the stream real-time.
    """
    import time
    import wave

    chunk_duration = 0.05
    audio_player = None
    p = None

    if play_audio:
        try:
            import pyaudio
            p = pyaudio.PyAudio()
            with wave.open(filepath, 'rb') as wav_file:
                audio_player = p.open(
                    format=p.get_format_from_width(wav_file.getsampwidth()),
                    channels=wav_file.getnchannels(),
                    rate=wav_file.getframerate(),
                    output=True
                )
        except ImportError:
            print("Warning: pyaudio not installed. Audio playback disabled.", file=sys.stderr)
            print("Install with: pip install pyaudio", file=sys.stderr)
            play_audio = False
        except OSError as e:
            # pyaudio is installed but no usable output device (or the stream
            # couldn't be opened): degrade to silent streaming instead of crashing.
            print(f"Warning: could not open audio output ({e}). Audio playback disabled.", file=sys.stderr)
            if p is not None:
                p.terminate()
            p = None
            audio_player = None
            play_audio = False

    try:
        with wave.open(filepath, 'rb') as wav_file:
            frames_per_chunk = int(sample_rate * chunk_duration)

            while True:
                frames = wav_file.readframes(frames_per_chunk)

                if not frames:
                    break

                if audio_player:
                    audio_player.write(frames)
                else:
                    time.sleep(chunk_duration)

                yield frames
    finally:
        if audio_player:
            audio_player.stop_stream()
            audio_player.close()
            p.terminate()


# Validate audio file before connecting
validate_audio_file(AUDIO_FILE, SAMPLE_RATE)

file_stream = stream_file(
    filepath=AUDIO_FILE,
    sample_rate=SAMPLE_RATE,
    play_audio=PLAY_AUDIO,
)

# Configure streaming parameters
streaming_params = StreamingParameters(
    sample_rate=SAMPLE_RATE,
    format_turns=True,
    speech_model="universal-streaming-english",
)

# Store parameters for output file (dynamically capture all set parameters)
session_data["parameters"] = ", ".join(
    f"{k}={v}" for k, v in streaming_params.__dict__.items() if v is not None
)

# Warn if using default turn detection parameters
turn_params = ["end_of_turn_confidence_threshold", "min_end_of_turn_silence_when_confident", "max_turn_silence"]
if not any(getattr(streaming_params, p, None) is not None for p in turn_params):
    print("Warning: Using default turn detection parameters. For best results, fine-tune to your use case:")
    print("https://www.assemblyai.com/docs/universal-streaming/turn-detection#quick-start-configurations\n")

client.connect(streaming_params)

try:
    client.stream(file_stream)
finally:
    client.disconnect(terminate=True)
    if SAVE_TRANSCRIPT_TO_FILE:
        save_transcript()
Step-by-step guide
Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up and get your API key from your dashboard.
Install and import packages
Install the AssemblyAI Python SDK and pyaudio for audio playback.
$ pip install assemblyai
$ pip install pyaudio
Import packages.
# Standard library
import sys
from typing import Type

# AssemblyAI SDK
import assemblyai as aai
from assemblyai.streaming.v3 import (
    BeginEvent,
    StreamingClient,
    StreamingClientOptions,
    StreamingError,
    StreamingEvents,
    StreamingParameters,
    TerminationEvent,
    TurnEvent,
)
Configure settings
Replace YOUR_API_KEY with your API key.
Set AUDIO_FILE to the relative or absolute path of your audio file, and set SAMPLE_RATE to match your file’s sample rate.
See Connect the client to configure all other stream parameters.
# --- Configuration: edit these before running ---
YOUR_API_KEY = "YOUR_API_KEY"  # Replace with your AssemblyAI API key
AUDIO_FILE = "audio.wav"  # Relative or absolute path to your audio file
SAMPLE_RATE = 48000  # Must match the sample rate of AUDIO_FILE
SAVE_TRANSCRIPT_TO_FILE = True  # Set to False to skip writing the transcript file
PLAY_AUDIO = True  # Set to False to stream silently

# Accumulates everything the output file needs while the session runs.
session_data = dict(
    session_id=None,
    parameters=None,
    audio_file=AUDIO_FILE,
    audio_duration_seconds=None,
    turns=[],
)
WebSocket Event Handlers
def on_begin(self: Type[StreamingClient], event: BeginEvent):
    """Handle the session-opened event: record and print the session ID."""
    session_data["session_id"] = event.id
    print("Session ID:", event.id, "\n")

def on_turn(self: Type[StreamingClient], event: TurnEvent):
    """Handle each transcript update (partial or final) from the server."""
    text = event.transcript
    if not text:
        return

    # Label the update by how final it is.
    if event.end_of_turn:
        status = "[Final (formatted)]" if event.turn_is_formatted else "[Final (unformatted)]"
    else:
        status = "[Partial]"

    print(f"{status}: {text}")

    # A turn is recorded once formatting (when requested) has been applied.
    formatting_done = (not streaming_params.format_turns) or event.turn_is_formatted
    if event.end_of_turn and formatting_done:
        session_data["turns"].append(text)
        print()  # visual separator between turns

def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
    """Handle session termination: record duration and report it."""
    session_data["audio_duration_seconds"] = event.audio_duration_seconds
    print(
        f"Session terminated: {event.audio_duration_seconds} seconds of audio processed"
    )

def on_error(self: Type[StreamingClient], error: StreamingError):
    """Report streaming errors to stdout."""
    print(f"Error occurred: {error}")
Create the streaming client
# Create the streaming client and wire up all event handlers.
client = StreamingClient(StreamingClientOptions(api_key=YOUR_API_KEY))

for _event_type, _handler in (
    (StreamingEvents.Begin, on_begin),
    (StreamingEvents.Turn, on_turn),
    (StreamingEvents.Termination, on_terminated),
    (StreamingEvents.Error, on_error),
):
    client.on(_event_type, _handler)
Helper functions for streaming files
The following helper functions are used to validate audio files, stream audio in chunks, and save the transcript output:
- validate_audio_file() - Validates that the audio file is a mono WAV file with the expected sample rate.
- stream_file() - Streams the audio file in 50ms chunks, optionally playing audio through speakers.
- save_transcript() - Saves the transcript to a text file after the session ends.
def save_transcript():
    """Save the transcript to a file in the current working directory.

    NOTE: the file is written relative to the CWD, not the script's
    directory (the open() call below uses a relative path).
    """
    from pathlib import Path

    # Get the audio file name (handles both absolute and relative paths)
    audio_name = Path(session_data["audio_file"]).stem

    # Generate filename: {file_name}_{session_id}.txt
    session_id = session_data["session_id"] or "unknown"
    output_file = f"{audio_name}_{session_id}.txt"

    with open(output_file, "w") as f:
        f.write(f"AssemblyAI Session ID: {session_data['session_id']}\n")
        f.write(f"Audio file: {session_data['audio_file']}\n")
        f.write(f"Audio duration: {session_data['audio_duration_seconds']} seconds\n")
        f.write(f"Parameters used: {session_data['parameters']}\n")
        f.write("See all available parameters and defaults at https://www.assemblyai.com/docs/api-reference/streaming-api/streaming-api#request.query\n\n")

        f.write("\nTranscription Output\n")
        for i, turn in enumerate(session_data["turns"], 1):
            f.write(f"[Turn #{i}]: {turn}\n")

    print(f"Transcript saved to {output_file}")

def validate_audio_file(filepath: str, sample_rate: int):
    """Validate that the file is a readable mono WAV at the expected rate.

    Exits the process with a helpful ffmpeg conversion hint on failure.
    """
    import wave
    from pathlib import Path

    # Check file extension
    file_ext = Path(filepath).suffix.lower()
    if file_ext != ".wav":
        print(f"Error: Only WAV files are supported. Got: {file_ext}", file=sys.stderr)
        print(f"Convert your file to WAV using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
        sys.exit(1)

    try:
        with wave.open(filepath, 'rb') as wav_file:
            if wav_file.getnchannels() != 1:
                print("Error: Only mono audio is supported", file=sys.stderr)
                print(f"Convert your file to mono using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
                sys.exit(1)

            file_sample_rate = wav_file.getframerate()
            if file_sample_rate != sample_rate:
                print(f"Error: File sample rate ({file_sample_rate}) doesn't match expected rate ({sample_rate})", file=sys.stderr)
                print(f"Either update SAMPLE_RATE to {file_sample_rate}, or convert your file using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
                sys.exit(1)
    except (FileNotFoundError, wave.Error, EOFError) as e:
        # A .wav extension doesn't guarantee valid WAV content; fail with a
        # clear message instead of an unhandled traceback.
        print(f"Error: Could not read {filepath} as a WAV file: {e}", file=sys.stderr)
        sys.exit(1)

def stream_file(filepath: str, sample_rate: int, play_audio: bool = False):
    """Yield the audio file in 50ms chunks, optionally playing audio.

    When playback is enabled, the blocking Stream.write() paces the chunks;
    otherwise a sleep keeps the stream real-time.
    """
    import time
    import wave

    chunk_duration = 0.05
    audio_player = None
    p = None

    if play_audio:
        try:
            import pyaudio
            p = pyaudio.PyAudio()
            with wave.open(filepath, 'rb') as wav_file:
                audio_player = p.open(
                    format=p.get_format_from_width(wav_file.getsampwidth()),
                    channels=wav_file.getnchannels(),
                    rate=wav_file.getframerate(),
                    output=True
                )
        except ImportError:
            print("Warning: pyaudio not installed. Audio playback disabled.", file=sys.stderr)
            print("Install with: pip install pyaudio", file=sys.stderr)
            play_audio = False
        except OSError as e:
            # pyaudio is installed but no usable output device (or the stream
            # couldn't be opened): degrade to silent streaming instead of crashing.
            print(f"Warning: could not open audio output ({e}). Audio playback disabled.", file=sys.stderr)
            if p is not None:
                p.terminate()
            p = None
            audio_player = None
            play_audio = False

    try:
        with wave.open(filepath, 'rb') as wav_file:
            frames_per_chunk = int(sample_rate * chunk_duration)

            while True:
                frames = wav_file.readframes(frames_per_chunk)

                if not frames:
                    break

                if audio_player:
                    audio_player.write(frames)
                else:
                    time.sleep(chunk_duration)

                yield frames
    finally:
        if audio_player:
            audio_player.stop_stream()
            audio_player.close()
            p.terminate()

# Validate audio file before connecting
validate_audio_file(AUDIO_FILE, SAMPLE_RATE)

file_stream = stream_file(
    filepath=AUDIO_FILE,
    sample_rate=SAMPLE_RATE,
    play_audio=PLAY_AUDIO,
)
Connect the client
A warning is printed if default turn detection parameters are used. This is fine for testing, but for best accuracy and optimal performance, see our recommended settings.
# Configure streaming parameters
streaming_params = StreamingParameters(
    sample_rate=SAMPLE_RATE,
    format_turns=True,
    speech_model="universal-streaming-english",
)

# Record every explicitly-set parameter so it can be echoed into the transcript file.
_set_params = [f"{k}={v}" for k, v in streaming_params.__dict__.items() if v is not None]
session_data["parameters"] = ", ".join(_set_params)

# Nudge the user toward tuning turn detection instead of relying on defaults.
_turn_detection_params = (
    "end_of_turn_confidence_threshold",
    "min_end_of_turn_silence_when_confident",
    "max_turn_silence",
)
if all(getattr(streaming_params, name, None) is None for name in _turn_detection_params):
    print("Warning: Using default turn detection parameters. For best results, fine-tune to your use case:")
    print("https://www.assemblyai.com/docs/universal-streaming/turn-detection#quick-start-configurations\n")

client.connect(streaming_params)
Stream the file
# Stream the file. The finally-block guarantees the session is torn down
# and the transcript is (optionally) written even if streaming raises.
try:
    client.stream(file_stream)
finally:
    client.disconnect(terminate=True)
    if SAVE_TRANSCRIPT_TO_FILE:
        save_transcript()
The session will terminate once the file is finished streaming. If SAVE_TRANSCRIPT_TO_FILE is enabled (default), the transcript will be saved to {audio_filename}_{session_id}.txt in the current working directory.
The AUDIO_FILE path can be either relative (e.g., audio.wav) or absolute (e.g., /path/to/audio.wav).
Example output
Here’s an example of what the console output looks like when streaming an audio file:
1 Warning: Using default turn detection parameters. For best results, fine-tune to your use case: 2 https://www.assemblyai.com/docs/universal-streaming/turn-detection#quick-start-configurations 3 4 Session ID: f37d7c4e-6be9-47ed-b6fc-7600fc78e34d 5 6 [Partial]: the 7 [Partial]: the quick 8 [Partial]: the quick brown 9 [Partial]: the quick brown fox 10 [Partial]: the quick brown fox jumps 11 [Partial]: the quick brown fox jumps over 12 [Partial]: the quick brown fox jumps over the 13 [Partial]: the quick brown fox jumps over the lazy 14 [Partial]: the quick brown fox jumps over the lazy dog 15 [Final (unformatted)]: the quick brown fox jumps over the lazy dog 16 [Final (formatted)]: The quick brown fox jumps over the lazy dog. 17 18 [Partial]: it 19 [Partial]: it is 20 [Partial]: it is a 21 [Partial]: it is a common 22 [Partial]: it is a common typing 23 [Partial]: it is a common typing test 24 [Final (unformatted)]: it is a common typing test 25 [Final (formatted)]: It is a common typing test. 26 27 Session terminated: 7.52 seconds of audio processed 28 Transcript saved to audio_f37d7c4e-6be9-47ed-b6fc-7600fc78e34d.txt
The output shows:
- Partial transcripts: Real-time updates as words are recognized
- Final (unformatted): The complete turn before formatting is applied
- Final (formatted): The final transcript with proper capitalization and punctuation