Transcribe audio files with Streaming

This guide shows you how to transcribe WAV audio files with varying sample rates using our Streaming API.

Quickstart

Here is the complete AssemblyAI Python SDK script to transcribe a WAV audio file using the Streaming API.

import assemblyai as aai
from assemblyai.streaming.v3 import (
    BeginEvent,
    StreamingClient,
    StreamingClientOptions,
    StreamingError,
    StreamingEvents,
    StreamingParameters,
    TerminationEvent,
    TurnEvent
)
from typing import Type
import sys

YOUR_API_KEY = "YOUR_API_KEY"  # Replace with your AssemblyAI API key
AUDIO_FILE = "audio.wav"  # Path to your audio file
SAMPLE_RATE = 48000  # Change to match the sample rate of your audio file
SAVE_TRANSCRIPT_TO_FILE = True  # Set to False to disable saving transcript to file
PLAY_AUDIO = True  # Set to False to disable audio playback

# Track session data for output file
session_data = {
    "session_id": None,
    "parameters": None,
    "audio_file": AUDIO_FILE,
    "audio_duration_seconds": None,
    "turns": []
}

def on_begin(self: Type[StreamingClient], event: BeginEvent):
    "This function is called when the connection has been established."

    session_data["session_id"] = event.id
    print("Session ID:", event.id, "\n")

def on_turn(self: Type[StreamingClient], event: TurnEvent):
    "This function is called when a new transcript has been received."

    # Skip empty transcripts
    if not event.transcript:
        return

    # Determine status label
    if not event.end_of_turn:
        status = "[Partial]"
    elif event.turn_is_formatted:
        status = "[Final (formatted)]"
    else:
        status = "[Final (unformatted)]"

    print(f"{status}: {event.transcript}")

    # Track final turns (formatted if formatting is enabled, otherwise just final)
    is_final = event.end_of_turn and (not streaming_params.format_turns or event.turn_is_formatted)
    if is_final:
        session_data["turns"].append(event.transcript)
        print()  # Add blank line after final formatted turn for cleaner output

def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
    "This function is called when the session has ended."

    session_data["audio_duration_seconds"] = event.audio_duration_seconds
    print(
        f"Session terminated: {event.audio_duration_seconds} seconds of audio processed"
    )

def on_error(self: Type[StreamingClient], error: StreamingError):
    "This function is called when an error occurs."

    print(f"Error occurred: {error}")

def save_transcript():
    "Save the transcript to a file in the same directory as the script."
    from pathlib import Path

    # Get the audio file name (handles both absolute and relative paths)
    audio_name = Path(session_data["audio_file"]).stem

    # Generate filename: {file_name}_{session_id}.txt in the same directory as script
    session_id = session_data["session_id"] or "unknown"
    output_file = f"{audio_name}_{session_id}.txt"

    with open(output_file, "w") as f:
        f.write(f"AssemblyAI Session ID: {session_data['session_id']}\n")
        f.write(f"Audio file: {session_data['audio_file']}\n")
        f.write(f"Audio duration: {session_data['audio_duration_seconds']} seconds\n")
        f.write(f"Parameters used: {session_data['parameters']}\n")
        f.write("See all available parameters and defaults at https://www.assemblyai.com/docs/api-reference/streaming-api/streaming-api#request.query\n\n")

        f.write("\nTranscription Output\n")
        for i, turn in enumerate(session_data["turns"], 1):
            f.write(f"[Turn #{i}]: {turn}\n")

    print(f"Transcript saved to {output_file}")


# Create the streaming client
client = StreamingClient(
    StreamingClientOptions(
        api_key=YOUR_API_KEY
    )
)

client.on(StreamingEvents.Begin, on_begin)
client.on(StreamingEvents.Turn, on_turn)
client.on(StreamingEvents.Termination, on_terminated)
client.on(StreamingEvents.Error, on_error)

def validate_audio_file(filepath: str, sample_rate: int):
    """Validate audio file before streaming"""
    import wave
    from pathlib import Path

    # Check file extension
    file_ext = Path(filepath).suffix.lower()
    if file_ext != ".wav":
        print(f"Error: Only WAV files are supported. Got: {file_ext}", file=sys.stderr)
        print(f"Convert your file to WAV using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
        sys.exit(1)

    with wave.open(filepath, 'rb') as wav_file:
        if wav_file.getnchannels() != 1:
            print("Error: Only mono audio is supported", file=sys.stderr)
            print(f"Convert your file to mono using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
            sys.exit(1)

        file_sample_rate = wav_file.getframerate()
        if file_sample_rate != sample_rate:
            print(f"Error: File sample rate ({file_sample_rate}) doesn't match expected rate ({sample_rate})", file=sys.stderr)
            print(f"Either update SAMPLE_RATE to {file_sample_rate}, or convert your file using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
            sys.exit(1)

def stream_file(filepath: str, sample_rate: int, play_audio: bool = False):
    """Stream audio file in 50ms chunks, optionally playing audio"""
    import time
    import wave

    chunk_duration = 0.05
    audio_player = None

    if play_audio:
        try:
            import pyaudio
            p = pyaudio.PyAudio()
            with wave.open(filepath, 'rb') as wav_file:
                audio_player = p.open(
                    format=p.get_format_from_width(wav_file.getsampwidth()),
                    channels=wav_file.getnchannels(),
                    rate=wav_file.getframerate(),
                    output=True
                )
        except ImportError:
            print("Warning: pyaudio not installed. Audio playback disabled.", file=sys.stderr)
            print("Install with: pip install pyaudio", file=sys.stderr)
            play_audio = False

    try:
        with wave.open(filepath, 'rb') as wav_file:
            frames_per_chunk = int(sample_rate * chunk_duration)

            while True:
                frames = wav_file.readframes(frames_per_chunk)

                if not frames:
                    break

                if audio_player:
                    audio_player.write(frames)
                else:
                    time.sleep(chunk_duration)

                yield frames
    finally:
        if audio_player:
            audio_player.stop_stream()
            audio_player.close()
            p.terminate()

# Validate audio file before connecting
validate_audio_file(AUDIO_FILE, SAMPLE_RATE)

file_stream = stream_file(
    filepath=AUDIO_FILE,
    sample_rate=SAMPLE_RATE,
    play_audio=PLAY_AUDIO,
)

# Configure streaming parameters
streaming_params = StreamingParameters(
    sample_rate=SAMPLE_RATE,
    format_turns=True,
    speech_model="universal-streaming-english",
)

# Store parameters for output file (dynamically capture all set parameters)
session_data["parameters"] = ", ".join(
    f"{k}={v}" for k, v in streaming_params.__dict__.items() if v is not None
)

# Warn if using default turn detection parameters
turn_params = ["end_of_turn_confidence_threshold", "min_end_of_turn_silence_when_confident", "max_turn_silence"]
if not any(getattr(streaming_params, p, None) is not None for p in turn_params):
    print("Warning: Using default turn detection parameters. For best results, fine-tune to your use case:")
    print("https://www.assemblyai.com/docs/universal-streaming/turn-detection#quick-start-configurations\n")

client.connect(streaming_params)

try:
    client.stream(file_stream)
finally:
    client.disconnect(terminate=True)
    if SAVE_TRANSCRIPT_TO_FILE:
        save_transcript()

Step-by-step guide

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up and get your API key from your dashboard.

Install and import packages

Install the AssemblyAI Python SDK and pyaudio for audio playback.

pip install assemblyai
pip install pyaudio

Import packages.

import assemblyai as aai
from assemblyai.streaming.v3 import (
    BeginEvent,
    StreamingClient,
    StreamingClientOptions,
    StreamingError,
    StreamingEvents,
    StreamingParameters,
    TerminationEvent,
    TurnEvent
)
from typing import Type
import sys

Configure settings

Replace YOUR_API_KEY with your API key.

Set AUDIO_FILE to the relative or absolute path of your audio file, and set SAMPLE_RATE to match your file’s sample rate.

See Connect the client below to configure all other streaming parameters.

YOUR_API_KEY = "YOUR_API_KEY"  # Replace with your AssemblyAI API key
AUDIO_FILE = "audio.wav"  # Path to your audio file
SAMPLE_RATE = 48000  # Change to match the sample rate of your audio file
SAVE_TRANSCRIPT_TO_FILE = True  # Set to False to disable saving transcript to file
PLAY_AUDIO = True  # Set to False to disable audio playback

# Track session data for output file
session_data = {
    "session_id": None,
    "parameters": None,
    "audio_file": AUDIO_FILE,
    "audio_duration_seconds": None,
    "turns": []
}
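
If you're not sure which sample rate your file uses, you can read it from the WAV header before setting SAMPLE_RATE. Here's an optional sketch using Python's built-in wave module (it isn't part of the quickstart script):

import wave

with wave.open("audio.wav", "rb") as wav_file:
    print("Channels:", wav_file.getnchannels())      # must be 1 (mono) for this guide
    print("Sample rate:", wav_file.getframerate())   # use this value for SAMPLE_RATE
    print("Sample width:", wav_file.getsampwidth())  # bytes per sample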

WebSocket event handlers

def on_begin(self: Type[StreamingClient], event: BeginEvent):
    "This function is called when the connection has been established."

    session_data["session_id"] = event.id
    print("Session ID:", event.id, "\n")

def on_turn(self: Type[StreamingClient], event: TurnEvent):
    "This function is called when a new transcript has been received."

    # Skip empty transcripts
    if not event.transcript:
        return

    # Determine status label
    if not event.end_of_turn:
        status = "[Partial]"
    elif event.turn_is_formatted:
        status = "[Final (formatted)]"
    else:
        status = "[Final (unformatted)]"

    print(f"{status}: {event.transcript}")

    # Track final turns (formatted if formatting is enabled, otherwise just final)
    is_final = event.end_of_turn and (not streaming_params.format_turns or event.turn_is_formatted)
    if is_final:
        session_data["turns"].append(event.transcript)
        print()  # Add blank line after final formatted turn for cleaner output

def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
    "This function is called when the session has ended."

    session_data["audio_duration_seconds"] = event.audio_duration_seconds
    print(
        f"Session terminated: {event.audio_duration_seconds} seconds of audio processed"
    )

def on_error(self: Type[StreamingClient], error: StreamingError):
    "This function is called when an error occurs."

    print(f"Error occurred: {error}")

Create the streaming client

# Create the streaming client
client = StreamingClient(
    StreamingClientOptions(
        api_key=YOUR_API_KEY
    )
)

client.on(StreamingEvents.Begin, on_begin)
client.on(StreamingEvents.Turn, on_turn)
client.on(StreamingEvents.Termination, on_terminated)
client.on(StreamingEvents.Error, on_error)
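
Hard-coding the key is convenient for a quickstart, but you may prefer to read it from an environment variable. A minimal sketch, assuming you export a variable named ASSEMBLYAI_API_KEY (a name chosen here for illustration, not required by the SDK):

import os

# Fall back to the hard-coded key if the environment variable isn't set
api_key = os.environ.get("ASSEMBLYAI_API_KEY", YOUR_API_KEY)

client = StreamingClient(
    StreamingClientOptions(
        api_key=api_key
    )
)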

Helper functions for streaming files

The following helper functions are used to validate audio files, stream audio in chunks, and save the transcript output:

  • validate_audio_file() - Validates that the audio file is a mono WAV file with the expected sample rate.
  • stream_file() - Streams the audio file in 50ms chunks, optionally playing audio through speakers.
  • save_transcript() - Saves the transcript to a text file after the session ends.
def save_transcript():
    "Save the transcript to a file in the same directory as the script."
    from pathlib import Path

    # Get the audio file name (handles both absolute and relative paths)
    audio_name = Path(session_data["audio_file"]).stem

    # Generate filename: {file_name}_{session_id}.txt in the same directory as script
    session_id = session_data["session_id"] or "unknown"
    output_file = f"{audio_name}_{session_id}.txt"

    with open(output_file, "w") as f:
        f.write(f"AssemblyAI Session ID: {session_data['session_id']}\n")
        f.write(f"Audio file: {session_data['audio_file']}\n")
        f.write(f"Audio duration: {session_data['audio_duration_seconds']} seconds\n")
        f.write(f"Parameters used: {session_data['parameters']}\n")
        f.write("See all available parameters and defaults at https://www.assemblyai.com/docs/api-reference/streaming-api/streaming-api#request.query\n\n")

        f.write("\nTranscription Output\n")
        for i, turn in enumerate(session_data["turns"], 1):
            f.write(f"[Turn #{i}]: {turn}\n")

    print(f"Transcript saved to {output_file}")

def validate_audio_file(filepath: str, sample_rate: int):
    """Validate audio file before streaming"""
    import wave
    from pathlib import Path

    # Check file extension
    file_ext = Path(filepath).suffix.lower()
    if file_ext != ".wav":
        print(f"Error: Only WAV files are supported. Got: {file_ext}", file=sys.stderr)
        print(f"Convert your file to WAV using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
        sys.exit(1)

    with wave.open(filepath, 'rb') as wav_file:
        if wav_file.getnchannels() != 1:
            print("Error: Only mono audio is supported", file=sys.stderr)
            print(f"Convert your file to mono using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
            sys.exit(1)

        file_sample_rate = wav_file.getframerate()
        if file_sample_rate != sample_rate:
            print(f"Error: File sample rate ({file_sample_rate}) doesn't match expected rate ({sample_rate})", file=sys.stderr)
            print(f"Either update SAMPLE_RATE to {file_sample_rate}, or convert your file using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
            sys.exit(1)

def stream_file(filepath: str, sample_rate: int, play_audio: bool = False):
    """Stream audio file in 50ms chunks, optionally playing audio"""
    import time
    import wave

    chunk_duration = 0.05
    audio_player = None

    if play_audio:
        try:
            import pyaudio
            p = pyaudio.PyAudio()
            with wave.open(filepath, 'rb') as wav_file:
                audio_player = p.open(
                    format=p.get_format_from_width(wav_file.getsampwidth()),
                    channels=wav_file.getnchannels(),
                    rate=wav_file.getframerate(),
                    output=True
                )
        except ImportError:
            print("Warning: pyaudio not installed. Audio playback disabled.", file=sys.stderr)
            print("Install with: pip install pyaudio", file=sys.stderr)
            play_audio = False

    try:
        with wave.open(filepath, 'rb') as wav_file:
            frames_per_chunk = int(sample_rate * chunk_duration)

            while True:
                frames = wav_file.readframes(frames_per_chunk)

                if not frames:
                    break

                if audio_player:
                    audio_player.write(frames)
                else:
                    time.sleep(chunk_duration)

                yield frames
    finally:
        if audio_player:
            audio_player.stop_stream()
            audio_player.close()
            p.terminate()

# Validate audio file before connecting
validate_audio_file(AUDIO_FILE, SAMPLE_RATE)

file_stream = stream_file(
    filepath=AUDIO_FILE,
    sample_rate=SAMPLE_RATE,
    play_audio=PLAY_AUDIO,
)

Connect the client

A warning is printed if the default turn detection parameters are used. This is fine for testing, but for the best accuracy and performance, see our recommended settings; an example of setting these parameters explicitly follows the code below.

# Configure streaming parameters
streaming_params = StreamingParameters(
    sample_rate=SAMPLE_RATE,
    format_turns=True,
    speech_model="universal-streaming-english",
)

# Store parameters for output file (dynamically capture all set parameters)
session_data["parameters"] = ", ".join(
    f"{k}={v}" for k, v in streaming_params.__dict__.items() if v is not None
)

# Warn if using default turn detection parameters
turn_params = ["end_of_turn_confidence_threshold", "min_end_of_turn_silence_when_confident", "max_turn_silence"]
if not any(getattr(streaming_params, p, None) is not None for p in turn_params):
    print("Warning: Using default turn detection parameters. For best results, fine-tune to your use case:")
    print("https://www.assemblyai.com/docs/universal-streaming/turn-detection#quick-start-configurations\n")

client.connect(streaming_params)
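
To address the warning, you can set the turn detection parameters listed in turn_params directly on StreamingParameters. The values below are illustrative placeholders only (silence values in milliseconds), not tuned recommendations; see the turn detection guide linked above for configurations suited to your use case:

streaming_params = StreamingParameters(
    sample_rate=SAMPLE_RATE,
    format_turns=True,
    speech_model="universal-streaming-english",
    end_of_turn_confidence_threshold=0.7,         # illustrative placeholder
    min_end_of_turn_silence_when_confident=160,   # illustrative placeholder (ms)
    max_turn_silence=2400,                        # illustrative placeholder (ms)
)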

Stream the file

try:
    client.stream(file_stream)
finally:
    client.disconnect(terminate=True)
    if SAVE_TRANSCRIPT_TO_FILE:
        save_transcript()

The session terminates once the file has finished streaming. If SAVE_TRANSCRIPT_TO_FILE is enabled (the default), the transcript is saved to {audio_filename}_{session_id}.txt in the current working directory.

The AUDIO_FILE path can be either relative (e.g., audio.wav) or absolute (e.g., /path/to/audio.wav).

Example output

Here’s an example of what the console output looks like when streaming an audio file:

Warning: Using default turn detection parameters. For best results, fine-tune to your use case:
https://www.assemblyai.com/docs/universal-streaming/turn-detection#quick-start-configurations

Session ID: f37d7c4e-6be9-47ed-b6fc-7600fc78e34d

[Partial]: the
[Partial]: the quick
[Partial]: the quick brown
[Partial]: the quick brown fox
[Partial]: the quick brown fox jumps
[Partial]: the quick brown fox jumps over
[Partial]: the quick brown fox jumps over the
[Partial]: the quick brown fox jumps over the lazy
[Partial]: the quick brown fox jumps over the lazy dog
[Final (unformatted)]: the quick brown fox jumps over the lazy dog
[Final (formatted)]: The quick brown fox jumps over the lazy dog.

[Partial]: it
[Partial]: it is
[Partial]: it is a
[Partial]: it is a common
[Partial]: it is a common typing
[Partial]: it is a common typing test
[Final (unformatted)]: it is a common typing test
[Final (formatted)]: It is a common typing test.

Session terminated: 7.52 seconds of audio processed
Transcript saved to audio_f37d7c4e-6be9-47ed-b6fc-7600fc78e34d.txt

The output shows:

  • Partial transcripts: Real-time updates as words are recognized
  • Final (unformatted): The complete turn before formatting is applied
  • Final (formatted): The final transcript with proper capitalization and punctuation