Transcribe streaming audio

Learn how to transcribe streaming audio.

Overview

By the end of this tutorial, you’ll be able to transcribe audio from your microphone.

Before you begin

To complete this tutorial, you need an AssemblyAI account with an API key, Python 3 installed, and a working microphone.

Here’s the full sample code of what you’ll build in this tutorial:

1import pyaudio
2import websocket
3import json
4import threading
5import time
6import wave
7from urllib.parse import urlencode
8from datetime import datetime
9
# --- Configuration ---
YOUR_API_KEY = "YOUR-API-KEY"  # Replace with your actual API key

# Query-string parameters appended to the streaming endpoint URL.
CONNECTION_PARAMS = {
    "speech_model": "u3-rt-pro",
    "sample_rate": 16000,
}
API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}"

# Audio Configuration
FRAMES_PER_BUFFER = 800  # 50ms of audio (0.05s * 16000Hz)
SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"]
CHANNELS = 1  # mono microphone input
FORMAT = pyaudio.paInt16  # 16-bit signed PCM samples

# Global variables for audio stream and websocket
audio = None  # pyaudio.PyAudio instance, created in run()
stream = None  # open microphone input stream, created in run()
ws_app = None  # websocket.WebSocketApp instance, created in run()
audio_thread = None  # background thread that pumps mic audio to the socket
stop_event = threading.Event()  # To signal the audio thread to stop

# WAV recording variables
recorded_frames = []  # Store audio frames for WAV file
recording_lock = threading.Lock()  # Thread-safe access to recorded_frames
36
def save_wav_file():
    """Write all recorded audio frames to a timestamped WAV file.

    Reads the module-level ``recorded_frames`` list (under
    ``recording_lock``) and writes it out as 16-bit mono PCM at
    ``SAMPLE_RATE``. Does nothing if no audio was captured.
    """
    if not recorded_frames:
        print("No audio data recorded.")
        return

    # Generate filename with timestamp so repeated runs don't overwrite
    # each other.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"recorded_audio_{timestamp}.wav"

    try:
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)  # paInt16 samples are 2 bytes wide
            wf.setframerate(SAMPLE_RATE)

            # Hold the lock while joining/writing so the streaming thread
            # cannot append mid-write.
            with recording_lock:
                wf.writeframes(b''.join(recorded_frames))

        # BUG FIX: previously printed a literal placeholder instead of the
        # actual output file name.
        print(f"Audio saved to: {filename}")
        # NOTE(review): duration assumes every chunk holds exactly
        # FRAMES_PER_BUFFER samples, which matches how stream_audio reads.
        print(f"Duration: {len(recorded_frames) * FRAMES_PER_BUFFER / SAMPLE_RATE:.2f} seconds")

    except Exception as e:
        print(f"Error saving WAV file: {e}")
62
63# --- WebSocket Event Handlers ---
64
def on_open(ws):
    """Called once the WebSocket handshake completes; begins mic streaming."""
    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    def pump_microphone():
        # Read fixed-size chunks from the mic and forward them until the
        # stop event is set or the stream fails.
        global stream
        print("Starting audio streaming...")
        while not stop_event.is_set():
            try:
                chunk = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)

                # Keep a copy for the WAV file written at shutdown.
                with recording_lock:
                    recorded_frames.append(chunk)

                # Forward the raw PCM to the API as a binary frame.
                ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
            except Exception as exc:
                print(f"Error streaming audio: {exc}")
                # A failed read usually means the stream was closed; bail out.
                break
        print("Audio streaming stopped.")

    global audio_thread
    # Daemon thread lets the interpreter exit even while streaming runs.
    audio_thread = threading.Thread(target=pump_microphone, daemon=True)
    audio_thread.start()
96
def on_message(ws, message):
    """Dispatch incoming JSON messages (Begin / Turn / Termination)."""
    try:
        payload = json.loads(message)
        kind = payload.get('type')

        if kind == "Begin":
            session_id = payload.get('id')
            expires_at = payload.get('expires_at')
            print(
                f"\nSession began: ID={session_id}, "
                f"ExpiresAt={datetime.fromtimestamp(expires_at)}"
            )
        elif kind == "Turn":
            text = payload.get('transcript', '')
            if payload.get('turn_is_formatted', False):
                # Blank out the in-progress line, then print the final
                # formatted transcript on its own line.
                print('\r' + ' ' * 80 + '\r', end='')
                print(text)
            else:
                # Partial transcript: overwrite the current console line.
                print(f"\r{text}", end='')
        elif kind == "Termination":
            audio_secs = payload.get('audio_duration_seconds', 0)
            session_secs = payload.get('session_duration_seconds', 0)
            print(
                f"\nSession Terminated: Audio Duration={audio_secs}s, "
                f"Session Duration={session_secs}s"
            )
    except json.JSONDecodeError as err:
        print(f"Error decoding message: {err}")
    except Exception as err:
        print(f"Error handling message: {err}")
124
def on_error(ws, error):
    """Log a WebSocket error and ask the streaming thread to stop."""
    print("\nWebSocket Error: " + str(error))
    # Streaming cannot usefully continue after a transport error.
    stop_event.set()
130
131
def on_close(ws, close_status_code, close_msg):
    """Persist the recording and release audio resources when the socket closes."""
    global stream, audio

    print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")

    # Persist whatever audio was captured during the session.
    save_wav_file()

    # Make sure the streaming thread stops reading from the microphone.
    stop_event.set()

    if stream is not None:
        if stream.is_active():
            stream.stop_stream()
        stream.close()
        stream = None
    if audio is not None:
        audio.terminate()
        audio = None

    # Give the streaming thread a bounded window to exit cleanly.
    if audio_thread is not None and audio_thread.is_alive():
        audio_thread.join(timeout=1.0)
154
155# --- Main Execution ---
# --- Main Execution ---
def run():
    """Open the microphone, stream it to the API, and clean up on exit."""
    global audio, stream, ws_app

    audio = pyaudio.PyAudio()

    # Open the default input device; bail out early if that fails.
    try:
        stream = audio.open(
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
            channels=CHANNELS,
            format=FORMAT,
            rate=SAMPLE_RATE,
        )
        print("Microphone stream opened successfully.")
        print("Speak into your microphone. Press Ctrl+C to stop.")
        print("Audio will be saved to a WAV file when the session ends.")
    except Exception as exc:
        print(f"Error opening microphone stream: {exc}")
        if audio:
            audio.terminate()
        return  # Exit if microphone cannot be opened

    # The API key is passed via the Authorization header.
    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": YOUR_API_KEY},
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    # run_forever() blocks, so run it on a daemon thread and keep the main
    # thread free to catch KeyboardInterrupt.
    ws_thread = threading.Thread(target=ws_app.run_forever, daemon=True)
    ws_thread.start()

    try:
        # Keep main thread alive until the socket thread ends.
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()  # Signal audio thread to stop

        # Politely ask the server to end the session before closing.
        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                terminate_message = {"type": "Terminate"}
                print(f"Sending termination message: {json.dumps(terminate_message)}")
                ws_app.send(json.dumps(terminate_message))
                # Leave time for the final Termination message to arrive
                # before forcing the socket closed.
                time.sleep(5)
            except Exception as exc:
                print(f"Error sending termination message: {exc}")

        # Closing the socket triggers on_close, which does the cleanup.
        if ws_app:
            ws_app.close()

        ws_thread.join(timeout=2.0)

    except Exception as exc:
        print(f"\nAn unexpected error occurred: {exc}")
        stop_event.set()
        if ws_app:
            ws_app.close()
        ws_thread.join(timeout=2.0)

    finally:
        # on_close normally releases these; this is a safety net in case it
        # never ran.
        if stream and stream.is_active():
            stream.stop_stream()
        if stream:
            stream.close()
        if audio:
            audio.terminate()
        print("Cleanup complete. Exiting.")
237
238
# Entry point: start capture and streaming only when run as a script.
if __name__ == "__main__":
    run()

Step 1: Install and import dependencies

1

Install the required Python packages:

$pip install pyaudio websocket-client
2

Create a file called main.py and import the following packages at the top of your file:

1import pyaudio
2import websocket
3import json
4import threading
5import time
6import wave
7from urllib.parse import urlencode
8from datetime import datetime

Step 2: Configure the API key

In this step, you’ll configure your AssemblyAI API key to authenticate your application and enable access to the streaming transcription service.

1

Browse to API Keys in your dashboard, and then copy your API key.

2

Store your API key in a variable. Replace <YOUR_API_KEY> with your copied API key.

1YOUR_API_KEY = "<YOUR_API_KEY>"
Authenticate with a temporary token

If you need to authenticate on the client, you can avoid exposing your API key by using temporary authentication tokens.

Step 3: Set up audio and websocket configuration

1

Set the parameters that control how your client connects to AssemblyAI’s streaming transcription API. These options determine things like audio sample rate and whether you want punctuation and formatting in your final transcripts.

1CONNECTION_PARAMS = {
2 "speech_model": "u3-rt-pro",
3 "sample_rate": 16000,
4}
5
6API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
7API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}"

See Streaming endpoints and data zones for more information on endpoints for Streaming STT.

2

Prepare your audio input settings and recording logic. This configuration controls how microphone data is streamed in real-time:

1# Audio Configuration
2FRAMES_PER_BUFFER = 800 # 50ms of audio (0.05s * 16000Hz)
3SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"]
4CHANNELS = 1
5FORMAT = pyaudio.paInt16
6# Global variables for audio stream and websocket
7audio = None
8stream = None
9ws_app = None
10audio_thread = None
11stop_event = threading.Event() # To signal the audio thread to stop
12# WAV recording variables
13recorded_frames = [] # Store audio frames for WAV file
14recording_lock = threading.Lock() # Thread-safe access to recorded_frames
15
16def save_wav_file():
17 """Save recorded audio frames to a WAV file."""
18 if not recorded_frames:
19 print("No audio data recorded.")
20 return
21
22 # Generate filename with timestamp
23 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
24 filename = f"recorded_audio_{timestamp}.wav"
25
26 try:
27 with wave.open(filename, 'wb') as wf:
28 wf.setnchannels(CHANNELS)
29 wf.setsampwidth(2) # 16-bit = 2 bytes
30 wf.setframerate(SAMPLE_RATE)
31
32 # Write all recorded frames
33 with recording_lock:
34 wf.writeframes(b''.join(recorded_frames))
35
36 print(f"Audio saved to: {filename}")
37 print(f"Duration: {len(recorded_frames) * FRAMES_PER_BUFFER / SAMPLE_RATE:.2f} seconds")
38
39 except Exception as e:
40 print(f"Error saving WAV file: {e}")

Step 4: Create event handlers

In this step, you’ll define event handlers to manage the different types of events emitted during the streaming session. The handlers will respond to session lifecycle events, transcription turns, errors, and session termination.

Implement basic event handlers. These handlers let your app respond to key streaming events:

  • on_open – Starts streaming microphone audio in a background thread.
  • on_message – Handles transcription events like Begin, Turn, and Termination.
  • on_error – Logs any connection or streaming errors and triggers cleanup.
  • on_close – Cleans up audio resources and saves a WAV recording when the session ends.
1# --- WebSocket Event Handlers ---
2def on_open(ws):
3 """Called when the WebSocket connection is established."""
4 print("WebSocket connection opened.")
5 print(f"Connected to: {API_ENDPOINT}")
6 # Start sending audio data in a separate thread
7 def stream_audio():
8 global stream
9 print("Starting audio streaming...")
10 while not stop_event.is_set():
11 try:
12 audio_data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
13
14 # Store audio data for WAV recording
15 with recording_lock:
16 recorded_frames.append(audio_data)
17
18 # Send audio data as binary message
19 ws.send(audio_data, websocket.ABNF.OPCODE_BINARY)
20 except Exception as e:
21 print(f"Error streaming audio: {e}")
22 # If stream read fails, likely means it's closed, stop the loop
23 break
24 print("Audio streaming stopped.")
25 global audio_thread
26 audio_thread = threading.Thread(target=stream_audio)
27 audio_thread.daemon = (
28 True # Allow main thread to exit even if this thread is running
29 )
30 audio_thread.start()
31def on_message(ws, message):
32 try:
33 data = json.loads(message)
34 msg_type = data.get('type')
35 if msg_type == "Begin":
36 session_id = data.get('id')
37 expires_at = data.get('expires_at')
38 print(f"\nSession began: ID={session_id}, ExpiresAt={datetime.fromtimestamp(expires_at)}")
39 elif msg_type == "Turn":
40 transcript = data.get('transcript', '')
41 formatted = data.get('turn_is_formatted', False)
42 # Clear previous line for formatted messages
43 if formatted:
44 print('\r' + ' ' * 80 + '\r', end='')
45 print(transcript)
46 else:
47 print(f"\r{transcript}", end='')
48 elif msg_type == "Termination":
49 audio_duration = data.get('audio_duration_seconds', 0)
50 session_duration = data.get('session_duration_seconds', 0)
51 print(f"\nSession Terminated: Audio Duration={audio_duration}s, Session Duration={session_duration}s")
52 except json.JSONDecodeError as e:
53 print(f"Error decoding message: {e}")
54 except Exception as e:
55 print(f"Error handling message: {e}")
56def on_error(ws, error):
57 """Called when a WebSocket error occurs."""
58 print(f"\nWebSocket Error: {error}")
59 # Attempt to signal stop on error
60 stop_event.set()
61def on_close(ws, close_status_code, close_msg):
62 """Called when the WebSocket connection is closed."""
63 print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")
64
65 # Save recorded audio to WAV file
66 save_wav_file()
67
68 # Ensure audio resources are released
69 global stream, audio
70 stop_event.set() # Signal audio thread just in case it's still running
71 if stream:
72 if stream.is_active():
73 stream.stop_stream()
74 stream.close()
75 stream = None
76 if audio:
77 audio.terminate()
78 audio = None
79 # Try to join the audio thread to ensure clean exit
80 if audio_thread and audio_thread.is_alive():
81 audio_thread.join(timeout=1.0)
Message sequence and turn events

To get a better understanding of the turn event and the message sequences, check out our Message Sequence Breakdown page. This object is how you’ll receive your transcripts.

Step 5: Connect and start transcription

Streaming Speech-to-Text uses WebSockets to stream audio to AssemblyAI. This requires first establishing a connection to the API.

1

Create a main execution function and initialize the audio stream.

1def run():
2 global audio, stream, ws_app
3 # Initialize PyAudio
4 audio = pyaudio.PyAudio()
5 # Open microphone stream
6 try:
7 stream = audio.open(
8 input=True,
9 frames_per_buffer=FRAMES_PER_BUFFER,
10 channels=CHANNELS,
11 format=FORMAT,
12 rate=SAMPLE_RATE,
13 )
14 print("Microphone stream opened successfully.")
15 print("Speak into your microphone. Press Ctrl+C to stop.")
16 print("Audio will be saved to a WAV file when the session ends.")
17 except Exception as e:
18 print(f"Error opening microphone stream: {e}")
19 if audio:
20 audio.terminate()
21 return # Exit if microphone cannot be opened
2

Next, create a WebSocket connection to the streaming service:

1 # Create WebSocketApp
2 ws_app = websocket.WebSocketApp(
3 API_ENDPOINT,
4 header={"Authorization": YOUR_API_KEY},
5 on_open=on_open,
6 on_message=on_message,
7 on_error=on_error,
8 on_close=on_close,
9 )
10 # Run WebSocketApp in a separate thread to allow main thread to catch KeyboardInterrupt
11 ws_thread = threading.Thread(target=ws_app.run_forever)
12 ws_thread.daemon = True
13 ws_thread.start()

Step 6: Close the connection

Close the WebSocket connection when you’re done:

1 try:
2 # Keep main thread alive until interrupted
3 while ws_thread.is_alive():
4 time.sleep(0.1)
5 except KeyboardInterrupt:
6 print("\nCtrl+C received. Stopping...")
7 stop_event.set() # Signal audio thread to stop
8 # Send termination message to the server
9 if ws_app and ws_app.sock and ws_app.sock.connected:
10 try:
11 terminate_message = {"type": "Terminate"}
12 print(f"Sending termination message: {json.dumps(terminate_message)}")
13 ws_app.send(json.dumps(terminate_message))
14 # Give a moment for messages to process before forceful close
15 time.sleep(5)
16 except Exception as e:
17 print(f"Error sending termination message: {e}")
18 # Close the WebSocket connection (will trigger on_close)
19 if ws_app:
20 ws_app.close()
21 # Wait for WebSocket thread to finish
22 ws_thread.join(timeout=2.0)
23 except Exception as e:
24 print(f"\nAn unexpected error occurred: {e}")
25 stop_event.set()
26 if ws_app:
27 ws_app.close()
28 ws_thread.join(timeout=2.0)
29 finally:
30 # Final cleanup (already handled in on_close, but good as a fallback)
31 if stream and stream.is_active():
32 stream.stop_stream()
33 if stream:
34 stream.close()
35 if audio:
36 audio.terminate()
37 print("Cleanup complete. Exiting.")

The connection will also close automatically when you press Ctrl+C. In both cases, the on_close handler will clean up the audio resources.

Note: Pricing is based on session duration so it is very important to close sessions properly to avoid unexpected usage and cost.

Next steps

To learn more about Streaming Speech-to-Text, see the following resources:

Need some help?

If you get stuck, or have any other questions, we’d love to help you out. Contact our support team at support@assemblyai.com or create a support ticket.