Keyterms prompting

Streaming Keyterms Prompting

The keyterms prompting feature helps improve recognition accuracy for specific words and phrases that are important to your use case. Keyterms prompting is supported for Universal-3 Pro, Universal-Streaming English, and Universal-Streaming Multilingual.

Keyterms Prompting costs an additional $0.04/hour.

Quickstart

$ pip install websocket-client pyaudio
import json
import threading
import time
from datetime import datetime
from urllib.parse import urlencode

import pyaudio
import websocket

# --- Configuration ---
YOUR_API_KEY = "YOUR-API-KEY"  # Replace with your actual API key

CONNECTION_PARAMS = {
    "sample_rate": 16000,
    "speech_model": "u3-rt-pro",
    "keyterms_prompt": json.dumps(["Keanu Reeves", "AssemblyAI", "Universal-2"])
}
API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}"

# Audio configuration
FRAMES_PER_BUFFER = 800  # 50 ms of audio (0.05 s * 16000 Hz)
SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"]
CHANNELS = 1
FORMAT = pyaudio.paInt16

# Shared state for the audio stream and websocket connection
audio = None
stream = None
ws_app = None
audio_thread = None
stop_event = threading.Event()  # Signals the audio thread to stop

# --- WebSocket Event Handlers ---

def on_open(ws):
    """Called when the WebSocket connection is established."""
    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    def stream_audio():
        # Pump microphone audio to the server until signalled to stop.
        global stream
        print("Starting audio streaming...")
        while not stop_event.is_set():
            try:
                audio_data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
                # Audio is sent to the server as a binary frame.
                ws.send(audio_data, websocket.ABNF.OPCODE_BINARY)
            except Exception as e:
                # A failed read usually means the stream was closed; stop looping.
                print(f"Error streaming audio: {e}")
                break
        print("Audio streaming stopped.")

    global audio_thread
    # Daemon thread so the main thread can exit even if streaming is still running.
    audio_thread = threading.Thread(target=stream_audio, daemon=True)
    audio_thread.start()
63
def on_message(ws, message):
    """Print session lifecycle events and transcripts received from the server."""
    try:
        payload = json.loads(message)
        kind = payload.get('type')

        if kind == "Begin":
            session_id = payload.get('id')
            expires_at = payload.get('expires_at')
            print(
                f"\nSession began: ID={session_id}, "
                f"ExpiresAt={datetime.fromtimestamp(expires_at)}"
            )
        elif kind == "Turn":
            text = payload.get('transcript', '')
            if payload.get('turn_is_formatted', False):
                # Blank out the in-progress partial line, then print the final transcript.
                print('\r' + ' ' * 80 + '\r', end='')
                print(text)
            else:
                # Overwrite the current console line with the latest partial transcript.
                print(f"\r{text}", end='')
        elif kind == "Termination":
            audio_duration = payload.get('audio_duration_seconds', 0)
            session_duration = payload.get('session_duration_seconds', 0)
            print(
                f"\nSession Terminated: Audio Duration={audio_duration}s, "
                f"Session Duration={session_duration}s"
            )
    except json.JSONDecodeError as e:
        print(f"Error decoding message: {e}")
    except Exception as e:
        print(f"Error handling message: {e}")
91
def on_error(ws, error):
    """Called when a WebSocket error occurs; asks the audio thread to stop."""
    print(f"\nWebSocket Error: {error}")
    stop_event.set()  # Attempt to signal stop on error
97
98
def on_close(ws, close_status_code, close_msg):
    """Called when the WebSocket connection is closed.

    Releases the microphone stream and PyAudio instance, and joins the
    audio-streaming thread so the process can exit cleanly.
    """
    print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")

    # Ensure audio resources are released
    global stream, audio
    stop_event.set()  # Signal audio thread just in case it's still running

    if stream:
        # The stream may already have been stopped/closed elsewhere (e.g. by
        # run()'s finally block); PyAudio raises on a closed stream, and a
        # close handler must never raise, so guard the cleanup.
        try:
            if stream.is_active():
                stream.stop_stream()
            stream.close()
        except Exception as e:
            print(f"Error closing audio stream: {e}")
        stream = None
    if audio:
        audio.terminate()
        audio = None
    # Try to join the audio thread to ensure a clean exit.
    if audio_thread and audio_thread.is_alive():
        audio_thread.join(timeout=1.0)
118
119
# --- Main Execution ---
def run():
    """Open the microphone, connect to the streaming API, and relay audio.

    Blocks until the WebSocket thread exits or the user presses Ctrl+C.
    """
    global audio, stream, ws_app

    # Initialize PyAudio
    audio = pyaudio.PyAudio()

    # Open the microphone stream; bail out if no input device is available.
    try:
        stream = audio.open(
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
            channels=CHANNELS,
            format=FORMAT,
            rate=SAMPLE_RATE,
        )
        print("Microphone stream opened successfully.")
        print("Speak into your microphone. Press Ctrl+C to stop.")
    except Exception as e:
        print(f"Error opening microphone stream: {e}")
        if audio:
            audio.terminate()
        return  # Exit if microphone cannot be opened

    # Create the WebSocketApp wired to the handlers above.
    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": YOUR_API_KEY},
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    # Run the socket on a daemon thread so the main thread can catch Ctrl+C.
    ws_thread = threading.Thread(target=ws_app.run_forever, daemon=True)
    ws_thread.start()

    try:
        # Keep the main thread alive until the socket thread ends or we're interrupted.
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()  # Signal audio thread to stop

        # Politely ask the server to end the session before closing the socket.
        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                terminate_message = {"type": "Terminate"}
                print(f"Sending termination message: {json.dumps(terminate_message)}")
                ws_app.send(json.dumps(terminate_message))
                time.sleep(5)  # Let in-flight messages drain before the forceful close.
            except Exception as e:
                print(f"Error sending termination message: {e}")

        # Close the WebSocket connection (will trigger on_close).
        if ws_app:
            ws_app.close()

        ws_thread.join(timeout=2.0)
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        stop_event.set()
        if ws_app:
            ws_app.close()
        ws_thread.join(timeout=2.0)
    finally:
        # Fallback cleanup; on_close normally releases these already.
        if stream and stream.is_active():
            stream.stop_stream()
        if stream:
            stream.close()
        if audio:
            audio.terminate()
        print("Cleanup complete. Exiting.")
201
202
if __name__ == "__main__":
    # Script entry point.
    run()

Configuration

To utilize keyterms prompting, you need to include your desired keyterms as query parameters in the WebSocket URL.

  • You can include a maximum of 100 keyterms per session.
  • Each individual keyterm string must be 50 characters or less in length.

How it works

Streaming Keyterms Prompting has two components to improve accuracy for your terms.

Word-level boosting

The streaming model itself is biased during inference to be more accurate at identifying words from your keyterms list. This happens in real-time as words are emitted during the streaming process, providing immediate improvements to recognition accuracy. This component is enabled by default.

Turn-level boosting

After each turn is completed, an additional boosting pass analyzes the full transcript using your keyterms list. This post-processing step, similar to formatting, provides a second layer of accuracy improvement by examining the complete context of the turn. To enable this component, set format_turns to True.

For Universal-Streaming English and Universal-Streaming Multilingual, you must set format_turns to True to enable turn-level boosting. For Universal-3 Pro (u3-rt-pro), turn-level boosting is always active since formatting is built into the model — there is no need to set format_turns.

Both stages work together to maximize recognition accuracy for your keyterms throughout the streaming process.

Dynamic keyterms prompting

Dynamic keyterms prompting allows you to update keyterms during an active streaming session using the UpdateConfiguration message. This enables you to adapt the recognition context in real-time based on conversation flow or changing requirements.

Updating keyterms during a session

To update keyterms while streaming, send an UpdateConfiguration message with a new keyterms_prompt array:

# Replace or establish new set of keyterms
websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": ["Universal-3"]}')

# Remove keyterms and reset context biasing
websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": []}')

How dynamic keyterms work

When you send an UpdateConfiguration message:

  • Replacing keyterms: Providing a new array of keyterms completely replaces the existing set. The new keyterms take effect immediately for subsequent audio processing.
  • Clearing keyterms: Sending an empty array [] removes all keyterms and resets context biasing to the default state.
  • Both boosting stages: Dynamic keyterms work with both word-level boosting (native context biasing) and turn-level boosting (metaphone-based), just like initial keyterms.

Use cases for dynamic keyterms

Dynamic keyterms are particularly useful for:

  • Context-aware voice agents: Update keyterms based on conversation stage (e.g., switching from menu items to payment terms)
  • Multi-topic conversations: Adapt vocabulary as the conversation topic changes
  • Progressive disclosure: Add relevant keyterms as new information becomes available
  • Cleanup: Remove keyterms that are no longer relevant to reduce processing overhead

Important notes

  • Keyterms prompts longer than 50 characters are ignored.
  • Requests containing more than 100 keyterms will result in an error.

Best practices

To maximize the effectiveness of keyterms prompting:

  • Specify Unique Terminology: Include proper names, company names, technical terms, or vocabulary specific to your domain that might not be commonly recognized.
  • Exact Spelling and Capitalization: Provide keyterms with the precise spelling and capitalization you expect to see in the output transcript. This helps the system accurately identify the terms.
  • Avoid Common Words: Do not include single, common English words (e.g., “information”) as keyterms. The system is generally proficient with such words, and adding them as keyterms can be redundant.