Multichannel streams

Multichannel streaming audio

To transcribe multichannel streaming audio, we recommend creating a separate session for each channel. This approach maintains clear speaker separation and gives you accurate, diarized transcriptions for conversations, phone calls, or interviews where each speaker is recorded on a separate channel.
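
The core of the approach is simple: split the interleaved PCM samples into one buffer per channel, then stream each buffer over its own WebSocket session. A minimal sketch of the channel split, assuming a 16-bit PCM stereo WAV file (the file name below is a placeholder):

import wave
import numpy as np

# Read interleaved 16-bit PCM frames and split them into one array per channel.
with wave.open("stereo_call.wav", "rb") as wf:  # placeholder path
    samples = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
    stereo = samples.reshape(-1, wf.getnchannels())  # shape: (n_frames, n_channels)

channel_0 = stereo[:, 0]  # stream this buffer to session 1
channel_1 = stereo[:, 1]  # stream this buffer to session 2

Each per-channel buffer is then chunked and sent to its own streaming session, which is exactly what the full script below does.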

The following code example demonstrates how to transcribe a dual-channel audio file with diarized, speaker-separated transcripts. The same approach can be applied to any multichannel audio stream, including those with more than two channels (see the sketch after the script).

1. First, install the required dependencies.

pip install websocket-client numpy pyaudio
2. Use this complete script to transcribe dual-channel audio with speaker separation:

import websocket
import json
import threading
import numpy as np
import wave
import time
import pyaudio
from urllib.parse import urlencode

# Configuration
YOUR_API_KEY = "<YOUR_API_KEY>"
AUDIO_FILE_PATH = "<DUAL_CHANNEL_AUDIO_FILE_PATH>"
API_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_PARAMS = {
    "sample_rate": 8000,
    "format_turns": "true",
}

# Build API endpoint with URL encoding
API_ENDPOINT = f"{API_BASE_URL}?{urlencode(API_PARAMS)}"

class ChannelTranscriber:
    def __init__(self, channel_id, channel_name):
        self.channel_id = channel_id
        self.channel_name = channel_name
        self.ws_app = None
        self.audio_data = []
        self.current_turn_line = None
        self.line_count = 0

    def load_audio_channel(self):
        """Extract a single channel from the dual-channel audio file."""
        with wave.open(AUDIO_FILE_PATH, 'rb') as wf:
            frames = wf.readframes(wf.getnframes())
            audio_array = np.frombuffer(frames, dtype=np.int16)

            if wf.getnchannels() == 2:
                audio_array = audio_array.reshape(-1, 2)
                channel_audio = audio_array[:, self.channel_id]
            else:
                # Mono file: there is only one channel to stream
                channel_audio = audio_array

            # Split into chunks for streaming
            FRAMES_PER_BUFFER = 400  # 50ms chunks at 8000 Hz
            for i in range(0, len(channel_audio), FRAMES_PER_BUFFER):
                chunk = channel_audio[i:i + FRAMES_PER_BUFFER]
                if len(chunk) < FRAMES_PER_BUFFER:
                    chunk = np.pad(chunk, (0, FRAMES_PER_BUFFER - len(chunk)), 'constant')
                self.audio_data.append(chunk.astype(np.int16).tobytes())

    def on_open(self, ws):
        """Stream audio data when the connection opens."""
        def stream_audio():
            for chunk in self.audio_data:
                ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
                time.sleep(0.05)  # 50ms intervals

            # Send termination message
            terminate_message = {"type": "Terminate"}
            ws.send(json.dumps(terminate_message))

        threading.Thread(target=stream_audio, daemon=True).start()

    def clear_current_line(self):
        if self.current_turn_line is not None:
            print("\r" + " " * 100 + "\r", end="", flush=True)

    def print_partial_transcript(self, words):
        self.clear_current_line()
        # Build transcript from individual words
        word_texts = [word.get('text', '') for word in words]
        transcript = ' '.join(word_texts)
        partial_text = f"{self.channel_name}: {transcript}"
        print(partial_text, end="", flush=True)
        self.current_turn_line = len(partial_text)

    def print_final_transcript(self, transcript):
        self.clear_current_line()
        final_text = f"{self.channel_name}: {transcript}"
        print(final_text, flush=True)
        self.current_turn_line = None
        self.line_count += 1

    def on_message(self, ws, message):
        """Handle transcription results."""
        data = json.loads(message)
        msg_type = data.get('type')

        if msg_type == "Turn":
            transcript = data.get('transcript', '').strip()
            formatted = data.get('turn_is_formatted', False)
            words = data.get('words', [])

            if transcript or words:
                if formatted:
                    self.print_final_transcript(transcript)
                else:
                    self.print_partial_transcript(words)

    def start_transcription(self):
        self.load_audio_channel()

        self.ws_app = websocket.WebSocketApp(
            API_ENDPOINT,
            header={"Authorization": YOUR_API_KEY},
            on_open=self.on_open,
            on_message=self.on_message,
        )

        thread = threading.Thread(target=self.ws_app.run_forever, daemon=True)
        thread.start()
        return thread

def play_audio_file():
    try:
        with wave.open(AUDIO_FILE_PATH, 'rb') as wf:
            p = pyaudio.PyAudio()

            stream = p.open(
                format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True
            )

            print(f"Playing audio: {AUDIO_FILE_PATH}")

            # Play audio in chunks
            chunk_size = 1024
            data = wf.readframes(chunk_size)

            while data:
                stream.write(data)
                data = wf.readframes(chunk_size)

            stream.stop_stream()
            stream.close()
            p.terminate()

            print("Audio playback finished")

    except Exception as e:
        print(f"Error playing audio: {e}")


def transcribe_multichannel():
    # Create transcribers for each channel
    transcriber_1 = ChannelTranscriber(0, "Speaker 1")
    transcriber_2 = ChannelTranscriber(1, "Speaker 2")

    # Start audio playback
    audio_thread = threading.Thread(target=play_audio_file, daemon=True)
    audio_thread.start()

    # Start both transcriptions
    thread_1 = transcriber_1.start_transcription()
    thread_2 = transcriber_2.start_transcription()

    # Wait for completion
    thread_1.join()
    thread_2.join()
    audio_thread.join()

if __name__ == "__main__":
    transcribe_multichannel()
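
The same pattern extends to recordings with more than two channels: create one ChannelTranscriber per channel and wait for every session to finish. A hedged sketch building on the class above (transcribe_n_channels is not part of the script, and load_audio_channel as written only splits two-channel files, so its reshape would need to use the file's actual channel count):

def transcribe_n_channels(num_channels):
    # One transcriber, and therefore one streaming session, per channel.
    transcribers = [
        ChannelTranscriber(i, f"Speaker {i + 1}") for i in range(num_channels)
    ]
    threads = [t.start_transcription() for t in transcribers]
    for thread in threads:
        thread.join()

Each channel prints its own lines to the console, prefixed with the speaker label passed to ChannelTranscriber.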