Multichannel streams

Multichannel streaming audio

To transcribe multichannel streaming audio, we recommend creating a separate session for each channel. This approach maintains clear speaker separation and gives you accurate, diarized transcriptions for conversations, phone calls, or interviews where each speaker is recorded on a separate channel.
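
The core of the approach is simple: split the interleaved PCM samples into one buffer per channel, then stream each buffer over its own WebSocket session. A minimal sketch of the channel split, assuming a 16-bit PCM stereo WAV file (the file name below is a placeholder):

import wave
import numpy as np

# Read interleaved 16-bit PCM frames and split them into one array per channel.
with wave.open("stereo_call.wav", "rb") as wf:  # placeholder path
    samples = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
    stereo = samples.reshape(-1, wf.getnchannels())  # shape: (n_frames, n_channels)

channel_0 = stereo[:, 0]  # stream this buffer to session 1
channel_1 = stereo[:, 1]  # stream this buffer to session 2

Each per-channel buffer is then chunked and sent to its own streaming session, which is exactly what the full script below does.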

The following code example demonstrates how to transcribe a dual-channel audio file with diarized, speaker-separated transcripts. The same approach can be applied to any multichannel audio stream, including those with more than two channels (see the sketch after the script).

1. First, install the required dependencies.

pip install websocket-client numpy pyaudio
2. Use this complete script to transcribe dual-channel audio with speaker separation:

import websocket
import json
import threading
import numpy as np
import wave
import time
import pyaudio
from urllib.parse import urlencode

# Configuration
YOUR_API_KEY = "<YOUR_API_KEY>"
AUDIO_FILE_PATH = "<DUAL_CHANNEL_AUDIO_FILE_PATH>"
API_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_PARAMS = {
    "sample_rate": 8000,
    "format_turns": "true",
}

# Build API endpoint with URL encoding
API_ENDPOINT = f"{API_BASE_URL}?{urlencode(API_PARAMS)}"

class ChannelTranscriber:
    def __init__(self, channel_id, channel_name):
        self.channel_id = channel_id
        self.channel_name = channel_name
        self.ws_app = None
        self.audio_data = []
        self.current_turn_line = None
        self.line_count = 0

    def load_audio_channel(self):
        """Extract a single channel from the dual-channel audio file."""
        with wave.open(AUDIO_FILE_PATH, 'rb') as wf:
            frames = wf.readframes(wf.getnframes())
            audio_array = np.frombuffer(frames, dtype=np.int16)

            if wf.getnchannels() == 2:
                audio_array = audio_array.reshape(-1, 2)
                channel_audio = audio_array[:, self.channel_id]
            else:
                # Mono file: there is only one channel to stream
                channel_audio = audio_array

            # Split into chunks for streaming
            FRAMES_PER_BUFFER = 400  # 50ms chunks at 8000 Hz
            for i in range(0, len(channel_audio), FRAMES_PER_BUFFER):
                chunk = channel_audio[i:i + FRAMES_PER_BUFFER]
                if len(chunk) < FRAMES_PER_BUFFER:
                    chunk = np.pad(chunk, (0, FRAMES_PER_BUFFER - len(chunk)), 'constant')
                self.audio_data.append(chunk.astype(np.int16).tobytes())

    def on_open(self, ws):
        """Stream audio data when the connection opens."""
        def stream_audio():
            for chunk in self.audio_data:
                ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
                time.sleep(0.05)  # 50ms intervals

            # Send termination message
            terminate_message = {"type": "Terminate"}
            ws.send(json.dumps(terminate_message))

        threading.Thread(target=stream_audio, daemon=True).start()

    def clear_current_line(self):
        if self.current_turn_line is not None:
            print("\r" + " " * 100 + "\r", end="", flush=True)

    def print_partial_transcript(self, words):
        self.clear_current_line()
        # Build transcript from individual words
        word_texts = [word.get('text', '') for word in words]
        transcript = ' '.join(word_texts)
        partial_text = f"{self.channel_name}: {transcript}"
        print(partial_text, end="", flush=True)
        self.current_turn_line = len(partial_text)

    def print_final_transcript(self, transcript):
        self.clear_current_line()
        final_text = f"{self.channel_name}: {transcript}"
        print(final_text, flush=True)
        self.current_turn_line = None
        self.line_count += 1

    def on_message(self, ws, message):
        """Handle transcription results."""
        data = json.loads(message)
        msg_type = data.get('type')

        if msg_type == "Turn":
            transcript = data.get('transcript', '').strip()
            formatted = data.get('turn_is_formatted', False)
            words = data.get('words', [])

            if transcript or words:
                if formatted:
                    self.print_final_transcript(transcript)
                else:
                    self.print_partial_transcript(words)

    def start_transcription(self):
        self.load_audio_channel()

        self.ws_app = websocket.WebSocketApp(
            API_ENDPOINT,
            header={"Authorization": YOUR_API_KEY},
            on_open=self.on_open,
            on_message=self.on_message,
        )

        thread = threading.Thread(target=self.ws_app.run_forever, daemon=True)
        thread.start()
        return thread

def play_audio_file():
    try:
        with wave.open(AUDIO_FILE_PATH, 'rb') as wf:
            p = pyaudio.PyAudio()

            stream = p.open(
                format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True
            )

            print(f"Playing audio: {AUDIO_FILE_PATH}")

            # Play audio in chunks
            chunk_size = 1024
            data = wf.readframes(chunk_size)

            while data:
                stream.write(data)
                data = wf.readframes(chunk_size)

            stream.stop_stream()
            stream.close()
            p.terminate()

            print("Audio playback finished")

    except Exception as e:
        print(f"Error playing audio: {e}")


def transcribe_multichannel():
    # Create transcribers for each channel
    transcriber_1 = ChannelTranscriber(0, "Speaker 1")
    transcriber_2 = ChannelTranscriber(1, "Speaker 2")

    # Start audio playback
    audio_thread = threading.Thread(target=play_audio_file, daemon=True)
    audio_thread.start()

    # Start both transcriptions
    thread_1 = transcriber_1.start_transcription()
    thread_2 = transcriber_2.start_transcription()

    # Wait for completion
    thread_1.join()
    thread_2.join()
    audio_thread.join()

if __name__ == "__main__":
    transcribe_multichannel()
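
The same pattern extends to recordings with more than two channels: create one ChannelTranscriber per channel and wait for every session to finish. A hedged sketch building on the class above (transcribe_n_channels is not part of the script, and load_audio_channel as written only splits two-channel files, so its reshape would need to use the file's actual channel count):

def transcribe_n_channels(num_channels):
    # One transcriber, and therefore one streaming session, per channel.
    transcribers = [
        ChannelTranscriber(i, f"Speaker {i + 1}") for i in range(num_channels)
    ]
    threads = [t.start_transcription() for t in transcribers]
    for thread in threads:
        thread.join()

Each channel prints its own lines to the console, prefixed with the speaker label passed to ChannelTranscriber.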