Multichannel streams
Multichannel streaming audio
To transcribe multichannel streaming audio, we recommend creating a separate session for each channel. This approach maintains clear speaker separation and produces accurate, diarized transcriptions for conversations, phone calls, or interviews where each speaker is recorded on a separate channel.
The following code example demonstrates how to transcribe a dual-channel audio file and produce diarized, speaker-separated transcripts. The same approach applies to any multichannel audio stream, including streams with more than two channels; see the sketch after the complete script for extending beyond two channels.
Use this complete script to transcribe dual-channel audio with speaker separation:
Python
import websocket
import json
import threading
import numpy as np
import wave
import time
import pyaudio
from urllib.parse import urlencode

# Configuration
YOUR_API_KEY = "<YOUR_API_KEY>"
AUDIO_FILE_PATH = "<DUAL_CHANNEL_AUDIO_FILE_PATH>"
API_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_PARAMS = {
    "sample_rate": 8000,
    "format_turns": "true",
}

# Build API endpoint with URL encoding
API_ENDPOINT = f"{API_BASE_URL}?{urlencode(API_PARAMS)}"


class ChannelTranscriber:
    def __init__(self, channel_id, channel_name):
        self.channel_id = channel_id
        self.channel_name = channel_name
        self.ws_app = None
        self.audio_data = []
        self.current_turn_line = None
        self.line_count = 0

    def load_audio_channel(self):
        """Extract single channel from dual-channel audio file."""
        with wave.open(AUDIO_FILE_PATH, 'rb') as wf:
            frames = wf.readframes(wf.getnframes())
            audio_array = np.frombuffer(frames, dtype=np.int16)

            if wf.getnchannels() == 2:
                audio_array = audio_array.reshape(-1, 2)
                channel_audio = audio_array[:, self.channel_id]

                # Split into chunks for streaming
                FRAMES_PER_BUFFER = 400  # 50ms chunks
                for i in range(0, len(channel_audio), FRAMES_PER_BUFFER):
                    chunk = channel_audio[i:i + FRAMES_PER_BUFFER]
                    if len(chunk) < FRAMES_PER_BUFFER:
                        chunk = np.pad(chunk, (0, FRAMES_PER_BUFFER - len(chunk)), 'constant')
                    self.audio_data.append(chunk.astype(np.int16).tobytes())

    def on_open(self, ws):
        """Stream audio data when connection opens."""
        def stream_audio():
            for chunk in self.audio_data:
                ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
                time.sleep(0.05)  # 50ms intervals

            # Send termination message
            terminate_message = {"type": "Terminate"}
            ws.send(json.dumps(terminate_message))

        threading.Thread(target=stream_audio, daemon=True).start()

    def clear_current_line(self):
        if self.current_turn_line is not None:
            print("\r" + " " * 100 + "\r", end="", flush=True)

    def print_partial_transcript(self, words):
        self.clear_current_line()
        # Build transcript from individual words
        word_texts = [word.get('text', '') for word in words]
        transcript = ' '.join(word_texts)
        partial_text = f"{self.channel_name}: {transcript}"
        print(partial_text, end="", flush=True)
        self.current_turn_line = len(partial_text)

    def print_final_transcript(self, transcript):
        self.clear_current_line()
        final_text = f"{self.channel_name}: {transcript}"
        print(final_text, flush=True)
        self.current_turn_line = None
        self.line_count += 1

    def on_message(self, ws, message):
        """Handle transcription results."""
        data = json.loads(message)
        msg_type = data.get('type')

        if msg_type == "Turn":
            transcript = data.get('transcript', '').strip()
            formatted = data.get('turn_is_formatted', False)
            words = data.get('words', [])

            if transcript or words:
                if formatted:
                    self.print_final_transcript(transcript)
                else:
                    self.print_partial_transcript(words)

    def start_transcription(self):
        self.load_audio_channel()

        self.ws_app = websocket.WebSocketApp(
            API_ENDPOINT,
            header={"Authorization": YOUR_API_KEY},
            on_open=self.on_open,
            on_message=self.on_message,
        )

        thread = threading.Thread(target=self.ws_app.run_forever, daemon=True)
        thread.start()
        return thread


def play_audio_file():
    try:
        with wave.open(AUDIO_FILE_PATH, 'rb') as wf:
            p = pyaudio.PyAudio()

            stream = p.open(
                format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True
            )

            print(f"Playing audio: {AUDIO_FILE_PATH}")

            # Play audio in chunks
            chunk_size = 1024
            data = wf.readframes(chunk_size)

            while data:
                stream.write(data)
                data = wf.readframes(chunk_size)

            stream.stop_stream()
            stream.close()
            p.terminate()

            print("Audio playback finished")

    except Exception as e:
        print(f"Error playing audio: {e}")


def transcribe_multichannel():
    # Create transcribers for each channel
    transcriber_1 = ChannelTranscriber(0, "Speaker 1")
    transcriber_2 = ChannelTranscriber(1, "Speaker 2")

    # Start audio playback
    audio_thread = threading.Thread(target=play_audio_file, daemon=True)
    audio_thread.start()

    # Start both transcriptions
    thread_1 = transcriber_1.start_transcription()
    thread_2 = transcriber_2.start_transcription()

    # Wait for completion
    thread_1.join()
    thread_2.join()
    audio_thread.join()


if __name__ == "__main__":
    transcribe_multichannel()
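The same per-channel pattern extends to audio with more than two channels: create one ChannelTranscriber, and therefore one streaming session, per channel. The sketch below is illustrative rather than part of the script above; the transcribe_n_channels helper and its num_channels parameter are hypothetical names, and load_audio_channel would also need to reshape the audio to the actual channel count instead of assuming exactly two.

Python

# Sketch only: one streaming session per channel of an N-channel file.
# Assumes the ChannelTranscriber class from the script above; its
# load_audio_channel method would need to reshape to (-1, num_channels)
# rather than (-1, 2) to handle more than two channels.
def transcribe_n_channels(num_channels):
    transcribers = [
        ChannelTranscriber(i, f"Speaker {i + 1}")
        for i in range(num_channels)
    ]

    # Start one WebSocket session per channel, then wait for all of them
    threads = [t.start_transcription() for t in transcribers]
    for thread in threads:
        thread.join()

To run either script, install the websocket-client, numpy, and pyaudio packages, replace <YOUR_API_KEY> with your API key, and point AUDIO_FILE_PATH at a 16-bit PCM WAV file whose sample rate matches the sample_rate parameter (8000 Hz in this example).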