Apply Noise Reduction to Audio for Streaming Speech-to-Text

This guide demonstrates how to implement a noise reduction system for real-time audio transcription using AssemblyAI’s Streaming STT and the noisereduce library. You’ll learn how to create a custom audio stream that preprocesses incoming audio to remove background noise before it reaches the transcription service.

This solution is particularly valuable for:

  • Voice assistants operating in noisy environments
  • Customer service applications processing calls
  • Meeting transcription tools
  • Voice-enabled applications requiring high accuracy

The implementation uses Python and combines proven audio processing techniques with AssemblyAI’s powerful transcription capabilities. While our example focuses on microphone input, the principles can be applied to any real-time audio stream.

Quickstart

1import logging
2import numpy as np
3import noisereduce as nr
4import assemblyai as aai
5from typing import Type
6from assemblyai.streaming.v3 import (
7 BeginEvent,
8 StreamingClient,
9 StreamingClientOptions,
10 StreamingError,
11 StreamingEvents,
12 StreamingParameters,
13 StreamingSessionParameters,
14 TerminationEvent,
15 TurnEvent,
16)
17
18logging.basicConfig(level=logging.INFO)
19
20api_key = "<YOUR_API_KEY>"
21
22# --- Noise-reduced microphone stream ---
def noise_reduced_mic_stream(sample_rate=16000, *, audio_source=None, denoise=None):
    """Yield noise-reduced 16-bit mono PCM audio as raw bytes.

    Incoming audio is accumulated until roughly half a second is buffered,
    denoised in a single pass, and emitted. The last 1024 samples of every
    window are kept as leading context for the next window so the denoiser
    does not start cold at each chunk boundary; that context is only fed to
    the denoiser and is never emitted a second time.

    Args:
        sample_rate: Sampling rate of the audio in Hz.
        audio_source: Optional iterable yielding raw int16 PCM byte chunks.
            Defaults to ``aai.extras.MicrophoneStream(sample_rate=sample_rate)``.
        denoise: Optional callable ``(float32_samples, sample_rate) -> samples``
            that must return an array of the same length as its input.
            Defaults to ``noisereduce.reduce_noise`` with ``prop_decrease=0.75``
            and ``n_fft=1024``.

    Yields:
        bytes: Denoised int16 PCM samples, each input sample exactly once.
    """
    n_fft = 1024
    overlap = n_fft  # samples carried over as context for the next window
    buffer_size = int(sample_rate * 0.5)  # 0.5 seconds

    if audio_source is None:
        audio_source = aai.extras.MicrophoneStream(sample_rate=sample_rate)
    if denoise is None:
        def denoise(samples, rate):
            return nr.reduce_noise(
                y=samples,
                sr=rate,
                prop_decrease=0.75,
                n_fft=n_fft,
            )

    def to_pcm_bytes(float_audio):
        # Clip before the cast: a bare astype(int16) wraps around on samples
        # at or beyond full scale, which is heard as loud clicks.
        scaled = np.clip(np.asarray(float_audio) * 32768.0, -32768.0, 32767.0)
        return scaled.astype(np.int16).tobytes()

    buffer = np.array([], dtype=np.int16)
    context_len = 0  # leading samples of `buffer` that were already emitted

    for raw_audio in audio_source:
        buffer = np.append(buffer, np.frombuffer(raw_audio, dtype=np.int16))
        if len(buffer) < buffer_size:
            continue

        float_audio = buffer.astype(np.float32) / 32768.0
        denoised = denoise(float_audio, sample_rate)
        # Drop the context prefix so no sample reaches the consumer twice.
        fresh = denoised[context_len:]
        buffer = buffer[-overlap:]
        context_len = len(buffer)
        yield to_pcm_bytes(fresh)

    # The source is exhausted: flush whatever has not been emitted yet.
    if len(buffer) > context_len:
        if len(buffer) < n_fft:
            # Less than one FFT frame in total; pass it through unprocessed
            # rather than hand the denoiser a window shorter than n_fft.
            yield buffer[context_len:].tobytes()
        else:
            float_audio = buffer.astype(np.float32) / 32768.0
            yield to_pcm_bytes(denoise(float_audio, sample_rate)[context_len:])
43
44
45# --- Event Handlers ---
def on_begin(self: Type[StreamingClient], event: BeginEvent):
    """Print the id of the streaming session that has just started."""
    session_id = event.id
    print(f" Session started: {session_id}")
48
def on_turn(self: Type[StreamingClient], event: TurnEvent):
    """Print the turn's transcript and end-of-turn flag.

    When a turn has ended but arrived unformatted, the session parameters are
    updated to request formatted turns.
    """
    transcript = event.transcript
    turn_ended = event.end_of_turn
    print(f"{transcript} ({turn_ended})")

    # Nothing more to do for partial turns or turns that are already formatted.
    if not turn_ended or event.turn_is_formatted:
        return

    formatting_on = StreamingSessionParameters(format_turns=True)
    self.set_params(formatting_on)
54
def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
    """Print how many seconds of audio the finished session processed."""
    duration = event.audio_duration_seconds
    print(f" Session terminated after {duration} seconds")
57
def on_error(self: Type[StreamingClient], error: StreamingError):
    """Print an error reported by the streaming client."""
    message = f" Error occurred: {error}"
    print(message)
60
61# --- Main Function ---
def main():
    """Stream noise-reduced microphone audio to AssemblyAI and print the turns.

    Builds the streaming client, registers the event handlers, opens the
    session, and streams until the audio source stops. The session is always
    terminated on the way out, even if streaming raises.
    """
    rate = 16000  # used for both the session and the microphone

    options = StreamingClientOptions(
        api_key=api_key,
        api_host="streaming.assemblyai.com",
    )
    client = StreamingClient(options)

    handlers = (
        (StreamingEvents.Begin, on_begin),
        (StreamingEvents.Turn, on_turn),
        (StreamingEvents.Termination, on_terminated),
        (StreamingEvents.Error, on_error),
    )
    for event_type, handler in handlers:
        client.on(event_type, handler)

    session_params = StreamingParameters(sample_rate=rate, format_turns=True)
    client.connect(session_params)

    try:
        client.stream(noise_reduced_mic_stream(sample_rate=rate))
    finally:
        client.disconnect(terminate=True)


if __name__ == "__main__":
    main()

Step-by-step guide

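First, install the required packages: assemblyai, noisereduce, and numpy. The microphone helper used below (aai.extras.MicrophoneStream) needs the SDK's optional extras, which install PyAudio.

$ pip install "assemblyai[extras]" noisereduce numpy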
1import logging
2import numpy as np
3import noisereduce as nr
4import assemblyai as aai
5from typing import Type
6from assemblyai.streaming.v3 import (
7 BeginEvent,
8 StreamingClient,
9 StreamingClientOptions,
10 StreamingError,
11 StreamingEvents,
12 StreamingParameters,
13 StreamingSessionParameters,
14 TerminationEvent,
15 TurnEvent,
16)

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for a free account and get your API key from your dashboard. Please note that Streaming Speech-to-Text is available for upgraded accounts only. If you’re on the free plan, you’ll need to upgrade your account by adding a credit card.

1api_key = "<YOUR_API_KEY>"

Do not share this API key with anyone. It is a private key uniquely associated with your account.

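Create a generator function that reads audio from the microphone, buffers it into chunks of about half a second, applies noise reduction to each chunk, and yields the cleaned audio as 16-bit PCM bytes.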

def noise_reduced_mic_stream(sample_rate=16000, *, audio_source=None, denoise=None):
    """Yield noise-reduced 16-bit mono PCM audio as raw bytes.

    Incoming audio is accumulated until roughly half a second is buffered,
    denoised in a single pass, and emitted. The last 1024 samples of every
    window are kept as leading context for the next window so the denoiser
    does not start cold at each chunk boundary; that context is only fed to
    the denoiser and is never emitted a second time.

    Args:
        sample_rate: Sampling rate of the audio in Hz.
        audio_source: Optional iterable yielding raw int16 PCM byte chunks.
            Defaults to ``aai.extras.MicrophoneStream(sample_rate=sample_rate)``.
        denoise: Optional callable ``(float32_samples, sample_rate) -> samples``
            that must return an array of the same length as its input.
            Defaults to ``noisereduce.reduce_noise`` with ``prop_decrease=0.75``
            and ``n_fft=1024``.

    Yields:
        bytes: Denoised int16 PCM samples, each input sample exactly once.
    """
    n_fft = 1024
    overlap = n_fft  # samples carried over as context for the next window
    buffer_size = int(sample_rate * 0.5)  # 0.5 seconds

    if audio_source is None:
        audio_source = aai.extras.MicrophoneStream(sample_rate=sample_rate)
    if denoise is None:
        def denoise(samples, rate):
            return nr.reduce_noise(
                y=samples,
                sr=rate,
                prop_decrease=0.75,
                n_fft=n_fft,
            )

    def to_pcm_bytes(float_audio):
        # Clip before the cast: a bare astype(int16) wraps around on samples
        # at or beyond full scale, which is heard as loud clicks.
        scaled = np.clip(np.asarray(float_audio) * 32768.0, -32768.0, 32767.0)
        return scaled.astype(np.int16).tobytes()

    buffer = np.array([], dtype=np.int16)
    context_len = 0  # leading samples of `buffer` that were already emitted

    for raw_audio in audio_source:
        buffer = np.append(buffer, np.frombuffer(raw_audio, dtype=np.int16))
        if len(buffer) < buffer_size:
            continue

        float_audio = buffer.astype(np.float32) / 32768.0
        denoised = denoise(float_audio, sample_rate)
        # Drop the context prefix so no sample reaches the consumer twice.
        fresh = denoised[context_len:]
        buffer = buffer[-overlap:]
        context_len = len(buffer)
        yield to_pcm_bytes(fresh)

    # The source is exhausted: flush whatever has not been emitted yet.
    if len(buffer) > context_len:
        if len(buffer) < n_fft:
            # Less than one FFT frame in total; pass it through unprocessed
            # rather than hand the denoiser a window shorter than n_fft.
            yield buffer[context_len:].tobytes()
        else:
            float_audio = buffer.astype(np.float32) / 32768.0
            yield to_pcm_bytes(denoise(float_audio, sample_rate)[context_len:])

Create functions to handle different events during transcription.

def on_begin(self: Type[StreamingClient], event: BeginEvent):
    """Print the id of the streaming session that has just started."""
    session_id = event.id
    print(f" Session started: {session_id}")
3
def on_turn(self: Type[StreamingClient], event: TurnEvent):
    """Print the turn's transcript and end-of-turn flag.

    When a turn has ended but arrived unformatted, the session parameters are
    updated to request formatted turns.
    """
    transcript = event.transcript
    turn_ended = event.end_of_turn
    print(f"{transcript} ({turn_ended})")

    # Nothing more to do for partial turns or turns that are already formatted.
    if not turn_ended or event.turn_is_formatted:
        return

    formatting_on = StreamingSessionParameters(format_turns=True)
    self.set_params(formatting_on)
9
def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
    """Print how many seconds of audio the finished session processed."""
    duration = event.audio_duration_seconds
    print(f" Session terminated after {duration} seconds")
12
def on_error(self: Type[StreamingClient], error: StreamingError):
    """Print an error reported by the streaming client."""
    message = f" Error occurred: {error}"
    print(message)

Now create the streaming client, register the event handlers, connect, and stream the output of noise_reduced_mic_stream to it.

def main():
    """Stream noise-reduced microphone audio to AssemblyAI and print the turns.

    Builds the streaming client, registers the event handlers, opens the
    session, and streams until the audio source stops. The session is always
    terminated on the way out, even if streaming raises.
    """
    rate = 16000  # used for both the session and the microphone

    options = StreamingClientOptions(
        api_key=api_key,
        api_host="streaming.assemblyai.com",
    )
    client = StreamingClient(options)

    handlers = (
        (StreamingEvents.Begin, on_begin),
        (StreamingEvents.Turn, on_turn),
        (StreamingEvents.Termination, on_terminated),
        (StreamingEvents.Error, on_error),
    )
    for event_type, handler in handlers:
        client.on(event_type, handler)

    session_params = StreamingParameters(sample_rate=rate, format_turns=True)
    client.connect(session_params)

    try:
        client.stream(noise_reduced_mic_stream(sample_rate=rate))
    finally:
        client.disconnect(terminate=True)


if __name__ == "__main__":
    main()

You can press Ctrl+C to stop the transcription.