
Speaker Diarization
Word-level speaker labels. Streaming and async. Production-ready.

1base_url = "https://api.assemblyai.com"
2headers = {
3 "authorization": "<YOUR_API_KEY>"
4}
5with open("./my-audio.mp3", "rb") as f:
6 response = requests.post(base_url + "/v2/upload",
7 headers=headers,
8 data=f)
9upload_url = response.json()["upload_url"]
10data = {
11 "audio_url": upload_url, # You can also use a URL to an audio or video file on the web
12 "speech_models": ["universal-3-pro", "universal-2"],
13 "language_detection": True,
14 "speaker_labels": True
15}
1import { AssemblyAI } from 'assemblyai'
2
3const client = new AssemblyAI({
4 apiKey: 'YOUR_API_KEY'
5})
6
7const audioUrl =
8 'https://assembly.ai/sports_injuries.mp3'
9
10const params = {
11 audio: audioUrl,
12 speaker_labels: true
13}
14
15const run = async () => {
16 const transcript = await client.transcripts.transcribe(params)
17 console.log(transcript.text)
18
19 for (let utterance of transcript.utterances!) {
20 console.log(`Speaker ${utterance.speaker}: ${utterance.text}`)
21 }
22}
23
24run()
1package main
2
3import (
4 "context"
5 "fmt"
6 "os"
7
8 aai "github.com/AssemblyAI/assemblyai-go-sdk"
9)
10
11func main() {
12 ctx := context.Background()
13
14 audioURL := "https://assembly.ai/sports_injuries.mp3"
15
16 client := aai.NewClient("YOUR_API_KEY")
17
18 params := &aai.TranscriptOptionalParams{
19 SpeakerLabels: aai.Bool(true),
20 }
21
22 transcript, err := client.Transcripts.TranscribeFromURL(ctx, audioURL, params)
23 if err != nil {
24 fmt.Println("Something bad happened:", err)
25 os.Exit(1)
26 }
27
28 fmt.Println(*transcript.Text)
29
30 for _, utterance := range transcript.Utterances {
31 fmt.Printf("Speaker %v: %v\n", *utterance.Speaker, *utterance.Text)
32 }
33}
1import com.assemblyai.api.AssemblyAI;
2import com.assemblyai.api.resources.transcripts.types.*;
3
4public final class App {
5 public static void main(String[] args) {
6 AssemblyAI client = AssemblyAI.builder()
7 .apiKey("YOUR_API_KEY")
8 .build();
9
10 String audioUrl = "https://assembly.ai/sports_injuries.mp3";
11
12 var params = TranscriptOptionalParams.builder()
13 .speakerLabels(true)
14 .build();
15
16 Transcript transcript = client.transcripts().transcribe(audioUrl, params);
17
18 System.out.println(transcript.getText().get());
19
20 transcript.getUtterances().get().forEach(utterance ->
21 System.out.println("Speaker " + utterance.getSpeaker() + ": " + utterance.getText())
22 );
23 }
24}
1require 'assemblyai'
2
3client = AssemblyAI::Client.new(api_key: 'YOUR_API_KEY')
4
5audio_url = 'https://assembly.ai/sports_injuries.mp3'
6
7transcript = client.transcripts.transcribe(
8 audio_url: audio_url,
9 speaker_labels: true
10)
11
12abort transcript.error if transcript.status == AssemblyAI::Transcripts::TranscriptStatus::ERROR
13
14puts transcript.text
15
16transcript.utterances.each do |utterance|
17 printf('Speaker %<speaker>s: %<text>s', speaker: utterance.speaker, text: utterance.text)
18end
Improve transcription quality and readability
Sentence-level speaker consistency ensures the last word of a sentence never bleeds into the next speaker. Correctly transcribe and diarize conversations with 30+ speakers in 95+ languages.
Short utterances like 'Yeah,' 'Okay,' and single-word responses are correctly attributed in multi-speaker settings.
Build confidently with the most accurate, production-grade speaker diarization

Unlike turn-level diarization, every word carries its own speaker label, so when speakers overlap or change mid-sentence, attribution stays accurate. No more misattributed words corrupting your transcripts.

2x better cpWER than Deepgram Nova-3 on streaming two-speaker telephony. 13% better on four-speaker meetings. 42% fewer false-alarm speakers. Built and benchmarked for the conversations that matter most in production.
Make every voice count—built for what production apps actually need.
Eliminate phantom turns | Detect speaker changes mid-sentence | Real-time diarization for streaming apps | Accurate summarization per speaker |
Ship AI notetakers that just work | Stop users from repeating themselves | Reliable contact center inputs | 95+ languages, streaming and async |
AssemblyAI’s managed API endpoint and diarization won me over—something Whisper couldn’t provide.
Start building with the most accurate diarization API
66% fewer false speakers. 91% fewer phantom turns. Word-level attribution out of the box. Streaming and async, free to start.















