Extract Quotes with Timestamps Using LLM Gateway + Semantic Search

This guide will demonstrate how to use AssemblyAI’s LLM Gateway framework to process an audio file and find the best quotes included in it through Semantic Search.

Quickstart

1import datetime
2import numpy as np
3import requests
4import time
5from sklearn.neighbors import NearestNeighbors
6from sentence_transformers import SentenceTransformer
7
# Configuration
api_key = "<YOUR_API_KEY>"  # AssemblyAI API key from your dashboard
base_url = "https://api.assemblyai.com"
headers = {"authorization": api_key}  # auth header reused by every request below
12
def upload_file(file_path):
    """Stream a local audio file to AssemblyAI's upload endpoint.

    Returns the temporary URL AssemblyAI assigns to the uploaded media,
    suitable for passing to the transcript endpoint. Raises
    requests.HTTPError on a non-200 response.
    """
    with open(file_path, "rb") as audio_file:
        response = requests.post(f"{base_url}/v2/upload", headers=headers, data=audio_file)
    if response.status_code != 200:
        print(f"Error uploading: {response.status_code}, {response.text}")
        response.raise_for_status()
    return response.json()["upload_url"]
21
def transcribe_audio(audio_url):
    """Request a transcript for *audio_url* and block until it finishes.

    Polls the transcript endpoint every 3 seconds and returns the completed
    transcript JSON. Raises RuntimeError if transcription fails, and
    requests.HTTPError if the submission itself is rejected.
    """
    payload = {
        "audio_url": audio_url,
        "auto_highlights": False,
        "sentiment_analysis": False,
        "entity_detection": False
    }

    response = requests.post(f"{base_url}/v2/transcript", headers=headers, json=payload)

    if response.status_code != 200:
        print(f"Error submitting transcription: {response.status_code}, {response.text}")
        response.raise_for_status()

    transcript_id = response.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    print("Transcribing...")
    while True:
        transcript = requests.get(polling_endpoint, headers=headers).json()
        status = transcript["status"]
        # Terminal states first; anything else means "still working".
        if status == "error":
            raise RuntimeError(f"Transcription failed: {transcript['error']}")
        if status == "completed":
            print("Transcription completed!")
            return transcript
        time.sleep(3)
50
def get_sentences(transcript_id):
    """Fetch the sentence-level breakdown of a completed transcript.

    Returns the list of sentence objects (each with its own text and
    timestamps). Raises requests.HTTPError on a non-200 response.
    """
    response = requests.get(
        f"{base_url}/v2/transcript/{transcript_id}/sentences", headers=headers
    )
    if response.status_code != 200:
        print(f"Error getting sentences: {response.status_code}, {response.text}")
        response.raise_for_status()
    return response.json()["sentences"]
61
def process_with_llm_gateway(transcript_text, question, context=""):
    """Ask the AssemblyAI LLM Gateway a question about a transcript.

    Parameters:
        transcript_text: full transcript text used to ground the answer.
        question: the question to ask about the transcript.
        context: optional extra instructions appended to the prompt.

    Returns the model's answer as a string. Raises requests.HTTPError on a
    non-200 response and RuntimeError when the gateway reports an error in
    its JSON body.
    """
    prompt = f"""Based on the following transcript, please answer this question:
    Question: {question}
    Context: {context}
    Transcript: {transcript_text}
    Please provide a clear and specific answer."""

    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": 2000
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data
    )

    # Fail loudly on HTTP-level errors (same pattern as the other API
    # helpers); a non-200 body may not even be JSON.
    if response.status_code != 200:
        print(f"Error calling LLM Gateway: {response.status_code}, {response.text}")
        response.raise_for_status()

    result = response.json()

    # The gateway can return 200 with an application-level error payload.
    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    return result['choices'][0]['message']['content']
93
def sliding_window(elements, distance, stride):
    """Group *elements* into overlapping windows.

    Each window holds up to ``distance`` consecutive elements, and the
    window start advances by ``distance - stride`` each step, so successive
    windows share ``stride`` elements. The tail of the sequence is always
    covered: the final window may be shorter than ``distance``, but no
    element is dropped (the previous ``idx + distance < len`` loop silently
    discarded trailing elements — e.g. a 3-element input with distance=3
    produced no windows at all).

    Raises:
        ValueError: if ``distance`` is not positive, or ``stride >=
            distance`` (the window would never advance).
    """
    if distance <= 0:
        raise ValueError("distance must be positive")
    if stride >= distance:
        raise ValueError("stride must be smaller than distance")

    step = distance - stride
    windows = []
    idx = 0
    while idx < len(elements):
        windows.append(elements[idx:idx + distance])
        if idx + distance >= len(elements):
            break  # this window already reached the end of the sequence
        idx += step
    return windows
102
# Main execution
# If using a local file:
audio_url = upload_file("<YOUR_AUDIO_FILE>")

# If using a public URL:
# audio_url = "<YOUR_AUDIO_URL>"

# Transcribe audio
transcript = transcribe_audio(audio_url)
transcript_text = transcript["text"]
transcript_id = transcript["id"]

# Get sentences (each sentence carries its own start/end timestamps)
print("Getting sentences...")
sentences = get_sentences(transcript_id)

# Initialize embedder
embedder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
embeddings = {}

# Create sliding window of sentences and generate embeddings.
# Windows of 5 sentences overlapping by 2 keep local context together.
print("Creating embeddings...")
sentence_groups = sliding_window(sentences, 5, 2)

for sentence_group in sentence_groups:
    combined_text = " ".join([sentence["text"] for sentence in sentence_group])
    start = sentence_group[0]["start"]
    end = sentence_group[-1]["end"]

    # Key each embedding by (start, end, transcript_id, text) so the
    # metadata can be recovered after the nearest-neighbour lookup below.
    embeddings[(start, end, transcript_id, combined_text)] = embedder.encode(combined_text)

# Use LLM Gateway to find the best quotes
print("Asking LLM Gateway for best quotes...")
question = "What are the 3 best quotes from this video?"
context = "Please provide exactly 3 quotes."

llm_answer = process_with_llm_gateway(transcript_text, question, context)
print(f"\nLLM Gateway Response:\n{llm_answer}\n")

# Embed the LLM output so it can be compared against the transcript windows
llm_gateway_embedding = embedder.encode(llm_answer)

# Vectorize transcript embeddings; dict preserves insertion order, so
# values() and keys() line up index-for-index.
np_embeddings = np.array(list(embeddings.values()))
metadata = list(embeddings.keys())

# Find the top 3 most similar quotes
print("Finding matching quotes in transcript...")
knn = NearestNeighbors(n_neighbors=3, metric="cosine")
knn.fit(np_embeddings)
distances, indices = knn.kneighbors([llm_gateway_embedding])

matches = []
for distance, index in zip(distances[0], indices[0]):
    result_metadata = metadata[index]
    matches.append(
        {
            "start_timestamp": result_metadata[0],
            "end_timestamp": result_metadata[1],
            "transcript_id": result_metadata[2],
            "text": result_metadata[3],
            # cosine distance; 1 - distance gives a rough similarity score
            "confidence": 1 - distance,
        }
    )

# Display results
print("\n" + "="*80)
print("BEST MATCHING QUOTES FROM TRANSCRIPT:")
print("="*80 + "\n")

for index, m in enumerate(matches):
    print('QUOTE #{}: "{}"'.format(index + 1, m['text']))
    # Timestamps are in milliseconds; timedelta wants seconds.
    print('START TIMESTAMP:', str(datetime.timedelta(seconds=m['start_timestamp']/1000)))
    print('END TIMESTAMP:', str(datetime.timedelta(seconds=m['end_timestamp']/1000)))
    print('CONFIDENCE:', m['confidence'])
    print()

Getting Started

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for an AssemblyAI account and get your API key from your dashboard.

You’ll also need to install a few libraries that this code depends on:

$pip install -U requests numpy scikit-learn sentence-transformers

Step-by-Step Instructions

Then import all of these libraries and set our AssemblyAI API key, headers, and base URL.

1import datetime
2import numpy as np
3import requests
4import time
5from sklearn.neighbors import NearestNeighbors
6from sentence_transformers import SentenceTransformer
7
# Configuration
api_key = "<YOUR_API_KEY>"  # AssemblyAI API key from your dashboard
base_url = "https://api.assemblyai.com"
headers = {"authorization": api_key}  # auth header reused by every request below

Next, define functions to upload and transcribe files using AssemblyAI’s Async API, as well as request sentences.

def upload_file(file_path):
    """Stream a local audio file to AssemblyAI's upload endpoint.

    Returns the temporary URL AssemblyAI assigns to the uploaded media,
    suitable for passing to the transcript endpoint. Raises
    requests.HTTPError on a non-200 response.
    """
    with open(file_path, "rb") as audio_file:
        response = requests.post(f"{base_url}/v2/upload", headers=headers, data=audio_file)
    if response.status_code != 200:
        print(f"Error uploading: {response.status_code}, {response.text}")
        response.raise_for_status()
    return response.json()["upload_url"]
9
def transcribe_audio(audio_url):
    """Request a transcript for *audio_url* and block until it finishes.

    Polls the transcript endpoint every 3 seconds and returns the completed
    transcript JSON. Raises RuntimeError if transcription fails, and
    requests.HTTPError if the submission itself is rejected.
    """
    payload = {
        "audio_url": audio_url,
        "auto_highlights": False,
        "sentiment_analysis": False,
        "entity_detection": False
    }

    response = requests.post(f"{base_url}/v2/transcript", headers=headers, json=payload)

    if response.status_code != 200:
        print(f"Error submitting transcription: {response.status_code}, {response.text}")
        response.raise_for_status()

    transcript_id = response.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    print("Transcribing...")
    while True:
        transcript = requests.get(polling_endpoint, headers=headers).json()
        status = transcript["status"]
        # Terminal states first; anything else means "still working".
        if status == "error":
            raise RuntimeError(f"Transcription failed: {transcript['error']}")
        if status == "completed":
            print("Transcription completed!")
            return transcript
        time.sleep(3)
38
def get_sentences(transcript_id):
    """Fetch the sentence-level breakdown of a completed transcript.

    Returns the list of sentence objects (each with its own text and
    timestamps). Raises requests.HTTPError on a non-200 response.
    """
    response = requests.get(
        f"{base_url}/v2/transcript/{transcript_id}/sentences", headers=headers
    )
    if response.status_code != 200:
        print(f"Error getting sentences: {response.status_code}, {response.text}")
        response.raise_for_status()
    return response.json()["sentences"]

Then define a function to process each transcript text with LLM Gateway.

def process_with_llm_gateway(transcript_text, question, context=""):
    """Ask the AssemblyAI LLM Gateway a question about a transcript.

    Parameters:
        transcript_text: full transcript text used to ground the answer.
        question: the question to ask about the transcript.
        context: optional extra instructions appended to the prompt.

    Returns the model's answer as a string. Raises requests.HTTPError on a
    non-200 response and RuntimeError when the gateway reports an error in
    its JSON body.
    """
    prompt = f"""Based on the following transcript, please answer this question:
    Question: {question}
    Context: {context}
    Transcript: {transcript_text}
    Please provide a clear and specific answer."""

    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": 2000
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data
    )

    # Fail loudly on HTTP-level errors (same pattern as the other API
    # helpers); a non-200 body may not even be JSON.
    if response.status_code != 200:
        print(f"Error calling LLM Gateway: {response.status_code}, {response.text}")
        response.raise_for_status()

    result = response.json()

    # The gateway can return 200 with an application-level error payload.
    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    return result['choices'][0]['message']['content']

Define a function to implement a sliding window, which allows us to group sentences together in different combinations to retain their semantic meaning and context while also enabling us to customize the length (and thus duration) of the quotes.

def sliding_window(elements, distance, stride):
    """Group *elements* into overlapping windows.

    Each window holds up to ``distance`` consecutive elements, and the
    window start advances by ``distance - stride`` each step, so successive
    windows share ``stride`` elements. The tail of the sequence is always
    covered: the final window may be shorter than ``distance``, but no
    element is dropped (the previous ``idx + distance < len`` loop silently
    discarded trailing elements — e.g. a 3-element input with distance=3
    produced no windows at all).

    Raises:
        ValueError: if ``distance`` is not positive, or ``stride >=
            distance`` (the window would never advance).
    """
    if distance <= 0:
        raise ValueError("distance must be positive")
    if stride >= distance:
        raise ValueError("stride must be smaller than distance")

    step = distance - stride
    windows = []
    idx = 0
    while idx < len(elements):
        windows.append(elements[idx:idx + distance])
        if idx + distance >= len(elements):
            break  # this window already reached the end of the sequence
        idx += step
    return windows

Execute all upload and transcription functions.

# Main execution
# If using a local file:
audio_url = upload_file("<YOUR_AUDIO_FILE>")

# If using a public URL:
# audio_url = "<YOUR_AUDIO_URL>"

# Transcribe audio
transcript = transcribe_audio(audio_url)
transcript_text = transcript["text"]
transcript_id = transcript["id"]

# Get sentences (each sentence carries its own start/end timestamps)
print("Getting sentences...")
sentences = get_sentences(transcript_id)

Now we can iterate over all of the sentences in our transcript and create embeddings for them to use as part of our Semantic Search later.

We’ll be relying on SentenceTransformer’s multi-qa-mpnet-base-dot-v1 model, which has been fine-tuned specifically for Semantic Search, and is their highest-performing model for this task.

By default, we’ll group 5 sentences together while having 2 of them overlap when the window moves. This should give us quotes around 30 seconds in length at most.

# Initialize embedder
embedder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
embeddings = {}

# Create sliding window of sentences and generate embeddings.
# Windows of 5 sentences overlapping by 2 keep local context together.
print("Creating embeddings...")
sentence_groups = sliding_window(sentences, 5, 2)

for sentence_group in sentence_groups:
    # One embedding per window; key it by (start, end, transcript_id, text)
    # so the metadata can be recovered after the nearest-neighbour lookup.
    combined_text = " ".join([sentence["text"] for sentence in sentence_group])
    start = sentence_group[0]["start"]
    end = sentence_group[-1]["end"]

    embeddings[(start, end, transcript_id, combined_text)] = embedder.encode(combined_text)

Now we can query LLM Gateway to provide the type of quotes we want. In this case, let’s prompt LLM Gateway to find the best 3 quotes out of a video that we transcribed.

# Ask the LLM for quote candidates; the semantic search below maps its
# answer back onto exact transcript spans with timestamps.
print("Asking LLM Gateway for best quotes...")
question = "What are the 3 best quotes from this video?"
context = "Please provide exactly 3 quotes."

llm_answer = process_with_llm_gateway(transcript_text, question, context)
print(f"\nLLM Gateway Response:\n{llm_answer}\n")

Now we can take the embeddings from the transcript text, as well as the embeddings from LLM Gateway’s output, and use them in our k-nearest neighbors algorithm to determine their similarity. The most similar quotes to what LLM Gateway identified will be surfaced as our 3 best quotes, along with their timestamps and confidence scores.

We’ll be relying on cosine similarity rather than the default Euclidean distance metric since it compares only the direction of our vectors and is insensitive to their magnitude, which makes it well suited to comparing text embeddings.

# Embed the LLM output so it can be compared against the transcript windows
llm_gateway_embedding = embedder.encode(llm_answer)

# Vectorize transcript embeddings; dict preserves insertion order, so
# values() and keys() line up index-for-index.
np_embeddings = np.array(list(embeddings.values()))
metadata = list(embeddings.keys())

# Find the top 3 most similar quotes
print("Finding matching quotes in transcript...")
knn = NearestNeighbors(n_neighbors=3, metric="cosine")
knn.fit(np_embeddings)
distances, indices = knn.kneighbors([llm_gateway_embedding])

matches = []
for distance, index in zip(distances[0], indices[0]):
    # Recover the window metadata that was used as the dict key above.
    result_metadata = metadata[index]
    matches.append(
        {
            "start_timestamp": result_metadata[0],
            "end_timestamp": result_metadata[1],
            "transcript_id": result_metadata[2],
            "text": result_metadata[3],
            # cosine distance; 1 - distance gives a rough similarity score
            "confidence": 1 - distance,
        }
    )

# Display results
print("\n" + "="*80)
print("BEST MATCHING QUOTES FROM TRANSCRIPT:")
print("="*80 + "\n")

for index, m in enumerate(matches):
    print('QUOTE #{}: "{}"'.format(index + 1, m['text']))
    # Timestamps are in milliseconds; timedelta wants seconds.
    print('START TIMESTAMP:', str(datetime.timedelta(seconds=m['start_timestamp']/1000)))
    print('END TIMESTAMP:', str(datetime.timedelta(seconds=m['end_timestamp']/1000)))
    print('CONFIDENCE:', m['confidence'])
    print()