ASR Integration Guide
2026-03-21
Patterns for integrating ASR into larger pipelines: a local voice assistant, llama-swap routing, a FastAPI service, WebSocket streaming, subtitle generation, and a Docker Compose deployment.
Full Voice Assistant (VAD + ASR + LLM)
Connect the microphone → VAD → Whisper → llama.cpp/llama-swap chain to build a local, offline voice assistant.
# voice_assistant.py
"""
Full voice assistant pipeline:
Mic → SileroVAD → faster-whisper → llama.cpp (via subprocess) → TTS
"""
import threading
import queue
import subprocess
import numpy as np
import sounddevice as sd
import torch
from faster_whisper import WhisperModel
# ── VAD ───────────────────────────────────────────────────────────────────────
class SileroVAD:
def __init__(self, sr=16000, threshold=0.5):
self.sr = sr
self.threshold = threshold
model, utils = torch.hub.load(
"snakers4/silero-vad", "silero_vad", force_reload=False
)
self.model = model
self.model.eval()
def is_speech(self, pcm: np.ndarray) -> bool:
with torch.no_grad():
return self.model(torch.tensor(pcm), self.sr).item() > self.threshold
# ── LLM via llama.cpp CLI ─────────────────────────────────────────────────────
def ask_llm(prompt: str, model_path: str) -> str:
"""Call llama-cli (llama.cpp) with a prompt and return the response."""
result = subprocess.run(
[
"llama-cli",
"-m", model_path,
"-n", "256",
"--no-display-prompt",
"-p", prompt,
],
capture_output=True, text=True, timeout=60,
)
return result.stdout.strip()
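# Alternative: instead of shelling out to llama-cli per request, keep a
# llama-server or llama-swap instance running and POST to its
# OpenAI-compatible HTTP API (see the llama-swap section below).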
# ── Voice Assistant ───────────────────────────────────────────────────────────
class VoiceAssistant:
def __init__(self,
whisper_model: str = "base.en",
llm_model_path: str = "models/mistral-7b-q4.gguf"):
self.sr = 16000
        self.chunk_ms = 32   # 512 samples at 16 kHz; recent silero-vad releases require 512-sample chunks
        self.chunk_size = int(self.sr * self.chunk_ms / 1000)  # = 512
self.vad = SileroVAD(sr=self.sr)
print("Loading Whisper...")
self.asr = WhisperModel(whisper_model, device="cpu", compute_type="int8")
self.llm_path = llm_model_path
self._q: queue.Queue = queue.Queue()
self._running = False
self._speech_buf = []
self._silence_count = 0
self._speaking = False
        self.SILENCE_CHUNKS = 20  # ~640 ms of trailing silence ends the utterance
def _transcribe(self, audio: np.ndarray) -> str:
segs, _ = self.asr.transcribe(audio, beam_size=5, language="en")
return " ".join(s.text.strip() for s in segs).strip()
def _handle_utterance(self, audio: np.ndarray):
print("\n[ASR] transcribing...")
text = self._transcribe(audio)
if not text:
return
print(f"[USER] {text}")
prompt = (
"You are a helpful voice assistant. Reply concisely in 1-2 sentences.\n"
f"User: {text}\nAssistant:"
)
print("[LLM] thinking...")
response = ask_llm(prompt, self.llm_path)
print(f"[ASSISTANT] {response}")
def _process(self):
while self._running:
try:
chunk = self._q.get(timeout=0.1)
except queue.Empty:
continue
if self.vad.is_speech(chunk):
self._silence_count = 0
self._speaking = True
self._speech_buf.append(chunk)
elif self._speaking:
self._speech_buf.append(chunk)
self._silence_count += 1
if self._silence_count >= self.SILENCE_CHUNKS:
audio = np.concatenate(self._speech_buf)
threading.Thread(
target=self._handle_utterance, args=(audio,), daemon=True
).start()
self._speech_buf.clear()
self._speaking = False
self._silence_count = 0
def _mic_cb(self, indata, frames, t, status):
self._q.put(indata[:, 0].copy())
def run(self):
self._running = True
threading.Thread(target=self._process, daemon=True).start()
with sd.InputStream(
samplerate=self.sr, channels=1, dtype="float32",
blocksize=self.chunk_size, callback=self._mic_cb,
):
print("Voice assistant ready. Speak now (Ctrl+C to quit).")
try:
import time
while True:
time.sleep(0.1)
except KeyboardInterrupt:
self._running = False
if __name__ == "__main__":
assistant = VoiceAssistant(
whisper_model="base.en",
llm_model_path="models/mistral-7b-instruct-q4_K_M.gguf",
)
    assistant.run()

llama-swap Integration (Multi-Model Routing)
Use llama-swap's OpenAI-compatible /v1/chat/completions endpoint alongside ASR:
# asr_llamaswap.py
import httpx
import asyncio
LLAMA_SWAP_URL = "http://localhost:8080"
async def ask_llamaswap(text: str, model: str = "mistral") -> str:
"""Send transcribed text to llama-swap and get a response."""
async with httpx.AsyncClient(timeout=60) as client:
resp = await client.post(
f"{LLAMA_SWAP_URL}/v1/chat/completions",
json={
"model": model,
"messages": [
{"role": "system", "content": "Reply concisely in 1-2 sentences."},
{"role": "user", "content": text},
],
"max_tokens": 256,
"stream": False,
},
)
resp.raise_for_status()
return resp.json()["choices"][0]["message"]["content"]
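# Note: set "stream": True and read the response incrementally (e.g. with
# client.stream(...)) if you want token-by-token output; omitted here for brevity.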
async def voice_to_llm(audio_file: str, llm_model: str = "mistral"):
from faster_whisper import WhisperModel
asr_model = WhisperModel("base.en", device="cpu", compute_type="int8")
segs, _ = asr_model.transcribe(audio_file, beam_size=5)
transcript = " ".join(s.text.strip() for s in segs)
print(f"Transcribed: {transcript}")
response = await ask_llamaswap(transcript, model=llm_model)
print(f"LLM reply: {response}")
return response
if __name__ == "__main__":
    asyncio.run(voice_to_llm("question.wav", llm_model="mistral"))

FastAPI ASR Endpoint
# api.py
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.responses import JSONResponse
from faster_whisper import WhisperModel
import tempfile
import os
import shutil
app = FastAPI(title="ASR API")
_model: WhisperModel | None = None
def get_model() -> WhisperModel:
    global _model
    if _model is None:
        # Configurable via env vars so docker-compose.yml's WHISPER_MODEL/DEVICE/COMPUTE_TYPE take effect
        _model = WhisperModel(
            os.environ.get("WHISPER_MODEL", "base.en"),
            device=os.environ.get("DEVICE", "cpu"),
            compute_type=os.environ.get("COMPUTE_TYPE", "int8"),
        )
    return _model
@app.post("/transcribe")
async def transcribe_audio(
file: UploadFile = File(...),
language: str | None = Form(default=None),
word_timestamps: bool = Form(default=False),
):
"""
Upload an audio file (WAV, MP3, FLAC, OGG, M4A) and receive a transcript.
"""
allowed_types = {
"audio/wav", "audio/mpeg", "audio/flac",
"audio/ogg", "audio/x-m4a", "audio/mp4",
}
if file.content_type and file.content_type not in allowed_types:
raise HTTPException(
status_code=415,
detail=f"Unsupported media type: {file.content_type}",
)
# Save to a temp file (faster-whisper needs a path)
suffix = os.path.splitext(file.filename or "audio.wav")[1] or ".wav"
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
shutil.copyfileobj(file.file, tmp)
tmp_path = tmp.name
try:
model = get_model()
segments, info = model.transcribe(
tmp_path,
language=language,
beam_size=5,
word_timestamps=word_timestamps,
vad_filter=True,
)
seg_list = []
for seg in segments:
s = {
"start": round(seg.start, 3),
"end": round(seg.end, 3),
"text": seg.text.strip(),
}
if word_timestamps and seg.words:
s["words"] = [
{"word": w.word, "start": round(w.start, 3), "end": round(w.end, 3), "prob": round(w.probability, 3)}
for w in seg.words
]
seg_list.append(s)
full_text = " ".join(s["text"] for s in seg_list)
return JSONResponse({
"text": full_text,
"language": info.language,
"language_probability": round(info.language_probability, 3),
"duration": round(info.duration, 3),
"segments": seg_list,
})
finally:
        os.unlink(tmp_path)

Run with:
uvicorn api:app --host 0.0.0.0 --port 8001 --reload

Test:
curl -X POST http://localhost:8001/transcribe \
  -F "file=@speech.wav" \
  -F "word_timestamps=true"

WebSocket Streaming Transcription
# ws_server.py
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
import numpy as np
import json
from faster_whisper import WhisperModel
app = FastAPI()
asr_model = WhisperModel("base.en", device="cpu", compute_type="int8")
@app.websocket("/ws/transcribe")
async def ws_transcribe(ws: WebSocket):
"""
Accept float32 PCM chunks over WebSocket.
Send back partial and final transcripts as JSON.
Client sends: binary frames of float32 audio at 16kHz
Server sends: {"type": "partial"|"final", "text": "..."}
"""
await ws.accept()
buffer = []
silence_frames = 0
MAX_SILENCE = 20 # 20 × 30ms = 600ms
SR = 16000
    CHUNK_SAMPLES = int(SR * 0.03)  # 30 ms; expected client frame size (informational only)
try:
while True:
data = await ws.receive_bytes()
chunk = np.frombuffer(data, dtype=np.float32)
energy = float(np.sqrt(np.mean(chunk ** 2)))
is_speech = energy > 0.005
if is_speech:
silence_frames = 0
buffer.append(chunk)
elif buffer:
silence_frames += 1
buffer.append(chunk)
if silence_frames >= MAX_SILENCE:
audio = np.concatenate(buffer)
segs, _ = asr_model.transcribe(audio, beam_size=5)
text = " ".join(s.text.strip() for s in segs)
if text.strip():
await ws.send_text(json.dumps({
"type": "final",
"text": text,
}))
buffer.clear()
silence_frames = 0
elif len(buffer) % 10 == 0:
                    # Partial update roughly every 300 ms while the speaker pauses
partial_audio = np.concatenate(buffer)
segs, _ = asr_model.transcribe(partial_audio, beam_size=1)
text = " ".join(s.text.strip() for s in segs)
if text.strip():
await ws.send_text(json.dumps({
"type": "partial",
"text": text,
}))
except WebSocketDisconnect:
        pass
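A quick way to exercise this endpoint is a small client that streams a 16 kHz mono WAV in 30 ms frames and prints whatever the server sends back. This is a minimal sketch under assumptions not stated above: the server runs at ws://localhost:8000/ws/transcribe, and the `websockets` and `soundfile` packages are installed.

# ws_client.py (hypothetical test client for the endpoint above)
import asyncio
import json
import numpy as np
import soundfile as sf
import websockets

async def stream_file(path: str, url: str = "ws://localhost:8000/ws/transcribe"):
    audio, sr = sf.read(path, dtype="float32")
    if audio.ndim > 1:
        audio = audio[:, 0]                       # use the first channel only
    assert sr == 16000, "server expects 16 kHz audio"
    frame = int(sr * 0.03)                        # 30 ms frames, matching the server
    async with websockets.connect(url) as ws:
        for i in range(0, len(audio), frame):
            await ws.send(audio[i:i + frame].astype(np.float32).tobytes())
            try:
                # Drain any partial/final transcripts already available
                while True:
                    msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=0.01))
                    print(f"[{msg['type']}] {msg['text']}")
            except asyncio.TimeoutError:
                pass
            await asyncio.sleep(0.03)             # roughly real-time pacing
        # Flush: send ~700 ms of silence so the server emits its final transcript
        silence = np.zeros(frame, dtype=np.float32).tobytes()
        for _ in range(24):
            await ws.send(silence)
            await asyncio.sleep(0.03)
        try:
            while True:
                msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=1.0))
                print(f"[{msg['type']}] {msg['text']}")
        except asyncio.TimeoutError:
            pass

if __name__ == "__main__":
    asyncio.run(stream_file("speech.wav"))

SRT Subtitle Generation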
# generate_subtitles.py
from faster_whisper import WhisperModel
import sys
def seconds_to_srt_time(t: float) -> str:
h = int(t // 3600)
m = int((t % 3600) // 60)
s = int(t % 60)
ms = int((t % 1) * 1000)
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
def generate_srt(video_path: str, output_srt: str,
model_size: str = "small",
                 language: str | None = None):
model = WhisperModel(model_size, device="cpu", compute_type="int8")
segments, info = model.transcribe(
video_path,
language=language,
beam_size=5,
word_timestamps=False,
)
print(f"Detected language: {info.language} ({info.language_probability:.1%})")
with open(output_srt, "w", encoding="utf-8") as f:
for i, seg in enumerate(segments, 1):
f.write(f"{i}\n")
f.write(f"{seconds_to_srt_time(seg.start)} --> {seconds_to_srt_time(seg.end)}\n")
f.write(f"{seg.text.strip()}\n\n")
print(f"Saved: {output_srt}")
def generate_vtt(video_path: str, output_vtt: str,
model_size: str = "small"):
model = WhisperModel(model_size, device="cpu", compute_type="int8")
segments, _ = model.transcribe(video_path, beam_size=5)
def vtt_time(t: float) -> str:
h = int(t // 3600)
m = int((t % 3600) // 60)
s = t % 60
return f"{h:02d}:{m:02d}:{s:06.3f}"
with open(output_vtt, "w", encoding="utf-8") as f:
f.write("WEBVTT\n\n")
for seg in segments:
f.write(f"{vtt_time(seg.start)} --> {vtt_time(seg.end)}\n")
f.write(f"{seg.text.strip()}\n\n")
print(f"Saved: {output_vtt}")
if __name__ == "__main__":
video = sys.argv[1] if len(sys.argv) > 1 else "video.mp4"
generate_srt(video, video.rsplit(".", 1)[0] + ".srt")
    generate_vtt(video, video.rsplit(".", 1)[0] + ".vtt")

Docker Compose Pipeline
Run llama-swap and the ASR API as a pair of services:
# docker-compose.yml
version: "3.9"
services:
llama-swap:
image: ghcr.io/mostlygeek/llama-swap:latest
volumes:
- ./models:/models
- ./llama-swap-config.yaml:/config.yaml:ro
ports:
- "8080:8080"
environment:
- CONFIG_FILE=/config.yaml
restart: unless-stopped
asr-api:
build:
context: .
dockerfile: Dockerfile.asr
ports:
- "8001:8001"
volumes:
- ./models:/models # shared model storage
environment:
- WHISPER_MODEL=base.en
- DEVICE=cpu
- COMPUTE_TYPE=int8
depends_on:
- llama-swap
    restart: unless-stopped

# Dockerfile.asr
FROM python:3.11-slim
RUN apt-get update && apt-get install -y ffmpeg libsndfile1 && rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY requirements.txt .
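# requirements.txt should list at least: fastapi, uvicorn[standard], faster-whisper,
# and python-multipart (required by FastAPI for file uploads)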
RUN pip install --no-cache-dir -r requirements.txt
COPY api.py .
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8001"]

See Also
- ASR Real-Time Streaming
- ASR Implementation
- VAD Integration Guide
- llama-swap Documentation
- llama.cpp Server Guide
- TTS Integration Guide — Companion integration guide for synthesizing the LLM response back to speech