TTS Implementation
2026-03-21
Complete, runnable Python implementations for each major TTS library. Each section is self-contained and can be run directly once the corresponding package is installed.
Kokoro (Recommended — Fastest Offline)
Basic Synthesis
# kokoro_basic.py
from kokoro import KPipeline
import numpy as np
import soundfile as sf

def synthesize(text: str,
               voice: str = "af_heart",
               speed: float = 1.0,
               lang_code: str = "a",
               output_path: str = "output.wav") -> np.ndarray:
    """
    Synthesize text to a WAV file using Kokoro.

    lang_code: "a"=American EN, "b"=British EN, "j"=Japanese,
               "z"=Mandarin, "e"=Spanish, "f"=French, "h"=Hindi,
               "i"=Italian, "p"=Brazilian PT
    """
    pipeline = KPipeline(lang_code=lang_code)
    audio_chunks = []
    for graphemes, phonemes, audio in pipeline(text, voice=voice, speed=speed):
        audio_chunks.append(audio)
    full_audio = np.concatenate(audio_chunks)
    sf.write(output_path, full_audio, 24000)  # Kokoro outputs 24 kHz
    return full_audio

if __name__ == "__main__":
    audio = synthesize(
        "The quick brown fox jumps over the lazy dog.",
        voice="af_heart",
        output_path="kokoro_test.wav",
    )
    print(f"Generated {len(audio)/24000:.2f}s of audio")
Reusing the Pipeline (Model Stays Loaded)
# kokoro_reuse.py
from kokoro import KPipeline
import numpy as np
import soundfile as sf
from typing import Iterator, Optional

class KokoroTTS:
    """Keeps Kokoro loaded in memory for repeated synthesis calls."""

    def __init__(self, lang_code: str = "a", voice: str = "af_heart"):
        self.pipeline = KPipeline(lang_code=lang_code)
        self.default_voice = voice
        self.sample_rate = 24000

    def synthesize(self, text: str, voice: Optional[str] = None, speed: float = 1.0) -> np.ndarray:
        voice = voice or self.default_voice
        chunks = [audio for _, _, audio in self.pipeline(text, voice=voice, speed=speed)]
        return np.concatenate(chunks)

    def synthesize_to_file(self, text: str, path: str,
                           voice: Optional[str] = None, speed: float = 1.0):
        audio = self.synthesize(text, voice, speed)
        sf.write(path, audio, self.sample_rate)

    def stream(self, text: str, voice: Optional[str] = None, speed: float = 1.0) -> Iterator[np.ndarray]:
        """Yield audio chunks sentence by sentence."""
        voice = voice or self.default_voice
        for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
            yield audio

if __name__ == "__main__":
    tts = KokoroTTS(voice="am_michael")
    tts.synthesize_to_file("Hello from Kokoro!", "hello.wav")
    tts.synthesize_to_file("British accent test.", "british.wav", voice="bm_george")
Multi-Voice Batch
# kokoro_batch.py
import csv
from pathlib import Path
from kokoro import KPipeline
import numpy as np
import soundfile as sf

def batch_synthesize(manifest_csv: str, output_dir: str,
                     voice: str = "af_heart", lang: str = "a"):
    """
    CSV format: text,filename (filename should include the .wav extension).
    Synthesizes every row and saves it to output_dir/<filename>.
    """
    pipeline = KPipeline(lang_code=lang)
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    with open(manifest_csv) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            chunks = [a for _, _, a in pipeline(row["text"], voice=voice)]
            audio = np.concatenate(chunks)
            out_path = str(Path(output_dir) / row["filename"])
            sf.write(out_path, audio, 24000)
            print(f"[{i+1}] {row['filename']} ({len(audio)/24000:.2f}s)")
Coqui XTTS-v2 (Voice Cloning)
# xtts_impl.py
from TTS.api import TTS
import numpy as np

class XTTSV2:
    def __init__(self, gpu: bool = False):
        self.tts = TTS(
            "tts_models/multilingual/multi-dataset/xtts_v2",
            gpu=gpu,
        )

    def clone_voice(self, text: str, reference_wav: str,
                    language: str = "en", output_path: str = "cloned.wav"):
        """Synthesize text in the voice of the reference_wav speaker."""
        self.tts.tts_to_file(
            text=text,
            speaker_wav=reference_wav,
            language=language,
            file_path=output_path,
        )

    def clone_voice_numpy(self, text: str, reference_wav: str,
                          language: str = "en") -> tuple[np.ndarray, int]:
        """Return (audio_array, sample_rate) without writing to a file."""
        wav = self.tts.tts(
            text=text,
            speaker_wav=reference_wav,
            language=language,
        )
        return np.array(wav, dtype=np.float32), 24000  # XTTS-v2 outputs 24 kHz

if __name__ == "__main__":
    xtts = XTTSV2(gpu=False)
    xtts.clone_voice(
        text="This voice was cloned from just a few seconds of reference audio.",
        reference_wav="my_voice_sample.wav",
        language="en",
        output_path="my_cloned_voice.wav",
    )
    print("Done! Check my_cloned_voice.wav")
F5-TTS (Best Open-Source Cloning Quality)
# f5tts_impl.py
from f5_tts.api import F5TTS
import soundfile as sf
import numpy as np
import re

class F5TTSEngine:
    def __init__(self):
        self.model = F5TTS()

    def synthesize(self,
                   gen_text: str,
                   ref_audio: str,
                   ref_text: str,
                   output_path: str = "f5_output.wav",
                   nfe_step: int = 32,
                   speed: float = 1.0) -> np.ndarray:
        """
        Args:
            gen_text: Text to generate
            ref_audio: Path to reference WAV (target voice, 3–15s)
            ref_text: Transcript of what is spoken in ref_audio
            nfe_step: Flow matching ODE steps (16=fast, 32=balanced, 64=best)
            speed: Speech rate multiplier
        """
        wav, sr, _ = self.model.infer(
            ref_file=ref_audio,
            ref_text=ref_text,
            gen_text=gen_text,
            nfe_step=nfe_step,
            cfg_strength=2.0,
            speed=speed,
            cross_fade_duration=0.15,
        )
        sf.write(output_path, wav, sr)
        return wav

    def synthesize_long(self, text: str, ref_audio: str, ref_text: str,
                        output_path: str = "f5_long.wav",
                        max_chars: int = 200) -> np.ndarray:
        """Split long text into chunks and concatenate."""
        sentences = re.split(r"(?<=[.!?])\s+", text)
        chunks = []
        current = ""
        sr = 24000  # F5-TTS output rate; overwritten by each infer() call
        for s in sentences:
            if len(current) + len(s) > max_chars:
                if current:
                    w, sr, _ = self.model.infer(
                        ref_file=ref_audio, ref_text=ref_text,
                        gen_text=current.strip(), nfe_step=32,
                    )
                    chunks.append(w)
                current = s
            else:
                current += " " + s
        if current.strip():
            w, sr, _ = self.model.infer(
                ref_file=ref_audio, ref_text=ref_text,
                gen_text=current.strip(), nfe_step=32,
            )
            chunks.append(w)
        full_audio = np.concatenate(chunks)
        sf.write(output_path, full_audio, sr)
        return full_audio

if __name__ == "__main__":
    engine = F5TTSEngine()
    engine.synthesize(
        gen_text="Hello! This is a demonstration of F5-TTS voice cloning.",
        ref_audio="reference.wav",
        ref_text="This is the text spoken in the reference audio file.",
        output_path="f5_demo.wav",
    )
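nfe_step is the main quality/latency knob, and the trade-off is hardware-dependent. A quick way to compare settings locally, reusing the class above (timings will vary):

# Benchmark sketch for the nfe_step quality/speed trade-off
import time

engine = F5TTSEngine()
for steps in (16, 32, 64):
    t0 = time.perf_counter()
    engine.synthesize("Benchmark sentence for step comparison.",
                      ref_audio="reference.wav",
                      ref_text="This is the text spoken in the reference audio file.",
                      output_path=f"f5_nfe{steps}.wav", nfe_step=steps)
    print(f"nfe_step={steps}: {time.perf_counter() - t0:.1f}s")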
Bark (Expressive / Creative)
# bark_impl.py
import os

# To use the small models (faster inference, lower quality), set this
# BEFORE importing bark — the flag is read at import time:
# os.environ["SUNO_USE_SMALL_MODELS"] = "True"

from bark import SAMPLE_RATE, generate_audio, preload_models
import soundfile as sf
import numpy as np

preload_models()

VOICE_PRESETS = {
    "male_1": "v2/en_speaker_6",
    "male_2": "v2/en_speaker_0",
    "female_1": "v2/en_speaker_9",
    "female_2": "v2/en_speaker_3",
    "british": "v2/en_speaker_5",
}

def bark_synthesize(text: str,
                    preset: str = "female_1",
                    output_path: str = "bark_out.wav") -> np.ndarray:
    """
    Special tokens supported in text:
        [laughter] [laughs] [sighs] [music] [gasps] [clears throat]
        ♪ song lyrics ♪   --- (dramatic pause)
        CAPS = emphasis
    """
    history_prompt = VOICE_PRESETS.get(preset, preset)
    audio = generate_audio(text, history_prompt=history_prompt)
    sf.write(output_path, audio, SAMPLE_RATE)
    return audio

if __name__ == "__main__":
    bark_synthesize(
        "[clears throat] Hello! [laughs] I am Bark — an expressive TTS model. ♪ la la la ♪",
        preset="female_1",
        output_path="bark_demo.wav",
    )
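Bark generates roughly 13 seconds of audio per call, so longer scripts need chunking. A sketch built on the code above, splitting on sentence boundaries and inserting short pauses (the 0.25 s pause length is an arbitrary choice):

# bark_long.py — illustrative long-form helper
import re

def bark_long(text: str, preset: str = "female_1",
              output_path: str = "bark_long.wav") -> np.ndarray:
    history_prompt = VOICE_PRESETS.get(preset, preset)
    silence = np.zeros(int(0.25 * SAMPLE_RATE), dtype=np.float32)
    pieces = []
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        if sentence.strip():
            pieces.append(generate_audio(sentence, history_prompt=history_prompt))
            pieces.append(silence)
    audio = np.concatenate(pieces)
    sf.write(output_path, audio, SAMPLE_RATE)
    return audio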
edge-tts
# edge_tts_impl.py
import asyncio
import os
import tempfile

import edge_tts
import numpy as np
import soundfile as sf

VOICES = {
    "aria": "en-US-AriaNeural",
    "guy": "en-US-GuyNeural",
    "jenny": "en-US-JennyNeural",
    "sonia": "en-GB-SoniaNeural",
    "ryan": "en-GB-RyanNeural",
}

async def synthesize_async(text: str,
                           voice: str = "aria",
                           output_path: str = "edge_out.mp3",
                           rate: str = "+0%",
                           pitch: str = "+0Hz") -> str:
    """
    rate: "+10%" = 10% faster, "-20%" = 20% slower
    pitch: "+5Hz" = higher, "-10Hz" = lower
    """
    voice_id = VOICES.get(voice, voice)
    communicate = edge_tts.Communicate(text, voice_id, rate=rate, pitch=pitch)
    await communicate.save(output_path)
    return output_path

async def synthesize_to_numpy(text: str, voice: str = "aria") -> tuple[np.ndarray, int]:
    """
    Returns (float32 array, sample_rate) by round-tripping through a
    temporary MP3 file (soundfile needs libsndfile >= 1.1 to read MP3).
    """
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        tmp = f.name
    try:
        await synthesize_async(text, voice, tmp)
        audio, sr = sf.read(tmp, dtype="float32")
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # downmix to mono
        return audio, sr
    finally:
        os.unlink(tmp)

def synthesize(text: str, voice: str = "aria", output_path: str = "edge_out.mp3"):
    """Sync wrapper."""
    asyncio.run(synthesize_async(text, voice, output_path))

if __name__ == "__main__":
    synthesize("Hello! This is edge-tts running with no model download.", voice="aria")
    print("Saved edge_out.mp3")
Saving Audio in Multiple Formats
# audio_formats.py
import numpy as np
import soundfile as sf
import subprocess
import tempfile
import os

def save_audio(audio: np.ndarray, sample_rate: int,
               output_path: str, normalize: bool = True):
    """
    Save audio as WAV, MP3, OGG, or FLAC based on the file extension.
    audio: float32 numpy array, values in [-1, 1]
    """
    if normalize and np.abs(audio).max() > 0:
        audio = audio / np.abs(audio).max() * 0.95  # leave 5% headroom
    ext = os.path.splitext(output_path)[1].lower()
    if ext == ".wav":
        sf.write(output_path, audio, sample_rate, subtype="PCM_16")
    elif ext in (".mp3", ".ogg", ".flac"):
        # Write WAV first, then convert with ffmpeg
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_wav = tmp.name
        try:
            sf.write(tmp_wav, audio, sample_rate, subtype="PCM_16")
            ffmpeg_cmd = ["ffmpeg", "-y", "-i", tmp_wav]
            if ext == ".mp3":
                ffmpeg_cmd += ["-codec:a", "libmp3lame", "-qscale:a", "2"]
            elif ext == ".ogg":
                ffmpeg_cmd += ["-codec:a", "libvorbis", "-qscale:a", "5"]
            elif ext == ".flac":
                ffmpeg_cmd += ["-codec:a", "flac"]
            ffmpeg_cmd.append(output_path)
            subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
        finally:
            os.unlink(tmp_wav)
    else:
        raise ValueError(f"Unsupported format: {ext}")
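A quick self-test with a synthetic tone as a stand-in for real TTS output (ffmpeg must be on PATH for the compressed formats):

# One second of A440 at 24 kHz, saved in four formats
audio = np.sin(2 * np.pi * 440 * np.arange(24000) / 24000).astype(np.float32)
for name in ("tone.wav", "tone.mp3", "tone.ogg", "tone.flac"):
    save_audio(audio, 24000, name)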
Resample for ASR Round-Trip
If you need to feed TTS output back into ASR (most ASR models expect 16 kHz):
import numpy as np
import librosa  # pip install librosa

def resample_for_asr(audio: np.ndarray, src_sr: int, target_sr: int = 16000) -> np.ndarray:
    if src_sr == target_sr:
        return audio
    return librosa.resample(audio, orig_sr=src_sr, target_sr=target_sr)

# Example: Kokoro outputs 24kHz, ASR needs 16kHz
tts_audio_24k = np.zeros(24000)  # placeholder
asr_audio_16k = resample_for_asr(tts_audio_24k, 24000, 16000)
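If librosa is too heavy a dependency, SciPy's polyphase resampler does the same job. A sketch, assuming scipy is installed:

from math import gcd
from scipy.signal import resample_poly

def resample_scipy(audio: np.ndarray, src_sr: int, target_sr: int = 16000) -> np.ndarray:
    if src_sr == target_sr:
        return audio
    g = gcd(target_sr, src_sr)
    # Upsample by target_sr/g, low-pass filter, downsample by src_sr/g
    return resample_poly(audio, target_sr // g, src_sr // g).astype(np.float32)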
See Also
- TTS Real-Time Streaming
- TTS Integration Guide
- TTS Troubleshooting
- ASR Implementation — The transcript produced by ASR is the text input to TTS synthesis
- VAD Real-Time Streaming — TTS playback must be interruptible when VAD detects new user speech (barge-in)