A Local Pipeline for AI-Narrated Presentations
Turning PDFs into Video with Kokoro and Ollama
I wanted something that could take my internal updates and research summaries, which usually exist as slides or PDFs, and instantly turn them into narrated videos for asynchronous sharing. Tools like Synthesia or HeyGen can do that, but they're closed-source and online-only.
This project transforms a PDF (ideally one exported from a slide deck, such as PowerPoint) into a narrated video, using a local LLM and TTS engine to generate natural-sounding speech and automatically sync captions with each slide.
How It Works
The script runs a full automated pipeline:
- Extract text from a PDF using PyMuPDF.
- Ask an LLM (served via Ollama) to rewrite the content into spoken narration.
- Use Kokoro Web TTS to generate audio for each narration segment.
- Render each PDF page to an image and match slide durations to the corresponding audio lengths.
- Combine everything with FFmpeg into a clean, normalised video complete with captions.
You can run it in one of three modes:
- --segment-mode pages: one narration segment per page (perfect for slide decks).
- --segment-mode llm: narration breaks wherever the LLM decides.
- --segment-mode auto: timing based on text length per page.
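For example (the script name matches the Local Setup section below; deck.pdf stands in for your own file):

python3 aivideo.py --pdf deck.pdf --segment-mode pages
python3 aivideo.py --pdf deck.pdf --segment-mode llm
python3 aivideo.py --pdf deck.pdf --segment-mode auto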
The result is a fully automated narrated presentation, locally generated end-to-end without any external API calls.
Deploying Kokoro Web TTS
Kokoro Web is a lightweight, high-quality open-source text-to-speech server.
You can run it with Docker using a simple Compose stack:
services:
  kokoro-web:
    image: ghcr.io/eduardolat/kokoro-web:latest
    container_name: kokoro-web
    ports:
      - "45732:3000"
    volumes:
      - kokoro_cache:/kokoro/cache
    restart: unless-stopped

volumes:
  kokoro_cache:
    name: kokoro_cache
    driver: local
Once running, Kokoro listens on the port you define (e.g. http://localhost:45732/api/v1/audio/speech).
You can then point your script's KOKORO_URL to it, and it will generate natural MP3 speech files on demand.
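A quick way to smoke-test the endpoint from the shell, using the same payload shape the script sends (the model and voice names are the defaults from the example .env below):

curl -s http://localhost:45732/api/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{"model": "model_q8f16", "voice": "af_heart", "input": "Hello from Kokoro.", "response_format": "mp3", "speed": 1.0}' \
  -o test.mp3

If everything is wired up correctly, test.mp3 should contain a short spoken sample.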
Local Setup
This project assumes:
- You already have Ollama running locally (for LLM narration).
- FFmpeg is available in your system path.
- You've installed the required Python packages:
pip install requests pymupdf mutagen tqdm
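Before the first run, it's worth a quick sanity check of the external dependencies (ports match the defaults used throughout this post):

ffmpeg -version
curl http://localhost:11434/api/tags

The second command asks Ollama for its locally installed models; the model you configure below should appear in that list.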
Then clone or copy the script and run:
python3 aivideo.py --pdf myfile.pdf --segment-mode pages
It will automatically generate:
- Narration text
- Per-segment audio
- Rendered slides
- Subtitles (.srt)
- Final combined video
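With the default output directory, the build folder ends up looking roughly like this (file names taken from the script below):

build/
  presentation.txt
  audio_segments/
    seg_001.mp3
    ...
  presentation.mp3
  slides/
    slide_01.png
    ...
  slideshow.mp4
  presentation.srt
  presentation_video_final.mp4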
Example .env
PDF_PATH="input.pdf"
OUT_DIR="build"
OLLAMA_URL="http://localhost:11434/api/generate"
OLLAMA_MODEL="llama3.1:8b"
KOKORO_URL="http://localhost:45732/api/v1/audio/speech"
KOKORO_MODEL="model_q8f16"
KOKORO_VOICE="af_heart"
KOKORO_SPEED="1.0"
SLIDE_DPI="180"
FPS="30"
LOUDNORM="true"
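These values act as defaults: real environment variables take precedence over the .env file (the loader only fills in missing keys), and command-line flags override both. For example (report.pdf is a placeholder for your own file):

python3 aivideo.py --pdf report.pdf --segment-mode llm --kokoro-speed 1.1 --force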
Python Script
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import time
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Optional
import fitz
import requests
from mutagen.mp3 import MP3
from requests.adapters import HTTPAdapter, Retry
from tqdm import tqdm
DEFAULT_PDF = Path("input.pdf")
DEFAULT_OUT_DIR = Path("build")
DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434/api/generate"
DEFAULT_OLLAMA_MODEL = "llama3.1:8b"
DEFAULT_KOKORO_URL = "http://127.0.0.1:45732/api/v1/audio/speech"
DEFAULT_KOKORO_MODEL = "model_q8f16"
DEFAULT_KOKORO_VOICE = "af_heart"
DEFAULT_KOKORO_SPEED = 1.0
DEFAULT_DPI = 180
DEFAULT_FPS = 30
DEFAULT_LOUDNORM = True
DEFAULT_SEGMENT_MODE = "pages"
SLIDE_MARKER_CANONICAL = "[SLIDE_BREAK]"
SLIDE_MARKER_REGEX = re.compile(r"\[\s*SLIDE[_\-\s]*BREAK\s*\]", re.IGNORECASE)
CLEAN_MD_REGEX = re.compile(r"[#*`>\\]+")
SENTENCE_PAUSE = "[0.5s]"
SLIDE_PAUSE = "[1s]"
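# Pause tags are injected into the TTS input between sentences and slides, then stripped again when building captions.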
FFMPEG = shutil.which("ffmpeg")
def load_dotenv_if_present(dotenv_path: Path) -> None:
if not dotenv_path.exists():
return
for raw in dotenv_path.read_text(encoding="utf-8").splitlines():
line = raw.strip()
if not line or line.startswith("#") or "=" not in line:
continue
k, v = line.split("=", 1)
k = k.strip()
v = v.strip().strip('"').strip("'")
os.environ.setdefault(k, v)
def guess_latest_pdf(downloads_dir: Path) -> Optional[Path]:
if not downloads_dir.exists():
return None
pdfs = sorted(downloads_dir.glob("*.pdf"), key=lambda p: p.stat().st_mtime, reverse=True)
return pdfs[0] if pdfs else None
def session_with_retries() -> requests.Session:
s = requests.Session()
retries = Retry(
total=5,
backoff_factor=0.6,
status_forcelist=(429, 500, 502, 503, 504),
allowed_methods=frozenset(["GET", "POST"]),
raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=10)
s.mount("http://", adapter)
s.mount("https://", adapter)
return s
HTTP = session_with_retries()
@dataclass
class Config:
pdf_path: Path
out_dir: Path
ollama_url: str
ollama_model: str
kokoro_url: str
kokoro_model: str
kokoro_voice: str
kokoro_speed: float
dpi: int
force: bool
loudnorm: bool
fps: int
segment_mode: str
def die(msg: str, code: int = 1) -> None:
logging.error(msg)
sys.exit(code)
def preflight(cfg: Config) -> None:
if not cfg.pdf_path.exists():
die(f"PDF not found: {cfg.pdf_path}")
if not FFMPEG:
die("ffmpeg not found in PATH.")
try:
HTTP.post(cfg.ollama_url, json={"model": cfg.ollama_model, "prompt": "ping"}, timeout=5)
except Exception as e:
logging.warning(f"Could not reach text generation endpoint at {cfg.ollama_url}: {e}")
try:
HTTP.post(cfg.kokoro_url, json={"model": cfg.kokoro_model, "voice": cfg.kokoro_voice, "input": "ping"}, timeout=5)
except Exception as e:
logging.warning(f"Could not reach TTS endpoint at {cfg.kokoro_url}: {e}")
def normalize_text(s: str) -> str:
s = unicodedata.normalize("NFKC", s)
s = s.replace("\r", "")
s = re.sub(r"\s{3,}", " ", s)
return s.strip()
def ensure_dir(p: Path) -> None:
p.mkdir(parents=True, exist_ok=True)
def extract_text_from_pdf(path: Path) -> str:
text_chunks: List[str] = []
with fitz.open(path) as pdf:
for page in pdf:
text_chunks.append(page.get_text("text"))
text = "\n".join(text_chunks)
return normalize_text(text)
def extract_text_per_page(path: Path) -> List[str]:
pages = []
with fitz.open(path) as pdf:
for page in pdf:
pages.append(normalize_text(page.get_text("text")))
return pages
def build_prompt_llm(raw_text: str) -> str:
    return """
Create a spoken narration script for a short internal update video based on the document below.
Tone: friendly, concise, conversational; confident but plainspoken.
Length target: roughly 3–5 minutes spoken.
Structure:
- Start with a brief greeting like: "Hi everyone, welcome to the latest update."
- Break the script into natural topic sections and insert the exact tag [SLIDE_BREAK] between sections.
- Finish with a simple sign-off (no contact details).
Output rules:
- Output only the final spoken narration with [SLIDE_BREAK] markers. No lists, headings, or meta commentary.
- If the source contains marketing taglines or boilerplate, skip them.
Document:
\"\"\"%s\"\"\"
""" % raw_text
def build_prompt_pages(per_page_texts: List[str]) -> str:
    return """
Turn the slide text into a spoken narration script.
Tone: friendly, concise, conversational; confident but plainspoken.
Length target: overall ~3–5 minutes.
Output EXACTLY %d sections, one per slide, in order.
Separate each section with the exact tag [SLIDE_BREAK].
Start with a brief greeting like: "Hi everyone, welcome to the latest update."
End with a simple sign-off (no contact details).
Skip any marketing taglines or boilerplate if present.
Slides (one block per slide, keep order):
%s
""" % (
len(per_page_texts),
"\n\n--- SLIDE ---\n".join(per_page_texts),
)
def _ollama_stream_text(url: str, model: str, prompt: str) -> str:
payload = {"model": model, "prompt": prompt}
r = HTTP.post(url, json=payload, stream=True, timeout=60)
r.raise_for_status()
buf: List[str] = []
for line in r.iter_lines(decode_unicode=True):
if not line:
continue
try:
data = json.loads(line)
seg = data.get("response", "")
if seg:
buf.append(seg)
except json.JSONDecodeError:
continue
output = "".join(buf)
output = SLIDE_MARKER_REGEX.sub(SLIDE_MARKER_CANONICAL, output)
cleaned = CLEAN_MD_REGEX.sub("", output)
cleaned = normalize_text(cleaned)
return cleaned
def generate_presentation_text_llm(cfg: Config, raw_text: str, out_txt: Path) -> str:
if out_txt.exists() and not cfg.force:
logging.info(f"Reusing existing narration: {out_txt}")
return out_txt.read_text(encoding="utf-8")
prompt = build_prompt_llm(raw_text)
logging.info("Generating narration…")
cleaned = _ollama_stream_text(cfg.ollama_url, cfg.ollama_model, prompt)
if SLIDE_MARKER_CANONICAL not in cleaned:
cleaned = cleaned.replace("\n\n", f"\n\n{SLIDE_MARKER_CANONICAL}\n\n")
out_txt.write_text(cleaned, encoding="utf-8")
return cleaned
def generate_presentation_text_pages(cfg: Config, per_page_texts: List[str], out_txt: Path) -> str:
if out_txt.exists() and not cfg.force:
logging.info(f"Reusing existing narration: {out_txt}")
return out_txt.read_text(encoding="utf-8")
prompt = build_prompt_pages(per_page_texts)
logging.info("Generating narration (page-aware)…")
cleaned = _ollama_stream_text(cfg.ollama_url, cfg.ollama_model, prompt)
if SLIDE_MARKER_CANONICAL not in cleaned:
cleaned = cleaned.replace("\n\n", f"\n\n{SLIDE_MARKER_CANONICAL}\n\n")
out_txt.write_text(cleaned, encoding="utf-8")
return cleaned
def split_segments(narration: str) -> List[str]:
parts = [s.strip() for s in re.split(SLIDE_MARKER_REGEX, narration) if s.strip()]
return parts
def split_sentences(s: str) -> List[str]:
    parts = re.split(r'(?<=[.!?])\s+(?!\[[0-9])', s.strip())
return [p for p in parts if p.strip()]
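# Split or merge narration segments until the count matches the number of slides (used by --segment-mode pages).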
def enforce_segment_count(segments: List[str], target: int) -> List[str]:
segs = [s.strip() for s in segments if s.strip()]
while len(segs) < target and segs:
idx = max(range(len(segs)), key=lambda i: len(segs[i]))
sents = split_sentences(segs[idx])
if len(sents) <= 1:
mid = len(segs[idx]) // 2
sents = [segs[idx][:mid].strip(), segs[idx][mid:].strip()]
needed_new_segments = target - (len(segs) - 1)
needed_new_segments = max(2, min(needed_new_segments, len(sents)))
chunk_sizes = []
base = len(sents) // needed_new_segments
rem = len(sents) % needed_new_segments
for i in range(needed_new_segments):
chunk_sizes.append(base + (1 if i < rem else 0))
new_pieces = []
cursor = 0
for cs in chunk_sizes:
new_pieces.append(" ".join(sents[cursor:cursor+cs]).strip())
cursor += cs
segs = segs[:idx] + new_pieces + segs[idx+1:]
while len(segs) > target and len(segs) >= 2:
pair_idx = min(range(len(segs)-1), key=lambda i: len(segs[i]) + len(segs[i+1]))
merged = (segs[pair_idx] + " " + segs[pair_idx+1]).strip()
segs = segs[:pair_idx] + [merged] + segs[pair_idx+2:]
return segs
def prepare_segment_for_tts(segment: str) -> str:
segment = re.sub(r"([.!?])\s+", rf"\1{SENTENCE_PAUSE} ", segment)
return segment.strip()
def tts_segment(cfg: Config, text: str, out_mp3: Path) -> None:
if out_mp3.exists() and out_mp3.stat().st_size > 0:
return
payload = {
"model": cfg.kokoro_model,
"voice": cfg.kokoro_voice,
"input": text,
"response_format": "mp3",
"speed": cfg.kokoro_speed
}
r = HTTP.post(cfg.kokoro_url, json=payload, stream=True, timeout=120)
r.raise_for_status()
with open(out_mp3, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
def generate_tts_for_segments(cfg: Config, segments: List[str], audio_dir: Path) -> List[Path]:
ensure_dir(audio_dir)
out_files: List[Path] = []
for i, seg in enumerate(tqdm(segments, desc="TTS segments")):
prepped = prepare_segment_for_tts(seg if i == 0 else SLIDE_PAUSE + " " + seg)
mp3 = audio_dir / f"seg_{i+1:03d}.mp3"
tts_segment(cfg, prepped, mp3)
out_files.append(mp3)
return out_files
def concat_audio(mp3_files: List[Path], out_mp3: Path) -> None:
if out_mp3.exists() and out_mp3.stat().st_size > 0:
return
lst = out_mp3.parent / "audio_list.txt"
with open(lst, "w", encoding="utf-8") as f:
for p in mp3_files:
f.write(f"file '{p.resolve()}'\n")
subprocess.run(
[FFMPEG, "-y", "-f", "concat", "-safe", "0", "-i", str(lst), "-c", "copy", str(out_mp3)],
check=True
)
def pdf_to_images(pdf_path: Path, output_dir: Path, dpi: int) -> List[Path]:
ensure_dir(output_dir)
out: List[Path] = []
with fitz.open(pdf_path) as doc:
for i, page in enumerate(tqdm(doc, desc="Rendering slides")):
scale = dpi / 72.0
mat = fitz.Matrix(scale, scale)
pix = page.get_pixmap(matrix=mat, alpha=False)
img_path = output_dir / f"slide_{i+1:02d}.png"
pix.save(str(img_path))
out.append(img_path)
return out
def get_durations_from_segments(seg_audio: List[Path]) -> List[float]:
durs = []
for p in seg_audio:
audio = MP3(str(p))
durs.append(float(audio.info.length))
return durs
def build_slideshow(
slide_images: List[Path],
seg_durations: List[float],
out_mp4: Path,
fps: int
) -> None:
ensure_dir(out_mp4.parent)
img_seq = list(slide_images)
while len(img_seq) < len(seg_durations):
img_seq.append(img_seq[-1])
ffconcat = out_mp4.parent / "slides.ffconcat"
with open(ffconcat, "w", encoding="utf-8") as f:
f.write("ffconcat version 1.0\n")
for img, dur in zip(img_seq[:len(seg_durations)], seg_durations):
f.write(f"file '{img.resolve()}'\n")
f.write(f"duration {max(0.8, dur):.3f}\n")
f.write(f"file '{img_seq[len(seg_durations)-1].resolve()}'\n")
vf = (
"scale=1280:720:force_original_aspect_ratio=decrease,"
"pad=1280:720:(ow-iw)/2:(oh-ih)/2:color=white,"
f"fps={fps},"
"format=yuv420p"
)
cmd_cfr = [
FFMPEG, "-y",
"-f", "concat", "-safe", "0",
"-i", str(ffconcat),
"-fps_mode", "cfr",
"-vf", vf,
"-pix_fmt", "yuv420p",
str(out_mp4),
]
cmd_vfr = [
FFMPEG, "-y",
"-f", "concat", "-safe", "0",
"-i", str(ffconcat),
"-fps_mode", "vfr",
"-vf", vf.replace(f"fps={fps},", ""),
"-pix_fmt", "yuv420p",
str(out_mp4),
]
try:
subprocess.run(cmd_cfr, check=True)
except subprocess.CalledProcessError:
logging.warning("CFR render failed; retrying with VFR fallback…")
subprocess.run(cmd_vfr, check=True)
def build_srt(segments: List[str], durations: List[float], out_srt: Path) -> None:
def fmt_time(t: float) -> str:
ms = int(round(t * 1000))
h, ms = divmod(ms, 3600_000)
m, ms = divmod(ms, 60_000)
s, ms = divmod(ms, 1000)
return f"{h:02}:{m:02}:{s:02},{ms:03}"
cur = 0.0
lines = []
for i, (seg, dur) in enumerate(zip(segments, durations), start=1):
start = fmt_time(cur)
end = fmt_time(cur + dur)
cur += dur
txt = re.sub(r"\[\d+(\.\d+)?s\]", "", seg).strip()
lines.append(f"{i}\n{start} --> {end}\n{txt}\n")
out_srt.write_text("\n".join(lines), encoding="utf-8")
def mux_video_audio(
video_mp4: Path,
audio_mp3: Path,
final_mp4: Path,
loudnorm: bool
) -> None:
args = [
FFMPEG, "-y",
"-i", str(video_mp4),
"-i", str(audio_mp3),
"-c:v", "libx264",
"-pix_fmt", "yuv420p",
"-c:a", "aac",
"-shortest"
]
if loudnorm:
args.extend(["-af", "loudnorm=I=-16:TP=-1.5:LRA=11"])
args.append(str(final_mp4))
subprocess.run(args, check=True)
def token_weight(s: str) -> int:
return max(1, len(re.findall(r"\w+", s)))
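# In --segment-mode auto, spread the total narration time across pages in proportion to their word counts.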
def durations_by_page_token_weight(page_texts: List[str], total_audio_sec: float) -> List[float]:
weights = [token_weight(t) for t in page_texts]
total = sum(weights)
if total <= 0:
return [max(0.8, total_audio_sec / max(1, len(page_texts)))] * len(page_texts)
base = [total_audio_sec * w / total for w in weights]
return [max(0.8, d) for d in base]
def run(cfg: Config) -> None:
preflight(cfg)
ensure_dir(cfg.out_dir)
narration_txt = cfg.out_dir / "presentation.txt"
audio_dir = cfg.out_dir / "audio_segments"
combined_mp3 = cfg.out_dir / "presentation.mp3"
slides_dir = cfg.out_dir / "slides"
slideshow_mp4 = cfg.out_dir / "slideshow.mp4"
srt_file = cfg.out_dir / "presentation.srt"
final_video = cfg.out_dir / "presentation_video_final.mp4"
if cfg.segment_mode == "pages":
logging.info("Extracting text per page…")
page_texts = extract_text_per_page(cfg.pdf_path)
num_pages = len(page_texts)
logging.info(f"PDF has {num_pages} pages")
logging.info("Generating narration with page-aligned markers…")
narration = generate_presentation_text_pages(cfg, page_texts, narration_txt)
segments = split_segments(narration)
segments = enforce_segment_count(segments, num_pages)
logging.info(f"Using {len(segments)} page-aligned narration segments")
logging.info("Generating speech (per segment)…")
seg_audio = generate_tts_for_segments(cfg, segments, audio_dir)
logging.info("Concatenating audio segments…")
concat_audio(seg_audio, combined_mp3)
logging.info("Rendering slides from PDF…")
slide_imgs = pdf_to_images(cfg.pdf_path, slides_dir, cfg.dpi)
logging.info("Measuring segment durations from audio…")
durations = get_durations_from_segments(seg_audio)
logging.info("Building slideshow video (page-aligned)…")
build_slideshow(slide_imgs, durations, slideshow_mp4, cfg.fps)
logging.info("Creating captions…")
build_srt(segments, durations, srt_file)
elif cfg.segment_mode == "auto":
logging.info("Extracting document text and per-page text…")
raw_text = extract_text_from_pdf(cfg.pdf_path)
page_texts = extract_text_per_page(cfg.pdf_path)
logging.info("Generating narration…")
narration = generate_presentation_text_llm(cfg, raw_text, narration_txt)
segments = split_segments(narration)
logging.info(f"Detected {len(segments)} narration segments")
logging.info("Generating speech (per segment)…")
seg_audio = generate_tts_for_segments(cfg, segments, audio_dir)
logging.info("Concatenating audio segments…")
concat_audio(seg_audio, combined_mp3)
logging.info("Rendering slides from PDF…")
slide_imgs = pdf_to_images(cfg.pdf_path, slides_dir, cfg.dpi)
logging.info("Measuring segment durations from audio…")
durations = get_durations_from_segments(seg_audio)
total_audio = sum(durations)
logging.info(f"Total audio length: {total_audio:.2f}s")
logging.info("Computing token-weighted durations per page…")
page_durs = durations_by_page_token_weight(page_texts, total_audio)
logging.info("Building slideshow video (auto-timed)…")
build_slideshow(slide_imgs, page_durs, slideshow_mp4, cfg.fps)
logging.info("Creating captions…")
build_srt(segments, durations, srt_file)
elif cfg.segment_mode == "llm":
logging.info("Extracting document text…")
raw_text = extract_text_from_pdf(cfg.pdf_path)
logging.info("Generating narration with markers…")
narration = generate_presentation_text_llm(cfg, raw_text, narration_txt)
segments = split_segments(narration)
logging.info(f"Detected {len(segments)} narration segments")
logging.info("Generating speech (per segment)…")
seg_audio = generate_tts_for_segments(cfg, segments, audio_dir)
logging.info("Concatenating audio segments…")
concat_audio(seg_audio, combined_mp3)
logging.info("Rendering slides from PDF…")
slide_imgs = pdf_to_images(cfg.pdf_path, slides_dir, cfg.dpi)
logging.info("Measuring segment durations from audio…")
durations = get_durations_from_segments(seg_audio)
logging.info("Building slideshow video (segment-timed)…")
build_slideshow(slide_imgs, durations, slideshow_mp4, cfg.fps)
logging.info("Creating captions…")
build_srt(segments, durations, srt_file)
else:
die(f"Unknown --segment-mode '{cfg.segment_mode}'. Use pages|auto|llm")
logging.info("Muxing final video + audio…")
mux_video_audio(slideshow_mp4, combined_mp3, final_video, cfg.loudnorm)
logging.info(f"Done: {final_video}")
logging.info(f"Narration: {narration_txt}")
logging.info(f"Audio: {combined_mp3}")
logging.info(f"Captions: {srt_file}")
def parse_args() -> Config:
load_dotenv_if_present(Path(".env"))
p = argparse.ArgumentParser(prog="aivideo", description="Build narrated video from a PDF")
p.add_argument("--pdf", type=Path, default=None, help="Path to source PDF")
p.add_argument("--out", type=Path, default=None, help="Output directory")
p.add_argument("--ollama-url", default=None)
p.add_argument("--ollama-model", default=None)
p.add_argument("--kokoro-url", default=None)
p.add_argument("--kokoro-model", default=None)
p.add_argument("--kokoro-voice", default=None)
p.add_argument("--kokoro-speed", type=float, default=None)
p.add_argument("--dpi", type=int, default=None, help="Slide render DPI")
p.add_argument("--fps", type=int, default=None)
p.add_argument("--force", action="store_true", help="Regenerate all artifacts")
p.add_argument("--segment-mode", choices=["pages", "auto", "llm"], default=None,
help="Alignment strategy")
try:
bool_action = argparse.BooleanOptionalAction # type: ignore[attr-defined]
except AttributeError:
bool_action = None
if bool_action:
p.add_argument("--loudnorm", action=bool_action, default=None, help="Apply EBU R128 loudness normalization")
else:
p.add_argument("--loudnorm", action="store_true", help="Apply EBU R128 loudness normalization")
args = p.parse_args()
pdf = args.pdf or (Path(os.getenv("PDF_PATH")) if os.getenv("PDF_PATH") else None) or DEFAULT_PDF
if pdf is None or not pdf.exists():
candidate = guess_latest_pdf(Path.home() / "Downloads")
if candidate and (pdf is None or not Path(pdf).exists()):
logging.info(f"No valid --pdf provided; using newest PDF in Downloads: {candidate}")
pdf = candidate
if pdf is None or not pdf.exists():
die("Missing --pdf (and none found via PDF_PATH/.env/defaults/Downloads).")
    out_dir = args.out or (Path(os.getenv("OUT_DIR")) if os.getenv("OUT_DIR") else None) or DEFAULT_OUT_DIR
ollama_url = args.ollama_url or os.getenv("OLLAMA_URL") or DEFAULT_OLLAMA_URL
ollama_model = args.ollama_model or os.getenv("OLLAMA_MODEL") or DEFAULT_OLLAMA_MODEL
kokoro_url = args.kokoro_url or os.getenv("KOKORO_URL") or DEFAULT_KOKORO_URL
kokoro_model = args.kokoro_model or os.getenv("KOKORO_MODEL") or DEFAULT_KOKORO_MODEL
kokoro_voice = args.kokoro_voice or os.getenv("KOKORO_VOICE") or DEFAULT_KOKORO_VOICE
def _float_env(key: str, default: float) -> float:
val = os.getenv(key)
if val is None:
return default
try:
return float(val)
except ValueError:
return default
def _int_env(key: str, default: int) -> int:
val = os.getenv(key)
if val is None:
return default
try:
return int(val)
except ValueError:
return default
kokoro_speed = args.kokoro_speed if args.kokoro_speed is not None else _float_env("KOKORO_SPEED", DEFAULT_KOKORO_SPEED)
dpi = args.dpi if args.dpi is not None else _int_env("SLIDE_DPI", DEFAULT_DPI)
fps = args.fps if args.fps is not None else _int_env("FPS", DEFAULT_FPS)
if hasattr(args, "loudnorm") and args.loudnorm is not None:
loudnorm = bool(args.loudnorm)
else:
env_loudnorm = os.getenv("LOUDNORM")
if env_loudnorm is not None:
loudnorm = env_loudnorm.strip().lower() in ("1", "true", "yes", "on")
else:
loudnorm = DEFAULT_LOUDNORM
segment_mode = args.segment_mode or os.getenv("SEGMENT_MODE", DEFAULT_SEGMENT_MODE).lower()
if segment_mode not in ("pages", "auto", "llm"):
segment_mode = DEFAULT_SEGMENT_MODE
return Config(
pdf_path=pdf,
out_dir=out_dir,
ollama_url=ollama_url,
ollama_model=ollama_model,
kokoro_url=kokoro_url,
kokoro_model=kokoro_model,
kokoro_voice=kokoro_voice,
kokoro_speed=kokoro_speed,
dpi=dpi,
force=args.force,
loudnorm=loudnorm,
fps=fps,
segment_mode=segment_mode,
)
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
datefmt="%H:%M:%S"
)
try:
cfg = parse_args()
run(cfg)
except subprocess.CalledProcessError as e:
die(f"Subprocess failed: {e}")
except requests.RequestException as e:
die(f"Network error: {e}")
except Exception as e:
die(f"Unexpected error: {e}")
From here, you can fine-tune the tone, swap out voices, or replace the LLM and TTS layers with your own. It's flexible by design, and because everything runs locally, it can be very fast once set up, depending on your hardware.
Turning static documents into polished, narrated videos using only open tools, all locally, is neat.