A Local Pipeline for AI-Narrated Presentations
Turning PDFs into Video with Kokoro and Ollama
I wanted something that could take my internal updates and research summaries, which usually exist as slides or PDFs, and instantly turn them into narrated videos for asynchronous sharing. Tools like Synthesia or HeyGen can do that, but they're closed-source and online-only.
This project transforms a PDF (ideally one exported from a slide deck, such as PowerPoint) into a narrated video, using a local LLM and TTS engine to generate natural-sounding speech and automatically sync captions with each slide.
How It Works
The script runs a full automated pipeline:
- Extract text from a PDF using PyMuPDF.
- Ask an LLM (served via Ollama) to rewrite the content into spoken narration.
- Use Kokoro Web TTS to generate audio for each narration segment.
- Render each PDF page to an image and match slide durations to the corresponding audio lengths.
- Combine everything with FFmpeg into a clean, normalised video complete with captions.
You can run it in one of three modes:
- --segment-mode pages: one narration segment per page (perfect for slide decks).
- --segment-mode llm: narration breaks wherever the LLM decides.
- --segment-mode auto: timing based on text length per page.
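For example (the script name matches the Local Setup section below; deck.pdf stands in for your own file):

python3 aivideo.py --pdf deck.pdf --segment-mode pages
python3 aivideo.py --pdf deck.pdf --segment-mode llm
python3 aivideo.py --pdf deck.pdf --segment-mode auto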
The result is a fully automated narrated presentation, locally generated end-to-end without any external API calls.
Deploying Kokoro Web TTS
Kokoro Web is a lightweight, high-quality open-source text-to-speech server.
You can run it with Docker using a simple Compose stack:
services:
  kokoro-web:
    image: ghcr.io/eduardolat/kokoro-web:latest
    container_name: kokoro-web
    ports:
      - "45732:3000"
    volumes:
      - kokoro_cache:/kokoro/cache
    restart: unless-stopped

volumes:
  kokoro_cache:
    name: kokoro_cache
    driver: local
Once running, Kokoro listens on the port you define (e.g. http://localhost:45732/api/v1/audio/speech).
You can then point your script's KOKORO_URL to it, and it will generate natural MP3 speech files on demand.
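A quick way to smoke-test the endpoint from the shell, using the same payload shape the script sends (the model and voice names are the defaults from the example .env below):

curl -s http://localhost:45732/api/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{"model": "model_q8f16", "voice": "af_heart", "input": "Hello from Kokoro.", "response_format": "mp3", "speed": 1.0}' \
  -o test.mp3

If everything is wired up correctly, test.mp3 should contain a short spoken sample.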
Local Setup
This project assumes:
- You already have Ollama running locally (for LLM narration).
- FFmpeg is available in your system path.
- You've installed the required Python packages:
pip install requests pymupdf mutagen tqdm
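Before the first run, it's worth a quick sanity check of the external dependencies (ports match the defaults used throughout this post):

ffmpeg -version
curl http://localhost:11434/api/tags

The second command asks Ollama for its locally installed models; the model you configure below should appear in that list.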
Then clone or copy the script and run:
python3 aivideo.py --pdf myfile.pdf --segment-mode pages
It will automatically generate:
- Narration text
- Per-segment audio
- Rendered slides
- Subtitles (.srt)
- Final combined video
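With the default output directory, the build folder ends up looking roughly like this (file names taken from the script below):

build/
  presentation.txt
  audio_segments/
    seg_001.mp3
    ...
  presentation.mp3
  slides/
    slide_01.png
    ...
  slideshow.mp4
  presentation.srt
  presentation_video_final.mp4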
Example .env
PDF_PATH="input.pdf"
OUT_DIR="build"
OLLAMA_URL="http://localhost:11434/api/generate"
OLLAMA_MODEL="llama3.1:8b"
KOKORO_URL="http://localhost:45732/api/v1/audio/speech"
KOKORO_MODEL="model_q8f16"
KOKORO_VOICE="af_heart"
KOKORO_SPEED="1.0"
SLIDE_DPI="180"
FPS="30"
LOUDNORM="true"
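These values act as defaults: real environment variables take precedence over the .env file (the loader only fills in missing keys), and command-line flags override both. For example (report.pdf is a placeholder for your own file):

python3 aivideo.py --pdf report.pdf --segment-mode llm --kokoro-speed 1.1 --force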
Python Script
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import time
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Optional
import fitz
import requests
from mutagen.mp3 import MP3
from requests.adapters import HTTPAdapter, Retry
from tqdm import tqdm
DEFAULT_PDF = Path("input.pdf")
DEFAULT_OUT_DIR = Path("build")
DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434/api/generate"
DEFAULT_OLLAMA_MODEL = "llama3.1:8b"
DEFAULT_KOKORO_URL = "http://127.0.0.1:45732/api/v1/audio/speech"
DEFAULT_KOKORO_MODEL = "model_q8f16"
DEFAULT_KOKORO_VOICE = "af_heart"
DEFAULT_KOKORO_SPEED = 1.0
DEFAULT_DPI = 180
DEFAULT_FPS = 30
DEFAULT_LOUDNORM = True
DEFAULT_SEGMENT_MODE = "pages"
SLIDE_MARKER_CANONICAL = "[SLIDE_BREAK]"
SLIDE_MARKER_REGEX = re.compile(r"\[\s*SLIDE[_\-\s]*BREAK\s*\]", re.IGNORECASE)
CLEAN_MD_REGEX = re.compile(r"[#*`>\\]+")
SENTENCE_PAUSE = "[0.5s]"
SLIDE_PAUSE = "[1s]"
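# Pause tags are injected into the TTS input between sentences and slides, then stripped again when building captions.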
FFMPEG = shutil.which("ffmpeg")
def load_dotenv_if_present(dotenv_path: Path) -> None:
if not dotenv_path.exists():
return
for raw in dotenv_path.read_text(encoding="utf-8").splitlines():
line = raw.strip()
if not line or line.startswith("#") or "=" not in line:
continue
k, v = line.split("=", 1)
k = k.strip()
v = v.strip().strip('"').strip("'")
os.environ.setdefault(k, v)
def guess_latest_pdf(downloads_dir: Path) -> Optional[Path]:
if not downloads_dir.exists():
return None
pdfs = sorted(downloads_dir.glob("*.pdf"), key=lambda p: p.stat().st_mtime, reverse=True)
return pdfs[0] if pdfs else None
def session_with_retries() -> requests.Session:
s = requests.Session()
retries = Retry(
total=5,
backoff_factor=0.6,
status_forcelist=(429, 500, 502, 503, 504),
allowed_methods=frozenset(["GET", "POST"]),
raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retries, pool_connections=10, pool_maxsize=10)
s.mount("http://", adapter)
s.mount("https://", adapter)
return s
HTTP = session_with_retries()
@dataclass
class Config:
pdf_path: Path
out_dir: Path
ollama_url: str
ollama_model: str
kokoro_url: str
kokoro_model: str
kokoro_voice: str
kokoro_speed: float
dpi: int
force: bool
loudnorm: bool
fps: int
segment_mode: str
def die(msg: str, code: int = 1) -> None:
logging.error(msg)
sys.exit(code)
def preflight(cfg: Config) -> None:
if not cfg.pdf_path.exists():
die(f"PDF not found: {cfg.pdf_path}")
if not FFMPEG:
die("ffmpeg not found in PATH.")
try:
HTTP.post(cfg.ollama_url, json={"model": cfg.ollama_model, "prompt": "ping"}, timeout=5)
except Exception as e:
logging.warning(f"Could not reach text generation endpoint at {cfg.ollama_url}: {e}")
try:
HTTP.post(cfg.kokoro_url, json={"model": cfg.kokoro_model, "voice": cfg.kokoro_voice, "input": "ping"}, timeout=5)
except Exception as e:
logging.warning(f"Could not reach TTS endpoint at {cfg.kokoro_url}: {e}")
def normalize_text(s: str) -> str:
s = unicodedata.normalize("NFKC", s)
s = s.replace("\r", "")
s = re.sub(r"\s{3,}", " ", s)
return s.strip()
def ensure_dir(p: Path) -> None:
p.mkdir(parents=True, exist_ok=True)
def extract_text_from_pdf(path: Path) -> str:
text_chunks: List[str] = []
with fitz.open(path) as pdf:
for page in pdf:
text_chunks.append(page.get_text("text"))
text = "\n".join(text_chunks)
return normalize_text(text)
def extract_text_per_page(path: Path) -> List[str]:
pages = []
with fitz.open(path) as pdf:
for page in pdf:
pages.append(normalize_text(page.get_text("text")))
return pages
def build_prompt_llm(raw_text: str) -> str:
    return """
Create a spoken narration script for a short internal update video based on the document below.
Tone: friendly, concise, conversational; confident but plainspoken.
Length target: roughly 3–5 minutes spoken.
Structure:
- Start with a brief greeting like: "Hi everyone, welcome to the latest update."
- Break the script into natural topic sections and insert the exact tag [SLIDE_BREAK] between sections.
- Finish with a simple sign-off (no contact details).
Output rules:
- Output only the final spoken narration with [SLIDE_BREAK] markers. No lists, headings, or meta commentary.
- If the source contains marketing taglines or boilerplate, skip them.
Document:
\"\"\"%s\"\"\"
""" % raw_text
def build_prompt_pages(per_page_texts: List[str]) -> str:
    return """
Turn the slide text into a spoken narration script.
Tone: friendly, concise, conversational; confident but plainspoken.
Length target: overall ~3–5 minutes.
Output EXACTLY %d sections, one per slide, in order.
Separate each section with the exact tag [SLIDE_BREAK].
Start with a brief greeting like: "Hi everyone, welcome to the latest update."
End with a simple sign-off (no contact details).
Skip any marketing taglines or boilerplate if present.
Slides (one block per slide, keep order):
%s
""" % (
len(per_page_texts),
"\n\n--- SLIDE ---\n".join(per_page_texts),
)
def _ollama_stream_text(url: str, model: str, prompt: str) -> str:
payload = {"model": model, "prompt": prompt}
r = HTTP.post(url, json=payload, stream=True, timeout=60)
r.raise_for_status()
buf: List[str] = []
for line in r.iter_lines(decode_unicode=True):
if not line:
continue
try:
data = json.loads(line)
seg = data.get("response", "")
if seg:
buf.append(seg)
except json.JSONDecodeError:
continue
output = "".join(buf)
output = SLIDE_MARKER_REGEX.sub(SLIDE_MARKER_CANONICAL, output)
cleaned = CLEAN_MD_REGEX.sub("", output)
cleaned = normalize_text(cleaned)
return cleaned
def generate_presentation_text_llm(cfg: Config, raw_text: str, out_txt: Path) -> str:
if out_txt.exists() and not cfg.force:
logging.info(f"Reusing existing narration: {out_txt}")
return out_txt.read_text(encoding="utf-8")
prompt = build_prompt_llm(raw_text)
logging.info("Generating narration…")
cleaned = _ollama_stream_text(cfg.ollama_url, cfg.ollama_model, prompt)
if SLIDE_MARKER_CANONICAL not in cleaned:
cleaned = cleaned.replace("\n\n", f"\n\n{SLIDE_MARKER_CANONICAL}\n\n")
out_txt.write_text(cleaned, encoding="utf-8")
return cleaned
def generate_presentation_text_pages(cfg: Config, per_page_texts: List[str], out_txt: Path) -> str:
if out_txt.exists() and not cfg.force:
logging.info(f"Reusing existing narration: {out_txt}")
return out_txt.read_text(encoding="utf-8")
prompt = build_prompt_pages(per_page_texts)
logging.info("Generating narration (page-aware)…")
cleaned = _ollama_stream_text(cfg.ollama_url, cfg.ollama_model, prompt)
if SLIDE_MARKER_CANONICAL not in cleaned:
cleaned = cleaned.replace("\n\n", f"\n\n{SLIDE_MARKER_CANONICAL}\n\n")
out_txt.write_text(cleaned, encoding="utf-8")
return cleaned
def split_segments(narration: str) -> List[str]:
parts = [s.strip() for s in re.split(SLIDE_MARKER_REGEX, narration) if s.strip()]
return parts
def split_sentences(s: str) -> List[str]:
    parts = re.split(r'(?<=[.!?])\s+(?!\[[0-9])', s.strip())
return [p for p in parts if p.strip()]
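# Split or merge narration segments until the count matches the number of slides (used by --segment-mode pages).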
def enforce_segment_count(segments: List[str], target: int) -> List[str]:
segs = [s.strip() for s in segments if s.strip()]
while len(segs) < target and segs:
idx = max(range(len(segs)), key=lambda i: len(segs[i]))
sents = split_sentences(segs[idx])
if len(sents) <= 1:
mid = len(segs[idx]) // 2
sents = [segs[idx][:mid].strip(), segs[idx][mid:].strip()]
needed_new_segments = target - (len(segs) - 1)
needed_new_segments = max(2, min(needed_new_segments, len(sents)))
chunk_sizes = []
base = len(sents) // needed_new_segments
rem = len(sents) % needed_new_segments
for i in range(needed_new_segments):
chunk_sizes.append(base + (1 if i < rem else 0))
new_pieces = []
cursor = 0
for cs in chunk_sizes:
new_pieces.append(" ".join(sents[cursor:cursor+cs]).strip())
cursor += cs
segs = segs[:idx] + new_pieces + segs[idx+1:]
while len(segs) > target and len(segs) >= 2:
pair_idx = min(range(len(segs)-1), key=lambda i: len(segs[i]) + len(segs[i+1]))
merged = (segs[pair_idx] + " " + segs[pair_idx+1]).strip()
segs = segs[:pair_idx] + [merged] + segs[pair_idx+2:]
return segs
def prepare_segment_for_tts(segment: str) -> str:
segment = re.sub(r"([.!?])\s+", rf"\1{SENTENCE_PAUSE} ", segment)
return segment.strip()
def tts_segment(cfg: Config, text: str, out_mp3: Path) -> None:
if out_mp3.exists() and out_mp3.stat().st_size > 0:
return
payload = {
"model": cfg.kokoro_model,
"voice": cfg.kokoro_voice,
"input": text,
"response_format": "mp3",
"speed": cfg.kokoro_speed
}
r = HTTP.post(cfg.kokoro_url, json=payload, stream=True, timeout=120)
r.raise_for_status()
with open(out_mp3, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
def generate_tts_for_segments(cfg: Config, segments: List[str], audio_dir: Path) -> List[Path]:
ensure_dir(audio_dir)
out_files: List[Path] = []
for i, seg in enumerate(tqdm(segments, desc="TTS segments")):
prepped = prepare_segment_for_tts(seg if i == 0 else SLIDE_PAUSE + " " + seg)
mp3 = audio_dir / f"seg_{i+1:03d}.mp3"
tts_segment(cfg, prepped, mp3)
out_files.append(mp3)
return out_files
def concat_audio(mp3_files: List[Path], out_mp3: Path) -> None:
if out_mp3.exists() and out_mp3.stat().st_size > 0:
return
lst = out_mp3.parent / "audio_list.txt"
with open(lst, "w", encoding="utf-8") as f:
for p in mp3_files:
f.write(f"file '{p.resolve()}'\n")
subprocess.run(
[FFMPEG, "-y", "-f", "concat", "-safe", "0", "-i", str(lst), "-c", "copy", str(out_mp3)],
check=True
)
def pdf_to_images(pdf_path: Path, output_dir: Path, dpi: int) -> List[Path]:
ensure_dir(output_dir)
out: List[Path] = []
with fitz.open(pdf_path) as doc:
for i, page in enumerate(tqdm(doc, desc="Rendering slides")):
scale = dpi / 72.0
mat = fitz.Matrix(scale, scale)
pix = page.get_pixmap(matrix=mat, alpha=False)
img_path = output_dir / f"slide_{i+1:02d}.png"
pix.save(str(img_path))
out.append(img_path)
return out
def get_durations_from_segments(seg_audio: List[Path]) -> List[float]:
durs = []
for p in seg_audio:
audio = MP3(str(p))
durs.append(float(audio.info.length))
return durs
def build_slideshow(
slide_images: List[Path],
seg_durations: List[float],
out_mp4: Path,
fps: int
) -> None:
ensure_dir(out_mp4.parent)
img_seq = list(slide_images)
while len(img_seq) < len(seg_durations):
img_seq.append(img_seq[-1])
ffconcat = out_mp4.parent / "slides.ffconcat"
with open(ffconcat, "w", encoding="utf-8") as f:
f.write("ffconcat version 1.0\n")
for img, dur in zip(img_seq[:len(seg_durations)], seg_durations):
f.write(f"file '{img.resolve()}'\n")
f.write(f"duration {max(0.8, dur):.3f}\n")
f.write(f"file '{img_seq[len(seg_durations)-1].resolve()}'\n")
vf = (
"scale=1280:720:force_original_aspect_ratio=decrease,"
"pad=1280:720:(ow-iw)/2:(oh-ih)/2:color=white,"
f"fps={fps},"
"format=yuv420p"
)
cmd_cfr = [
FFMPEG, "-y",
"-f", "concat", "-safe", "0",
"-i", str(ffconcat),
"-fps_mode", "cfr",
"-vf", vf,
"-pix_fmt", "yuv420p",
str(out_mp4),
]
cmd_vfr = [
FFMPEG, "-y",
"-f", "concat", "-safe", "0",
"-i", str(ffconcat),
"-fps_mode", "vfr",
"-vf", vf.replace(f"fps={fps},", ""),
"-pix_fmt", "yuv420p",
str(out_mp4),
]
try:
subprocess.run(cmd_cfr, check=True)
except subprocess.CalledProcessError:
logging.warning("CFR render failed; retrying with VFR fallback…")
subprocess.run(cmd_vfr, check=True)
def build_srt(segments: List[str], durations: List[float], out_srt: Path) -> None:
def fmt_time(t: float) -> str:
ms = int(round(t * 1000))
h, ms = divmod(ms, 3600_000)
m, ms = divmod(ms, 60_000)
s, ms = divmod(ms, 1000)
return f"{h:02}:{m:02}:{s:02},{ms:03}"
cur = 0.0
lines = []
for i, (seg, dur) in enumerate(zip(segments, durations), start=1):
start = fmt_time(cur)
end = fmt_time(cur + dur)
cur += dur
txt = re.sub(r"\[\d+(\.\d+)?s\]", "", seg).strip()
lines.append(f"{i}\n{start} --> {end}\n{txt}\n")
out_srt.write_text("\n".join(lines), encoding="utf-8")
def mux_video_audio(
video_mp4: Path,
audio_mp3: Path,
final_mp4: Path,
loudnorm: bool
) -> None:
args = [
FFMPEG, "-y",
"-i", str(video_mp4),
"-i", str(audio_mp3),
"-c:v", "libx264",
"-pix_fmt", "yuv420p",
"-c:a", "aac",
"-shortest"
]
if loudnorm:
args.extend(["-af", "loudnorm=I=-16:TP=-1.5:LRA=11"])
args.append(str(final_mp4))
subprocess.run(args, check=True)
def token_weight(s: str) -> int:
return max(1, len(re.findall(r"\w+", s)))
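# In --segment-mode auto, spread the total narration time across pages in proportion to their word counts.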
def durations_by_page_token_weight(page_texts: List[str], total_audio_sec: float) -> List[float]:
weights = [token_weight(t) for t in page_texts]
total = sum(weights)
if total <= 0:
return [max(0.8, total_audio_sec / max(1, len(page_texts)))] * len(page_texts)
base = [total_audio_sec * w / total for w in weights]
return [max(0.8, d) for d in base]
def run(cfg: Config) -> None:
preflight(cfg)
ensure_dir(cfg.out_dir)
narration_txt = cfg.out_dir / "presentation.txt"
audio_dir = cfg.out_dir / "audio_segments"
combined_mp3 = cfg.out_dir / "presentation.mp3"
slides_dir = cfg.out_dir / "slides"
slideshow_mp4 = cfg.out_dir / "slideshow.mp4"
srt_file = cfg.out_dir / "presentation.srt"
final_video = cfg.out_dir / "presentation_video_final.mp4"
if cfg.segment_mode == "pages":
logging.info("Extracting text per page…")
page_texts = extract_text_per_page(cfg.pdf_path)
num_pages = len(page_texts)
logging.info(f"PDF has {num_pages} pages")
logging.info("Generating narration with page-aligned markers…")
narration = generate_presentation_text_pages(cfg, page_texts, narration_txt)
segments = split_segments(narration)
segments = enforce_segment_count(segments, num_pages)
logging.info(f"Using {len(segments)} page-aligned narration segments")
logging.info("Generating speech (per segment)…")
seg_audio = generate_tts_for_segments(cfg, segments, audio_dir)
logging.info("Concatenating audio segments…")
concat_audio(seg_audio, combined_mp3)
logging.info("Rendering slides from PDF…")
slide_imgs = pdf_to_images(cfg.pdf_path, slides_dir, cfg.dpi)
logging.info("Measuring segment durations from audio…")
durations = get_durations_from_segments(seg_audio)
logging.info("Building slideshow video (page-aligned)…")
build_slideshow(slide_imgs, durations, slideshow_mp4, cfg.fps)
logging.info("Creating captions…")
build_srt(segments, durations, srt_file)
elif cfg.segment_mode == "auto":
logging.info("Extracting document text and per-page text…")
raw_text = extract_text_from_pdf(cfg.pdf_path)
page_texts = extract_text_per_page(cfg.pdf_path)
logging.info("Generating narration…")
narration = generate_presentation_text_llm(cfg, raw_text, narration_txt)
segments = split_segments(narration)
logging.info(f"Detected {len(segments)} narration segments")
logging.info("Generating speech (per segment)…")
seg_audio = generate_tts_for_segments(cfg, segments, audio_dir)
logging.info("Concatenating audio segments…")
concat_audio(seg_audio, combined_mp3)
logging.info("Rendering slides from PDF…")
slide_imgs = pdf_to_images(cfg.pdf_path, slides_dir, cfg.dpi)
logging.info("Measuring segment durations from audio…")
durations = get_durations_from_segments(seg_audio)
total_audio = sum(durations)
logging.info(f"Total audio length: {total_audio:.2f}s")
logging.info("Computing token-weighted durations per page…")
page_durs = durations_by_page_token_weight(page_texts, total_audio)
logging.info("Building slideshow video (auto-timed)…")
build_slideshow(slide_imgs, page_durs, slideshow_mp4, cfg.fps)
logging.info("Creating captions…")
build_srt(segments, durations, srt_file)
elif cfg.segment_mode == "llm":
logging.info("Extracting document text…")
raw_text = extract_text_from_pdf(cfg.pdf_path)
logging.info("Generating narration with markers…")
narration = generate_presentation_text_llm(cfg, raw_text, narration_txt)
segments = split_segments(narration)
logging.info(f"Detected {len(segments)} narration segments")
logging.info("Generating speech (per segment)…")
seg_audio = generate_tts_for_segments(cfg, segments, audio_dir)
logging.info("Concatenating audio segments…")
concat_audio(seg_audio, combined_mp3)
logging.info("Rendering slides from PDF…")
slide_imgs = pdf_to_images(cfg.pdf_path, slides_dir, cfg.dpi)
logging.info("Measuring segment durations from audio…")
durations = get_durations_from_segments(seg_audio)
logging.info("Building slideshow video (segment-timed)…")
build_slideshow(slide_imgs, durations, slideshow_mp4, cfg.fps)
logging.info("Creating captions…")
build_srt(segments, durations, srt_file)
else:
die(f"Unknown --segment-mode '{cfg.segment_mode}'. Use pages|auto|llm")
logging.info("Muxing final video + audio…")
mux_video_audio(slideshow_mp4, combined_mp3, final_video, cfg.loudnorm)
logging.info(f"Done: {final_video}")
logging.info(f"Narration: {narration_txt}")
logging.info(f"Audio: {combined_mp3}")
logging.info(f"Captions: {srt_file}")
def parse_args() -> Config:
load_dotenv_if_present(Path(".env"))
p = argparse.ArgumentParser(prog="aivideo", description="Build narrated video from a PDF")
p.add_argument("--pdf", type=Path, default=None, help="Path to source PDF")
p.add_argument("--out", type=Path, default=None, help="Output directory")
p.add_argument("--ollama-url", default=None)
p.add_argument("--ollama-model", default=None)
p.add_argument("--kokoro-url", default=None)
p.add_argument("--kokoro-model", default=None)
p.add_argument("--kokoro-voice", default=None)
p.add_argument("--kokoro-speed", type=float, default=None)
p.add_argument("--dpi", type=int, default=None, help="Slide render DPI")
p.add_argument("--fps", type=int, default=None)
p.add_argument("--force", action="store_true", help="Regenerate all artifacts")
p.add_argument("--segment-mode", choices=["pages", "auto", "llm"], default=None,
help="Alignment strategy")
try:
bool_action = argparse.BooleanOptionalAction # type: ignore[attr-defined]
except AttributeError:
bool_action = None
if bool_action:
p.add_argument("--loudnorm", action=bool_action, default=None, help="Apply EBU R128 loudness normalization")
else:
p.add_argument("--loudnorm", action="store_true", help="Apply EBU R128 loudness normalization")
args = p.parse_args()
pdf = args.pdf or (Path(os.getenv("PDF_PATH")) if os.getenv("PDF_PATH") else None) or DEFAULT_PDF
if pdf is None or not pdf.exists():
candidate = guess_latest_pdf(Path.home() / "Downloads")
if candidate and (pdf is None or not Path(pdf).exists()):
logging.info(f"No valid --pdf provided; using newest PDF in Downloads: {candidate}")
pdf = candidate
if pdf is None or not pdf.exists():
die("Missing --pdf (and none found via PDF_PATH/.env/defaults/Downloads).")
    out_dir = args.out or (Path(os.getenv("OUT_DIR")) if os.getenv("OUT_DIR") else None) or DEFAULT_OUT_DIR
ollama_url = args.ollama_url or os.getenv("OLLAMA_URL") or DEFAULT_OLLAMA_URL
ollama_model = args.ollama_model or os.getenv("OLLAMA_MODEL") or DEFAULT_OLLAMA_MODEL
kokoro_url = args.kokoro_url or os.getenv("KOKORO_URL") or DEFAULT_KOKORO_URL
kokoro_model = args.kokoro_model or os.getenv("KOKORO_MODEL") or DEFAULT_KOKORO_MODEL
kokoro_voice = args.kokoro_voice or os.getenv("KOKORO_VOICE") or DEFAULT_KOKORO_VOICE
def _float_env(key: str, default: float) -> float:
val = os.getenv(key)
if val is None:
return default
try:
return float(val)
except ValueError:
return default
def _int_env(key: str, default: int) -> int:
val = os.getenv(key)
if val is None:
return default
try:
return int(val)
except ValueError:
return default
kokoro_speed = args.kokoro_speed if args.kokoro_speed is not None else _float_env("KOKORO_SPEED", DEFAULT_KOKORO_SPEED)
dpi = args.dpi if args.dpi is not None else _int_env("SLIDE_DPI", DEFAULT_DPI)
fps = args.fps if args.fps is not None else _int_env("FPS", DEFAULT_FPS)
if hasattr(args, "loudnorm") and args.loudnorm is not None:
loudnorm = bool(args.loudnorm)
else:
env_loudnorm = os.getenv("LOUDNORM")
if env_loudnorm is not None:
loudnorm = env_loudnorm.strip().lower() in ("1", "true", "yes", "on")
else:
loudnorm = DEFAULT_LOUDNORM
segment_mode = args.segment_mode or os.getenv("SEGMENT_MODE", DEFAULT_SEGMENT_MODE).lower()
if segment_mode not in ("pages", "auto", "llm"):
segment_mode = DEFAULT_SEGMENT_MODE
return Config(
pdf_path=pdf,
out_dir=out_dir,
ollama_url=ollama_url,
ollama_model=ollama_model,
kokoro_url=kokoro_url,
kokoro_model=kokoro_model,
kokoro_voice=kokoro_voice,
kokoro_speed=kokoro_speed,
dpi=dpi,
force=args.force,
loudnorm=loudnorm,
fps=fps,
segment_mode=segment_mode,
)
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
datefmt="%H:%M:%S"
)
try:
cfg = parse_args()
run(cfg)
except subprocess.CalledProcessError as e:
die(f"Subprocess failed: {e}")
except requests.RequestException as e:
die(f"Network error: {e}")
except Exception as e:
die(f"Unexpected error: {e}")
From here, you can fine-tune the tone, swap out voices, or replace the LLM and TTS layers with your own. It's flexible by design, and because everything runs locally, it can be very fast once set up, depending on your hardware.
Turning static documents into polished, narrated videos using only open tools, all locally, is neat.