Backend: generator.py (1600 Z.) in Module gesplittet — pipeline, textkit, bausteine, onepager, guide, elements

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-12 08:05:47 +02:00
parent 0b4a086e89
commit 5702108d28
9 changed files with 1673 additions and 1599 deletions
--- a/backend/guide.py
+++ b/backend/guide.py
@@ -0,0 +1,492 @@
+"""Guide-Generierung: 6 Schritte mit Prüfung nach jeder Phase (OnePager hat einen eigenen Weg).
+
+Prüf-Agenten notieren nur Probleme; das Anpassen übernimmt der jeweilige Erzeuger-Typ.
+Schritt-Dateien bleiben liegen → Abbruch erhält Fortschritt, ▶ setzt am offenen Schritt fort.
+"""
+
+import asyncio
+import json
+import logging
+import math
+from datetime import datetime, timezone
+from pathlib import Path
+
+from agents import run_agent
+from bausteine import _pdfs_konvertieren
+from config import DEFAULT_PROVIDER, FORMAT_ANTEIL, TEMPLATES_DIR
+from database import list_guides, update_guide
+from fsutil import atomic_write_json, atomic_write_text
+from jsonio import read_json_file as _json_datei
+from onepager import _generate_onepager
+from paths import bausteine_path, guide_content_path, project_dir
+from pipeline import (
+    CANCELLED, FAILED, GenContext, _check_then_fix, _claude_error, _extra,
+    _fail, _gather_error, _log, _prompt, _race, _semaphore, _set_progress,
+    _set_step, _timeout, clear_guide_cancelled, is_guide_cancelled,
+    run_single_slot,
+)
+from textkit import (
+    _eindeutige_titel, _lade_bausteine, _parse_fragment, _split_chunks,
+    _titel, _titel_aufloesen, _titel_index, _zuteilung_text,
+)
+
+# Module-level logger for the guide generation code.
+log = logging.getLogger("creator.guide")
+
+# Display names of the six guide steps, in the order they run.
+# NOTE(review): presumably indexed by the step numbers passed to _set_step (0–5) — confirm.
+GUIDE_STEPS = ("Auswahl", "Auswahl-Prüfung", "Gliederung", "Gliederungs-Prüfung", "Schreiben", "Lese-Prüfung")
+
+# Writers scale with the number of sections: 1 writer per ~30 sections (capped).
+# Small packages avoid lazy output on long lists and limit the damage caused by
+# a single failed writer.
+WRITER_SECTIONS = 30
+WRITER_MAX = 20
+
+
+def _guide_files(content_path: Path) -> dict:
+    """Map each fixed step name to its JSON step file beside `content_path`.
+
+    Chunk, lese-check and fix files are not listed: their names carry a running
+    index ({stem}.chunk-i.md etc.) and are built where they are used.
+    """
+    suffixes = {
+        "auswahl": "auswahl",
+        "auswahl_check": "auswahl-check",
+        "gliederung": "gliederung",
+        "gliederung_check": "gliederung-check",
+    }
+    folder = content_path.parent
+    base = content_path.stem
+    return {key: folder / f"{base}.{suffix}.json" for key, suffix in suffixes.items()}
+
+
+def guide_slot_dateien(content_path: Path) -> list[Path]:
+    """Return all step files of a guide (used for the fresh start).
+
+    These are the files beside the guide that match `<stem>.*`, excluding the
+    guide content file itself.
+    """
+    pattern = f"{content_path.stem}.*"
+    step_files: list[Path] = []
+    for candidate in content_path.parent.glob(pattern):
+        if candidate == content_path:
+            continue
+        step_files.append(candidate)
+    return step_files
+
+
+def _resolve_auswahl(data, entries: dict[int, str], k_min: int, k_max: int) -> list[int] | None:
+    """Resolve {"bausteine": [title, ...]} to Baustein numbers.
+
+    Returns the numbers in first-seen order without duplicates. Returns None
+    when the payload has the wrong shape, when fewer than 85 % of the listed
+    titles resolve, or when the number of distinct Bausteine lies outside
+    0.9 * k_min .. 1.1 * k_max.
+    """
+    titles = data.get("bausteine") if isinstance(data, dict) else None
+    if not isinstance(titles, list):
+        return None
+    lookup = _titel_index(entries)
+    # One slot per listed title; None marks a non-string or unresolvable title.
+    resolved = [_titel_aufloesen(lookup, t) if isinstance(t, str) else None for t in titles]
+    hits = [num for num in resolved if num is not None]
+    if not resolved or len(hits) / len(resolved) < 0.85:
+        return None
+    # dict.fromkeys drops repeats while keeping the order of first appearance.
+    unique = list(dict.fromkeys(hits))
+    if not 0.9 * k_min <= len(unique) <= 1.1 * k_max:
+        return None
+    return unique
+
+
+def _lese_probleme_schema(data):
+    """Validate a reading-check result.
+
+    {"ok": true} yields []; {"probleme": [{"section", "problem"}, ...]} yields
+    the list with both strings stripped; anything else yields None.
+    """
+    if not isinstance(data, dict):
+        return None
+    if data.get("ok") is True:
+        return []
+    raw = data.get("probleme")
+    if not (isinstance(raw, list) and raw):
+        return None
+
+    def well_formed(item) -> bool:
+        return (
+            isinstance(item, dict)
+            and isinstance(item.get("section"), str)
+            and isinstance(item.get("problem"), str)
+        )
+
+    # A single malformed entry invalidates the whole result.
+    if not all(well_formed(item) for item in raw):
+        return None
+    return [{"section": item["section"].strip(), "problem": item["problem"].strip()} for item in raw]
+
+
+def _resolve_gliederung(data, entries: dict[int, str], soll_min: int, soll_max: int) -> list[dict] | None:
+    """Resolve {"kapitel": [{"titel", "bausteine": [title]}]} to [{"title", "nums"}].
+
+    Each Baustein is kept only in the first chapter that lists it; chapters
+    left without any Baustein are dropped. `soll_min`/`soll_max` give the
+    allowed span of distinct Bausteine (checked with a 10 % tolerance).
+    Returns None for a malformed payload, when fewer than 85 % of the listed
+    titles resolve, or when the span is violated.
+    """
+    kapitel = data.get("kapitel") if isinstance(data, dict) else None
+    if not isinstance(kapitel, list):
+        return None
+    lookup = _titel_index(entries)
+    assigned: set[int] = set()
+    result: list[dict] = []
+    listed = resolved = 0
+    for chapter in kapitel:
+        titles = chapter.get("bausteine") if isinstance(chapter, dict) else None
+        if not isinstance(titles, list):
+            return None
+        fresh: list[int] = []
+        for title in titles:
+            listed += 1
+            if not isinstance(title, str):
+                continue
+            num = _titel_aufloesen(lookup, title)
+            if num is None:
+                continue
+            resolved += 1
+            if num not in assigned:
+                assigned.add(num)
+                fresh.append(num)
+        if not fresh:
+            continue
+        heading = str(chapter.get("titel", "")).strip() or "Kapitel"
+        result.append({"title": heading, "nums": fresh})
+    if not result or listed == 0:
+        return None
+    if resolved / listed < 0.85:
+        return None
+    if not 0.9 * soll_min <= len(assigned) <= 1.1 * soll_max:
+        return None
+    return result
+
+
+async def _generate_sections(
+    guide_id: str, topic: str, format_name: str, entries: dict[int, str],
+    facts: str, instructions: str, provider: str,
+    content_path: Path,
+) -> list[dict] | None:
+    """Run the six guide steps and return the assembled chapters.
+
+    Steps: selection, selection check, outline, outline check, writing and
+    reading check. Step files are written next to `content_path`; a valid
+    selection/outline file and existing chunk files are reused instead of
+    being regenerated.
+
+    Returns a list of {"title", "sections": [{"num", "title", "md"}]} dicts,
+    or None when the guide was cancelled or a step failed (failures are
+    reported through `_fail` before returning).
+    """
+    def is_cancelled() -> bool:
+        return is_guide_cancelled(guide_id)
+
+    ctx = GenContext(topic=topic, provider=provider, is_cancelled=is_cancelled, guide_id=guide_id)
+    spec = (TEMPLATES_DIR / "Format" / "Section.md").read_text(encoding="utf-8")
+    files = _guide_files(content_path)
+    bausteine_liste = "\n".join(f"- {t}" for t in entries.values())
+    n = len(entries)
+    # Per format: minimum share, maximum share, absolute minimum and a purpose text (as used below).
+    anteil_min, anteil_max, minimum, zweck = FORMAT_ANTEIL[format_name]
+    # Selection bounds; neither exceeds the number of available Bausteine.
+    k_min = min(n, max(minimum, math.ceil(anteil_min * n)))
+    k_max = min(n, max(k_min, math.floor(anteil_max * n)))
+    auswahl_auftrag = (
+        f"Wähle MINDESTENS {k_min} und HÖCHSTENS {k_max} der Bausteine und baue daraus {zweck}. "
+        "Wähle, was diesem Zweck dient — lass weg, was dafür nicht nötig ist."
+    )
+
+    # Step 1: selection — an existing valid file is reused (resume)
+    auswahl = _resolve_auswahl(_json_datei(files["auswahl"]), entries, k_min, k_max)
+    if auswahl is None:
+        await _set_step(guide_id, 0, "Wähle Bausteine…")
+        files["auswahl"].unlink(missing_ok=True)
+        status, auswahl = await run_single_slot(
+            ctx, "Guide-Auswahl",
+            key=f"{guide_id}-auswahl",
+            prompt=_prompt(
+                "Guide-Auswahl",
+                topic=topic, format_name=format_name, bausteine=bausteine_liste,
+                auswahl_auftrag=auswahl_auftrag, out_path=files["auswahl"], extra=_extra(instructions),
+            ),
+            role="guide", capabilities="files",
+            payload=lambda result: _resolve_auswahl(_json_datei(files["auswahl"]), entries, k_min, k_max),
+            timeout=_timeout("guide_auswahl", n),
+        )
+        if status == CANCELLED:
+            return None
+        if status == FAILED:
+            await _fail(guide_id, "Auswahl fehlgeschlagen")
+            return None
+
+    # Both helpers read `auswahl` when they are called, not when they are defined.
+    def auswahl_titel() -> str:
+        return "\n".join(f"- {_titel(entries[num])}" for num in auswahl)
+
+    def auswahl_json() -> str:
+        return json.dumps({"bausteine": [_titel(entries[num]) for num in auswahl]}, ensure_ascii=False)
+
+    # Step 2: selection check — only notes problems; a selection agent does the adjusting
+    status, fixed = await _check_then_fix(
+        ctx, name="Auswahl", step=1,
+        check_key=f"{guide_id}-auswahl-check",
+        check_prompt=_prompt(
+            "Guide-Auswahl-Check",
+            topic=topic, format_name=format_name, auswahl_auftrag=auswahl_auftrag,
+            bausteine=bausteine_liste, auswahl=auswahl_titel(),
+            out_path=files["auswahl_check"], extra=_extra(instructions),
+        ),
+        check_path=files["auswahl_check"], check_timeout=_timeout("guide_check", len(auswahl)),
+        fix_key=f"{guide_id}-auswahl-fix",
+        build_fix_prompt=lambda probleme: _prompt(
+            "Guide-Auswahl-Fix",
+            topic=topic, format_name=format_name, auswahl_auftrag=auswahl_auftrag,
+            bausteine=bausteine_liste, auswahl=auswahl_titel(),
+            probleme="\n".join(f"- {p}" for p in probleme),
+            out_path=files["auswahl"], extra=_extra(instructions),
+        ),
+        fix_payload=lambda result: _resolve_auswahl(_json_datei(files["auswahl"]), entries, k_min, k_max),
+        fix_timeout=_timeout("guide_auswahl", n), fix_role="guide",
+        # Writes the current selection back to the step file.
+        # NOTE(review): presumably invoked when the fix output is invalid — confirm in _check_then_fix.
+        on_fix_invalid=lambda: atomic_write_text(files["auswahl"], auswahl_json()),
+    )
+    if status == CANCELLED:
+        return None
+    if status == FAILED:
+        await _fail(guide_id, "Auswahl-Prüfung fehlgeschlagen")
+        return None
+    if fixed is not None:
+        auswahl = fixed
+
+    # From here on only the selected Bausteine are in play.
+    sel_entries = {num: entries[num] for num in auswahl}
+    soll = len(sel_entries)
+    sel_liste = "\n".join(f"- {t}" for t in sel_entries.values())
+
+    # Step 3: outline of the now-fixed selection
+    plan = _resolve_gliederung(_json_datei(files["gliederung"]), sel_entries, soll, soll)
+    if plan is None:
+        await _set_step(guide_id, 2, "Plane Gliederung…")
+        files["gliederung"].unlink(missing_ok=True)
+        status, plan = await run_single_slot(
+            ctx, "Gliederung",
+            key=f"{guide_id}-gliederung",
+            prompt=_prompt(
+                "Guide-Gliederung",
+                topic=topic, format_name=format_name, bausteine=sel_liste,
+                out_path=files["gliederung"], extra=_extra(instructions),
+            ),
+            role="guide", capabilities="files",
+            payload=lambda result: _resolve_gliederung(_json_datei(files["gliederung"]), sel_entries, soll, soll),
+            timeout=_timeout("plan", soll),
+        )
+        if status == CANCELLED:
+            return None
+        if status == FAILED:
+            await _fail(guide_id, "Gliederung fehlgeschlagen")
+            return None
+
+    # Both helpers read `plan` when they are called, not when they are defined.
+    def gliederung_text() -> str:
+        return "\n".join(_zuteilung_text([ch], {num: _titel(entries[num]) for num in ch["nums"]}) for ch in plan)
+
+    def gliederung_json() -> str:
+        return json.dumps(
+            {"kapitel": [{"titel": ch["title"], "bausteine": [_titel(entries[num]) for num in ch["nums"]]} for ch in plan]},
+            ensure_ascii=False,
+        )
+
+    # Step 4: outline check
+    status, fixed = await _check_then_fix(
+        ctx, name="Gliederung", step=3,
+        check_key=f"{guide_id}-gliederung-check",
+        check_prompt=_prompt(
+            "Guide-Gliederung-Check",
+            topic=topic, format_name=format_name, zweck=zweck,
+            auswahl=auswahl_titel(), gliederung=gliederung_text(),
+            out_path=files["gliederung_check"], extra=_extra(instructions),
+        ),
+        check_path=files["gliederung_check"], check_timeout=_timeout("guide_check", soll),
+        fix_key=f"{guide_id}-gliederung-fix",
+        build_fix_prompt=lambda probleme: _prompt(
+            "Guide-Gliederung-Fix",
+            topic=topic, format_name=format_name,
+            auswahl=auswahl_titel(), gliederung=gliederung_text(),
+            probleme="\n".join(f"- {p}" for p in probleme),
+            out_path=files["gliederung"], extra=_extra(instructions),
+        ),
+        fix_payload=lambda result: _resolve_gliederung(_json_datei(files["gliederung"]), sel_entries, soll, soll),
+        fix_timeout=_timeout("plan", soll), fix_role="guide",
+        # Writes the current outline back to the step file.
+        # NOTE(review): presumably invoked when the fix output is invalid — confirm in _check_then_fix.
+        on_fix_invalid=lambda: atomic_write_text(files["gliederung"], gliederung_json()),
+    )
+    if status == CANCELLED:
+        return None
+    if status == FAILED:
+        await _fail(guide_id, "Gliederungs-Prüfung fehlgeschlagen")
+        return None
+    if fixed is not None:
+        plan = fixed
+
+    # Step 5: writing — existing chunk files are reused (resume)
+    total_sections = sum(len(c["nums"]) for c in plan)
+    # One writer per WRITER_SECTIONS sections, at least 1 and at most WRITER_MAX.
+    chunks = _split_chunks(plan, min(WRITER_MAX, max(1, math.ceil(total_sections / WRITER_SECTIONS))))
+    zuteilungen = [_zuteilung_text(chunk, entries) for chunk in chunks]
+    chunk_sizes = [sum(len(c["nums"]) for c in chunk) for chunk in chunks]
+    writer_count = len(zuteilungen)
+    paths = [content_path.parent / f"{content_path.stem}.chunk-{i}.md" for i in range(1, writer_count + 1)]
+    # Indices of the chunks whose output file does not exist yet.
+    offen = [i for i, p in enumerate(paths) if not p.exists()]
+    if offen:
+        await _set_step(guide_id, 4, f"Schreibe Sections ({writer_count} Writer)…" if writer_count > 1 else "Schreibe Sections…")
+        # All open writers run concurrently; exceptions come back as results instead of propagating.
+        results = await asyncio.gather(*[
+            run_agent(
+                f"{guide_id}-w{i + 1}",
+                _prompt(
+                    "Guide-Writer",
+                    topic=topic, format_name=format_name, zuteilung=zuteilungen[i],
+                    facts=facts, spec=spec, out_path=paths[i], extra=_extra(instructions),
+                ),
+                _timeout("writer", chunk_sizes[i]), provider=provider, role="guide", capabilities="full",
+            )
+            for i in offen
+        ], return_exceptions=True)
+        if is_cancelled():
+            return None
+        # Individual writer failures are only logged.
+        for i, r in zip(offen, results):
+            if isinstance(r, BaseException):
+                _log(topic, f"Writer {i + 1}: {type(r).__name__}: {r}")
+            elif r[0] != 0:
+                _log(topic, f"Writer {i + 1}: {_claude_error('Fehler', *r)}")
+            elif not paths[i].exists():
+                _log(topic, f"Writer {i + 1}: keine Ausgabedatei erstellt")
+        # Give up only when not a single chunk file exists; partial output is carried on.
+        if not any(p.exists() for p in paths):
+            await _fail(guide_id, _gather_error("Writer-Fehler", list(results)))
+            return None
+
+    # Collect the written sections by Baustein number; the first occurrence of a number wins.
+    idx = _titel_index(entries)
+    by_num: dict[int, dict] = {}
+    for p in paths:
+        if not p.exists():
+            continue
+        for sec in _parse_fragment(p.read_text(encoding="utf-8")):
+            num = _titel_aufloesen(idx, sec["titel"])
+            if num is None:
+                _log(topic, f"Writer lieferte unbekannte Section '{sec['titel'][:40]}' (ignoriert)")
+            elif num not in by_num:
+                by_num[num] = sec
+    if not by_num:
+        await _fail(guide_id, "Keine Sections in der Writer-Ausgabe gefunden")
+        return None
+
+    # Step 6: reading check per writer package — the fix tasks writers only with the sections that were flagged
+    chunk_nums = [[num for ch in chunk for num in ch["nums"] if num in by_num] for chunk in chunks]
+    check_paths = [content_path.parent / f"{content_path.stem}.lese-check-{i}.json" for i in range(1, writer_count + 1)]
+    # A check is open when its result file is missing/invalid and its chunk has at least one written section.
+    offen_checks = [i for i, p in enumerate(check_paths) if _lese_probleme_schema(_json_datei(p)) is None and chunk_nums[i]]
+    if offen_checks:
+        await _set_step(guide_id, 5, f"Prüfe Lesbarkeit ({len(offen_checks)} Prüfer)…" if len(offen_checks) > 1 else "Prüfe Lesbarkeit…")
+
+        def sections_text(nums: list[int]) -> str:
+            return "\n\n".join(f"SECTION: {_titel(entries[num])}\n{by_num[num]['md']}" for num in nums)
+
+        slots = [{
+            "key": f"{guide_id}-lese-check-{i + 1}",
+            "prompt": _prompt(
+                "Guide-Lese-Check",
+                topic=topic, format_name=format_name, spec=spec,
+                sections=sections_text(chunk_nums[i]),
+                out_path=check_paths[i], extra=_extra(instructions),
+            ),
+            "role": "fast", "capabilities": "files",
+            # p=... binds the path of this slot at definition time.
+            "payload": (lambda result, p=check_paths[i]: _lese_probleme_schema(_json_datei(p))),
+        } for i in offen_checks]
+        res = await _race(topic, "Lese-Prüfung", slots, len(slots), _timeout("lese_check", max(chunk_sizes)), provider, cancelled=is_cancelled)
+        if is_cancelled():
+            return None
+        if res is None:
+            await _fail(guide_id, "Lese-Prüfung fehlgeschlagen")
+            return None
+
+    # One problem per section: the first one reported across all check files is kept.
+    probleme_by_num: dict[int, str] = {}
+    for p in check_paths:
+        for item in (_lese_probleme_schema(_json_datei(p)) or []):
+            num = _titel_aufloesen(idx, item["section"])
+            # An unresolved title (None) is never a key of by_num, so it is skipped here.
+            if num in by_num and num not in probleme_by_num:
+                probleme_by_num[num] = item["problem"]
+
+    if probleme_by_num:
+        _log(topic, f"Lese-Prüfung: {len(probleme_by_num)} Section(s) beanstandet")
+        await _set_step(guide_id, 5, f"Überarbeite {len(probleme_by_num)} Section(s)…")
+        # Per writer package: only the flagged sections; packages without any are not re-run.
+        fix_chunks = [[num for num in nums if num in probleme_by_num] for nums in chunk_nums]
+        fix_offen = [i for i, nums in enumerate(fix_chunks) if nums]
+        fix_paths = [content_path.parent / f"{content_path.stem}.fix-{i + 1}.md" for i in range(writer_count)]
+
+        def auftraege_text(nums: list[int]) -> str:
+            return "\n\n".join(
+                f"SECTION: {_titel(entries[num])}\nPROBLEM: {probleme_by_num[num]}\nAKTUELLER INHALT:\n{by_num[num]['md']}"
+                for num in nums
+            )
+
+        results = await asyncio.gather(*[
+            run_agent(
+                f"{guide_id}-fix-w{i + 1}",
+                _prompt(
+                    "Guide-Sections-Fix",
+                    topic=topic, format_name=format_name, facts=facts, spec=spec,
+                    auftraege=auftraege_text(fix_chunks[i]),
+                    out_path=fix_paths[i], extra=_extra(instructions),
+                ),
+                _timeout("writer", len(fix_chunks[i])), provider=provider, role="guide", capabilities="full",
+            )
+            for i in fix_offen
+        ], return_exceptions=True)
+        if is_cancelled():
+            return None
+        for i, r in zip(fix_offen, results):
+            # NOTE(review): the inner `not isinstance(...)` is redundant after the `or`.
+            if isinstance(r, BaseException) or (not isinstance(r, BaseException) and r[0] != 0):
+                _log(topic, f"Sections-Fix {i + 1} fehlgeschlagen — Original bleibt")
+        ersetzt = 0
+        for i in fix_offen:
+            if not fix_paths[i].exists():
+                continue
+            for sec in _parse_fragment(fix_paths[i].read_text(encoding="utf-8")):
+                num = _titel_aufloesen(idx, sec["titel"])
+                # Only flagged sections with non-empty content replace the original.
+                if num in probleme_by_num and sec["md"].strip():
+                    by_num[num] = sec
+                    ersetzt += 1
+        _log(topic, f"Lese-Prüfung: {ersetzt} Section(s) überarbeitet")
+
+    await _set_progress(guide_id, "Setze zusammen…")
+    # Assemble the chapters in outline order; chapters without any written section are dropped.
+    chapters: list[dict] = []
+    for ch in plan:
+        sections = [
+            {"num": num, "title": _titel(entries[num]), "md": by_num[num]["md"]}
+            for num in ch["nums"] if num in by_num
+        ]
+        if sections:
+            chapters.append({"title": ch["title"], "sections": sections})
+    # Planned sections that no writer delivered are only logged.
+    geplant = {num for ch in plan for num in ch["nums"]}
+    missing = sorted(geplant - set(by_num))
+    if missing:
+        _log(topic, f"Sections fehlen in der Writer-Ausgabe: {[_titel(entries[n]) for n in missing]}")
+    if not chapters:
+        await _fail(guide_id, "Keine Sections in der Writer-Ausgabe gefunden")
+        return None
+    return chapters
+
+
+async def reconcile_guides() -> None:
+    """Reconcile database and file system: status=done without a content file becomes error.
+
+    Runs at server start (after init_db) and catches crashes that happened
+    between writing the file and updating the status.
+    """
+    guides = await list_guides()
+    for guide in guides:
+        if guide["status"] != "done":
+            continue
+        if guide_content_path(guide["topic"], guide["format"]).exists():
+            continue
+        log.warning("[%s] Guide %s: done ohne Content-Datei — auf error gesetzt", guide["topic"], guide["id"])
+        await update_guide(
+            guide["id"],
+            status="error",
+            error_msg="Inhalt fehlt — neu generieren",
+            updated_at=datetime.now(timezone.utc).isoformat(),
+        )
+
+
+async def generate_guide(guide_id: str, topic: str, format_name: str, instructions: str = "", provider: str = DEFAULT_PROVIDER) -> None:
+    """Generate one guide end to end and record the outcome in the database.
+
+    Runs under the shared `_semaphore`. Marks the guide as "generating",
+    produces the chapters (OnePager has its own path, every other format goes
+    through `_generate_sections`), writes the content file and sets the
+    status to "done". A cancelled or already-failed run returns without
+    writing the content file; errors are reported through `_fail`. The
+    cancel flag for `guide_id` is cleared in every case.
+    """
+    async with _semaphore:
+        now = datetime.now(timezone.utc).isoformat()
+        await update_guide(guide_id, status="generating", progress="Starte…", updated_at=now)
+
+        content_path = guide_content_path(topic, format_name)
+        content_path.parent.mkdir(parents=True, exist_ok=True)
+        # Resolve the project directory once; None means there is no project folder.
+        project_candidate = project_dir(topic)
+        project = project_candidate if project_candidate.is_dir() else None
+
+        try:
+            if is_guide_cancelled(guide_id):
+                return
+
+            if project:
+                # Runs in a worker thread so the event loop is not blocked.
+                await asyncio.to_thread(_pdfs_konvertieren, project)
+
+            # "Recreate": a finished guide exists → complete fresh start.
+            # Otherwise step files are leftovers of a cancel/error → resume.
+            if content_path.exists():
+                for p_alt in guide_slot_dateien(content_path):
+                    p_alt.unlink(missing_ok=True)
+
+            if format_name == "OnePager":
+                chapters = await _generate_onepager(guide_id, topic, instructions, provider, project, content_path)
+            else:
+                alle = _lade_bausteine(bausteine_path(topic).read_text(encoding="utf-8"))
+                if not alle:
+                    await _fail(guide_id, "Keine Bausteine gefunden")
+                    return
+                entries = _eindeutige_titel(alle)
+                facts = _prompt("Guide-Fakten-Projekt", project=project) if project else _prompt("Guide-Fakten-Thema")
+                chapters = await _generate_sections(
+                    guide_id, topic, format_name, entries,
+                    facts, instructions, provider, content_path,
+                )
+            if chapters is None or is_guide_cancelled(guide_id):
+                return
+
+            atomic_write_json(content_path, {"topic": topic, "format": format_name, "chapters": chapters}, indent=1)
+
+            now = datetime.now(timezone.utc).isoformat()
+            await update_guide(guide_id, status="done", progress=None, step=None, updated_at=now)
+
+        except asyncio.TimeoutError:
+            await _fail(guide_id, "Timeout bei der Generierung")
+        except FileNotFoundError as e:
+            # The try body reads more than the Bausteine file, so keep the path
+            # that was actually missing in the log; the user-facing message stays.
+            log.warning("[%s] Guide %s: Datei nicht gefunden: %s", topic, guide_id, e)
+            await _fail(guide_id, "Bausteine fehlen")
+        except Exception as e:
+            log.exception("[%s] Guide-Generierung fehlgeschlagen (%s)", topic, guide_id)
+            await _fail(guide_id, str(e)[:2000])
+        finally:
+            clear_guide_cancelled(guide_id)