Backend: generator.py (1600 Z.) in Module gesplittet — pipeline, textkit, bausteine, onepager, guide, elements

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-12 08:05:47 +02:00
parent 0b4a086e89
commit 5702108d28
9 changed files with 1673 additions and 1599 deletions
--- a/backend/textkit.py
+++ b/backend/textkit.py
@@ -0,0 +1,143 @@
+"""Reine Text-Helfer: Titel-Normalisierung, Listen-Parser, Chunk-Aufteilung.
+
+Kein Zustand, keine IO — überall gefahrlos importierbar.
+"""
+
+import re
+
+# Section names of the legacy file format. Their order here is the order in
+# which _lade_bausteine concatenates the sections of a legacy-format file.
+_CATEGORIES = ("KERN", "WICHTIG", "REST")  # only still used by the legacy-format reader
+
+
+def _norm_titel(s: str) -> str:
+    """Normalise a title so that two spellings of it compare equal.
+
+    Drops backticks, quotes and angle brackets, collapses every run of
+    whitespace into a single space, trims both ends and lower-cases the result.
+    """
+    ohne_zeichen = re.sub(r"[`'\"<>]", "", s)
+    kompakt = re.sub(r"\s+", " ", ohne_zeichen)
+    return kompakt.strip().lower()
+
+
+def _titel(entry: str) -> str:
+    """Return the title part of an entry, i.e. the text before the first " — ".
+
+    Falls back to the unchanged entry when that part is empty after stripping.
+    """
+    kopf = entry.split(" — ")[0].strip()
+    if kopf:
+        return kopf
+    return entry
+
+
+def _eindeutige_titel(entries: dict[int, str]) -> dict[int, str]:
+    """Make entry titles unique so that they can serve as lookup keys.
+
+    The first entry with a given (normalised) title is kept unchanged. Every
+    later one gets a numeric suffix — " (2)", " (3)", … — inserted between the
+    title and its " — description" part. A suffix whose normalised form is
+    already taken by another title is skipped, so the returned titles are
+    pairwise distinct after normalisation.
+
+    Args:
+        entries: Mapping number -> "title — description" (description optional).
+
+    Returns:
+        A new mapping with the same numbers in the same order; ``entries``
+        itself is not modified.
+    """
+    used: set[str] = set()  # normalised titles already handed out
+    counters: dict[str, int] = {}  # last suffix tried per base title
+    out: dict[int, str] = {}
+    for num, text in entries.items():
+        titel = _titel(text)
+        key = _norm_titel(titel)
+        if key in used:
+            base = key
+            n = counters.get(base, 1)
+            # Keep counting until the suffixed title is really free; a plain
+            # occurrence counter would collide with a literal "Titel (2)".
+            while key in used:
+                n += 1
+                kandidat = f"{titel} ({n})"
+                key = _norm_titel(kandidat)
+            counters[base] = n
+            _, sep, beschreibung = text.partition(" — ")
+            text = kandidat + (f" — {beschreibung}" if sep else "")
+        used.add(key)
+        out[num] = text
+    return out
+
+
+def _titel_index(entries: dict[int, str]) -> dict[str, int]:
+    """Build the lookup table normalised title -> entry number.
+
+    If several entries normalise to the same title, the last one wins.
+    """
+    index: dict[str, int] = {}
+    for num, text in entries.items():
+        index[_norm_titel(_titel(text))] = num
+    return index
+
+
+def _titel_aufloesen(idx: dict[str, int], t: str) -> int | None:
+    """Resolve a title to its entry number.
+
+    The whole string is looked up first. If that misses, only the part before
+    " — " is tried, which tolerates a description dragged along with the title
+    ("Titel — …").
+
+    Args:
+        idx: Lookup table normalised title -> number (see ``_titel_index``).
+        t: Title to resolve; anything that is not a ``str`` yields ``None``.
+
+    Returns:
+        The entry number, or ``None`` if the title is unknown.
+    """
+    if not isinstance(t, str):
+        return None
+    num = idx.get(_norm_titel(t))
+    if num is None:
+        # Explicit None check instead of `or`: entry number 0 is a valid hit
+        # and must not fall through to the second lookup.
+        num = idx.get(_norm_titel(_titel(t)))
+    return num
+
+
+def _parse_auswahl(text: str) -> dict[int, str]:
+    """Parse a numbered list with one `N. Titel — Kurzbeschreibung` item per line.
+
+    Items may start with "N." or "N)". A non-blank line that is not an item
+    start is treated as a continuation and appended, separated by one space,
+    to the most recent item. Lines before the first item are dropped. A
+    repeated number replaces the earlier item.
+    """
+    item_re = re.compile(r"\s*(\d+)[.)]\s+(.*\S)")
+    entries: dict[int, str] = {}
+    last = None
+    for line in text.splitlines():
+        hit = item_re.match(line)
+        if hit is not None:
+            last = int(hit.group(1))
+            entries[last] = hit.group(2)
+            continue
+        fortsetzung = line.strip()
+        if fortsetzung and last is not None:
+            entries[last] = f"{entries[last]} {fortsetzung}"
+    return entries
+
+
+def _parse_kategorien(text: str) -> dict[str, list[str]]:
+    """Legacy-format reader: list items grouped under KERN/WICHTIG/REST headings.
+
+    A heading is a line of one or more "#" followed by one of the three names
+    (case-insensitive); the returned keys are upper-cased. Numbered items
+    ("N." or "N)") are collected under the most recent heading. Items before
+    the first heading and all other lines are ignored.
+    """
+    header_re = re.compile(r"#+\s*(KERN|WICHTIG|REST)\b", re.IGNORECASE)
+    item_re = re.compile(r"(\d+)[.)]\s+(.*\S)")
+    cats: dict[str, list[str]] = {}
+    current = None
+    for raw in text.splitlines():
+        s = raw.strip()
+        header = header_re.match(s)
+        if header:
+            current = header.group(1).upper()
+            cats.setdefault(current, [])
+        elif current:
+            item = item_re.match(s)
+            if item:
+                cats[current].append(item.group(2))
+    return cats
+
+
+def _lade_bausteine(text: str) -> dict[int, str]:
+    """Load the final building-block file in either of its two formats.
+
+    A file containing a "# KERN" heading (any heading level, case-insensitive)
+    is read as the legacy category format: its sections are concatenated in
+    the order given by ``_CATEGORIES`` and renumbered from 1. Any other file
+    is parsed as a plain numbered list.
+    """
+    legacy = re.search(r"^#+\s*KERN\b", text, re.IGNORECASE | re.MULTILINE)
+    if legacy is None:
+        return _parse_auswahl(text)
+    cats = _parse_kategorien(text)
+    texts: list[str] = []
+    for cat in _CATEGORIES:
+        texts.extend(cats.get(cat, []))
+    return dict(enumerate(texts, 1))
+
+
+# Marker comments recognised by _parse_fragment (keyword is case-insensitive):
+# `<!-- kapitel: … -->` and `<!-- section: … -->`. Group 1 is the text after
+# the colon with surrounding whitespace removed.
+_FRAGMENT_KAPITEL_RE = re.compile(r"<!--\s*kapitel\s*:\s*(.*?)\s*-->", re.IGNORECASE)
+_FRAGMENT_SECTION_RE = re.compile(r"<!--\s*section\s*:\s*(.*?)\s*-->", re.IGNORECASE)
+
+
+def _parse_fragment(text: str) -> list[dict]:
+    """Parse a writer file into [{"kapitel", "titel", "md"}] in file order.
+
+    A kapitel marker sets the chapter for all following sections and ends the
+    section currently being collected. A section marker opens a new section.
+    Every other line is added to the open section, if there is one; lines
+    outside a section are discarded. "md" is the section's lines joined with
+    newlines and stripped at both ends.
+    """
+    kapitel = None
+    koepfe: list[tuple] = []  # (kapitel, titel) per section
+    koerper: list[list[str]] = []  # collected lines per section
+    sammelt = False
+    for line in text.splitlines():
+        s = line.strip()
+        kap = _FRAGMENT_KAPITEL_RE.match(s)
+        if kap:
+            kapitel = kap.group(1)
+            sammelt = False
+            continue
+        sec = _FRAGMENT_SECTION_RE.match(s)
+        if sec:
+            koepfe.append((kapitel, sec.group(1)))
+            koerper.append([])
+            sammelt = True
+            continue
+        if sammelt:
+            koerper[-1].append(line)
+    return [
+        {"kapitel": kap_name, "titel": titel, "md": "\n".join(zeilen).strip()}
+        for (kap_name, titel), zeilen in zip(koepfe, koerper)
+    ]
+
+
+def _split_chunks(chapters: list[dict], n: int) -> list[list[dict]]:
+    """Split chapters into at most n contiguous chunks, balanced by section count.
+
+    A chapter's weight is ``len(chapter["nums"])``. The current chunk is closed
+    as soon as it holds at least its fair share of the weight still to be
+    distributed; the last chunk takes whatever remains. ``n`` is clamped to
+    the range 1 … number of chapters. An empty chapter list yields ``[]``.
+    """
+    slots = max(1, min(n, len(chapters)))
+    offen = sum(len(ch["nums"]) for ch in chapters)
+    result: list[list[dict]] = []
+    bucket: list[dict] = []
+    gewicht = 0
+    for chapter in chapters:
+        bucket.append(chapter)
+        gewicht += len(chapter["nums"])
+        # Keep filling while this is the last chunk or the fair share is not reached.
+        if slots <= 1 or gewicht < offen / slots:
+            continue
+        result.append(bucket)
+        offen -= gewicht
+        slots -= 1
+        bucket, gewicht = [], 0
+    if bucket:
+        result.append(bucket)
+    return result
+
+
+def _zuteilung_text(chunk: list[dict], entries: dict[int, str]) -> str:
+    """Render a chunk as text: one "KAPITEL: <title>" line per chapter,
+    followed by one "- <entry>" line for each of the chapter's numbers.
+
+    Raises KeyError if a chapter references a number missing from ``entries``.
+    """
+    zeilen: list[str] = []
+    for kapitel in chunk:
+        zeilen.append(f"KAPITEL: {kapitel['title']}")
+        for num in kapitel["nums"]:
+            zeilen.append(f"- {entries[num]}")
+    return "\n".join(zeilen)