Backend: generator.py (1600 Z.) in Module gesplittet — pipeline, textkit, bausteine, onepager, guide, elements
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
143
backend/textkit.py
Normal file
143
backend/textkit.py
Normal file
@@ -0,0 +1,143 @@
|
||||
"""Reine Text-Helfer: Titel-Normalisierung, Listen-Parser, Chunk-Aufteilung.
|
||||
|
||||
Kein Zustand, keine IO — überall gefahrlos importierbar.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Legacy category names; only still needed by the old-format reader. The tuple
# order is the order in which _lade_bausteine concatenates the categories.
_CATEGORIES = ("KERN", "WICHTIG", "REST")
|
||||
|
||||
|
||||
def _norm_titel(s: str) -> str:
|
||||
"""Normalisiert einen Titel für den Schlüssel-Vergleich."""
|
||||
s = re.sub(r"[`'\"<>]", "", s)
|
||||
return re.sub(r"\s+", " ", s).strip().lower()
|
||||
|
||||
|
||||
def _titel(entry: str) -> str:
|
||||
return entry.split(" — ")[0].strip() or entry
|
||||
|
||||
|
||||
def _eindeutige_titel(entries: dict[int, str]) -> dict[int, str]:
    """Make entry titles unique so they can serve as lookup keys.

    A title whose normalized form was already used gets a numeric suffix
    (" (2)", " (3)", ...) inserted before the " — description" part. Entries
    with an unused title are returned unchanged.

    Unlike a plain per-title counter, the suffix is bumped until the resulting
    normalized title is really free. Inputs such as "X", "X", "X (2)" therefore
    no longer yield two entries titled "X (2)".

    Args:
        entries: Mapping of entry number to entry text.

    Returns:
        A new mapping with the same numbers and, where needed, suffixed texts.
    """
    counts: dict[str, int] = {}  # occurrences seen per normalized base title
    used: set[str] = set()       # normalized titles already handed out
    out: dict[int, str] = {}
    for num, text in entries.items():
        titel = _titel(text)
        key = _norm_titel(titel)
        counts[key] = counts.get(key, 0) + 1
        if counts[key] > 1 or key in used:
            _, sep, beschreibung = text.partition(" — ")
            # Start at the occurrence count (at least 2) and skip any suffix
            # that would collide with a title that is already taken.
            n = max(counts[key], 2)
            while (neu_key := _norm_titel(f"{titel} ({n})")) in used:
                n += 1
            counts[key] = n
            key = neu_key
            text = f"{titel} ({n})" + (f" — {beschreibung}" if sep else "")
        used.add(key)
        out[num] = text
    return out
|
||||
|
||||
|
||||
def _titel_index(entries: dict[int, str]) -> dict[str, int]:
    """Build a lookup from normalized entry title to entry number.

    If several entries normalize to the same title, the last one wins.
    """
    index: dict[str, int] = {}
    for num, text in entries.items():
        index[_norm_titel(_titel(text))] = num
    return index
|
||||
|
||||
|
||||
def _titel_aufloesen(idx: dict[str, int], t: str) -> int | None:
    """Resolve a title to its entry number.

    The whole string is looked up first. If that misses, only the part before
    " — " is tried, which tolerates a description dragged along with the title
    ("Title — ...").

    Args:
        idx: Mapping of normalized title to entry number.
        t: Title to resolve; non-string values are rejected.

    Returns:
        The entry number, or None if ``t`` is not a string or is unknown.
    """
    if not isinstance(t, str):
        return None
    num = idx.get(_norm_titel(t))
    # Compare against None explicitly: entry number 0 is a valid hit and must
    # not fall through to the fallback lookup.
    if num is None:
        num = idx.get(_norm_titel(_titel(t)))
    return num
|
||||
|
||||
|
||||
def _parse_auswahl(text: str) -> dict[int, str]:
|
||||
"""Parst eine Baustein-Liste: `N. Titel — Kurzbeschreibung` pro Zeile."""
|
||||
entries: dict[int, str] = {}
|
||||
last = None
|
||||
for line in text.splitlines():
|
||||
m = re.match(r"\s*(\d+)[.)]\s+(.*\S)", line)
|
||||
if m:
|
||||
last = int(m.group(1))
|
||||
entries[last] = m.group(2)
|
||||
elif last is not None and line.strip():
|
||||
entries[last] += " " + line.strip()
|
||||
return entries
|
||||
|
||||
|
||||
def _parse_kategorien(text: str) -> dict[str, list[str]]:
|
||||
"""Altformat-Reader: finale Baustein-Datei mit ## KERN/WICHTIG/REST-Abschnitten."""
|
||||
cats: dict[str, list[str]] = {}
|
||||
current = None
|
||||
for line in text.splitlines():
|
||||
s = line.strip()
|
||||
m = re.match(r"#+\s*(KERN|WICHTIG|REST)\b", s, re.IGNORECASE)
|
||||
if m:
|
||||
current = m.group(1).upper()
|
||||
cats.setdefault(current, [])
|
||||
continue
|
||||
m = re.match(r"(\d+)[.)]\s+(.*\S)", s)
|
||||
if m and current:
|
||||
cats[current].append(m.group(2))
|
||||
return cats
|
||||
|
||||
|
||||
def _lade_bausteine(text: str) -> dict[int, str]:
    """Load the final building-block file in either supported format.

    If the text contains a markdown heading starting with KERN at the beginning
    of a line, it is treated as the legacy category format: the categories are
    concatenated in ``_CATEGORIES`` order and renumbered from 1. Otherwise the
    text is parsed as the newer sorted, numbered list.
    """
    is_legacy = re.search(r"^#+\s*KERN\b", text, re.IGNORECASE | re.MULTILINE) is not None
    if not is_legacy:
        return _parse_auswahl(text)
    by_category = _parse_kategorien(text)
    ordered: list[str] = []
    for name in _CATEGORIES:
        ordered.extend(by_category.get(name, []))
    return dict(enumerate(ordered, 1))
|
||||
|
||||
|
||||
# Chapter marker comment `<!-- kapitel: ... -->` (case-insensitive);
# group 1 is the chapter text without surrounding whitespace.
_FRAGMENT_KAPITEL_RE = re.compile(r"<!--\s*kapitel\s*:\s*(.*?)\s*-->", re.IGNORECASE)
|
||||
# Section marker comment `<!-- section: ... -->` (case-insensitive);
# group 1 is the section title without surrounding whitespace.
_FRAGMENT_SECTION_RE = re.compile(r"<!--\s*section\s*:\s*(.*?)\s*-->", re.IGNORECASE)
|
||||
|
||||
|
||||
def _parse_fragment(text: str) -> list[dict]:
    """Parse a writer file into ``[{"kapitel", "titel", "md"}]`` in file order.

    A chapter marker sets the chapter for the sections that follow and ends the
    section being collected. A section marker opens a new section. All other
    lines are added to the open section, or dropped if none is open. Each
    section's "md" is its lines joined with newlines and stripped.
    """
    parsed: list[dict] = []
    bodies: list[list[str]] = []  # raw lines per section, parallel to `parsed`
    chapter = None
    collecting = False
    for raw in text.splitlines():
        stripped = raw.strip()
        hit = _FRAGMENT_KAPITEL_RE.match(stripped)
        if hit is not None:
            chapter = hit.group(1)
            collecting = False
        elif (hit := _FRAGMENT_SECTION_RE.match(stripped)) is not None:
            parsed.append({"kapitel": chapter, "titel": hit.group(1), "md": ""})
            bodies.append([])
            collecting = True
        elif collecting:
            bodies[-1].append(raw)
    for section, body in zip(parsed, bodies):
        section["md"] = "\n".join(body).strip()
    return parsed
|
||||
|
||||
|
||||
def _split_chunks(chapters: list[dict], n: int) -> list[list[dict]]:
|
||||
"""Teilt Kapitel in bis zu n zusammenhängende Chunks, balanciert nach Section-Anzahl."""
|
||||
n = max(1, min(n, len(chapters)))
|
||||
chunks: list[list[dict]] = []
|
||||
current: list[dict] = []
|
||||
count = 0
|
||||
remaining_total = sum(len(c["nums"]) for c in chapters)
|
||||
remaining_chunks = n
|
||||
for ch in chapters:
|
||||
current.append(ch)
|
||||
count += len(ch["nums"])
|
||||
if remaining_chunks > 1 and count >= remaining_total / remaining_chunks:
|
||||
chunks.append(current)
|
||||
remaining_total -= count
|
||||
remaining_chunks -= 1
|
||||
current = []
|
||||
count = 0
|
||||
if current:
|
||||
chunks.append(current)
|
||||
return chunks
|
||||
|
||||
|
||||
def _zuteilung_text(chunk: list[dict], entries: dict[int, str]) -> str:
|
||||
lines = []
|
||||
for ch in chunk:
|
||||
lines.append(f"KAPITEL: {ch['title']}")
|
||||
lines.extend(f"- {entries[num]}" for num in ch["nums"])
|
||||
return "\n".join(lines)
|
||||
Reference in New Issue
Block a user