152 lines
5.1 KiB
Python
152 lines
5.1 KiB
Python
"""Reine Text-Helfer: Titel-Normalisierung, Listen-Parser, Chunk-Aufteilung.
|
|
|
|
Kein Zustand, keine IO — überall gefahrlos importierbar.
|
|
"""
|
|
|
|
import re
|
|
import unicodedata
|
|
|
|
_CATEGORIES = ("KERN", "WICHTIG", "REST")  # only still needed by the legacy-format reader
|
|
|
|
|
|
def _norm_titel(s: str) -> str:
|
|
"""Normalisiert einen Titel für den Schlüssel-Vergleich.
|
|
|
|
NFKC + casefold fangen Unicode-Varianten; Anführungszeichen, Markdown-
|
|
Emphasis und Dash-Varianten kommen aus KI-Output in allen Spielarten.
|
|
"""
|
|
s = unicodedata.normalize("NFKC", s)
|
|
s = re.sub(r"[`'\"<>„“”‚’«»*_]", "", s)
|
|
s = re.sub(r"[–—‐]", "-", s)
|
|
s = re.sub(r"\s+", " ", s).strip().strip(".:;").strip()
|
|
return s.casefold()
|
|
|
|
|
|
def _titel(entry: str) -> str:
|
|
return entry.split(" — ")[0].strip() or entry
|
|
|
|
|
|
def _eindeutige_titel(entries: dict[int, str]) -> dict[int, str]:
    """Make entry titles unique so that they can serve as lookup keys.

    Titles are compared via their normalized form. The first occurrence of a
    title stays untouched; every further occurrence gets a numeric suffix
    (" (2)", " (3)", ...) inserted between title and description.

    Suffixed titles are registered as taken as well, and a suffix that is
    already in use (for example because the input literally contains
    "Titel (2)") is skipped, so the returned titles never collide.

    Args:
        entries: Entry number -> entry text ("Title — description").

    Returns:
        A new dict with the same numbers and order, texts adjusted where needed.
    """
    counts: dict[str, int] = {}  # base key -> highest suffix handed out so far
    used: set[str] = set()  # every normalized title already present in `out`
    out: dict[int, str] = {}
    for num, text in entries.items():
        titel = _titel(text)
        key = _norm_titel(titel)
        if key in used:
            _head, sep, description = text.partition(" — ")
            n = counts.get(key, 1)
            while True:
                n += 1
                candidate = f"{titel} ({n})" + (f" — {description}" if sep else "")
                # Derive the key exactly the way a later index build would.
                candidate_key = _norm_titel(_titel(candidate))
                if candidate_key not in used:
                    break
            counts[key] = n
            text, key = candidate, candidate_key
        else:
            counts.setdefault(key, 1)
        used.add(key)
        out[num] = text
    return out
|
|
|
|
|
|
def _titel_index(entries: dict[int, str]) -> dict[str, int]:
    """Build a lookup table from normalized title to entry number.

    If several entries normalize to the same title, the last one wins.
    """
    index: dict[str, int] = {}
    for nummer, eintrag in entries.items():
        schluessel = _norm_titel(_titel(eintrag))
        index[schluessel] = nummer
    return index
|
|
|
|
|
|
def _titel_aufloesen(idx: dict[str, int], t: str) -> int | None:
    """Resolve a title to its entry number.

    Tolerates descriptions that were dragged along ("Title — ..."): the full
    text is looked up first, then only its title part.

    Args:
        idx: Normalized title -> entry number.
        t: Title as received; non-string values are rejected.

    Returns:
        The entry number, or None if `t` is not a string or nothing matches.
    """
    if not isinstance(t, str):
        return None
    num = idx.get(_norm_titel(t))
    # Compare against None explicitly: entry number 0 is a valid hit and must
    # not fall through to the fallback lookup the way `a or b` would.
    if num is None:
        num = idx.get(_norm_titel(_titel(t)))
    return num
|
|
|
|
|
|
def _parse_auswahl(text: str) -> dict[int, str]:
|
|
"""Parst eine Baustein-Liste: `N. Titel — Kurzbeschreibung` pro Zeile."""
|
|
entries: dict[int, str] = {}
|
|
last = None
|
|
for line in text.splitlines():
|
|
m = re.match(r"\s*(\d+)[.)]\s+(.*\S)", line)
|
|
if m:
|
|
last = int(m.group(1))
|
|
entries[last] = m.group(2)
|
|
elif last is not None and line.strip():
|
|
entries[last] += " " + line.strip()
|
|
return entries
|
|
|
|
|
|
def _parse_kategorien(text: str) -> dict[str, list[str]]:
|
|
"""Altformat-Reader: finale Baustein-Datei mit ## KERN/WICHTIG/REST-Abschnitten."""
|
|
cats: dict[str, list[str]] = {}
|
|
current = None
|
|
for line in text.splitlines():
|
|
s = line.strip()
|
|
m = re.match(r"#+\s*(KERN|WICHTIG|REST)\b", s, re.IGNORECASE)
|
|
if m:
|
|
current = m.group(1).upper()
|
|
cats.setdefault(current, [])
|
|
continue
|
|
m = re.match(r"(\d+)[.)]\s+(.*\S)", s)
|
|
if m and current:
|
|
cats[current].append(m.group(2))
|
|
return cats
|
|
|
|
|
|
def _lade_bausteine(text: str) -> dict[int, str]:
    """Load the final building-block file: sorted list (new) or categories (legacy).

    A file containing a KERN heading is treated as legacy format; its items
    are concatenated in category order and numbered from 1. Everything else
    is parsed as a plain numbered list.
    """
    legacy_heading = re.search(r"^#+\s*KERN\b", text, re.IGNORECASE | re.MULTILINE)
    if legacy_heading is None:
        return _parse_auswahl(text)

    by_category = _parse_kategorien(text)
    numbered: dict[int, str] = {}
    for category in _CATEGORIES:
        for item in by_category.get(category, []):
            numbered[len(numbered) + 1] = item
    return numbered
|
|
|
|
|
|
# Chapter marker `<!-- kapitel: NAME -->` (case-insensitive); group 1 is NAME without surrounding whitespace.
_FRAGMENT_KAPITEL_RE = re.compile(r"<!--\s*kapitel\s*:\s*(.*?)\s*-->", re.IGNORECASE)
|
|
# Section marker `<!-- section: TITLE -->` (case-insensitive); group 1 is TITLE without surrounding whitespace.
_FRAGMENT_SECTION_RE = re.compile(r"<!--\s*section\s*:\s*(.*?)\s*-->", re.IGNORECASE)
|
|
|
|
|
|
def _parse_fragment(text: str) -> list[dict]:
    """Parse a writer file into [{"kapitel", "titel", "md"}] in file order.

    A chapter marker sets the chapter for all following sections and closes
    the current section. A section marker opens a new section; every
    following line belongs to its Markdown body until the next marker. Lines
    outside of any section are discarded. Bodies are stripped at both ends.
    """
    collected: list[tuple[str | None, str, list[str]]] = []
    chapter = None
    body = None
    for raw in text.splitlines():
        stripped = raw.strip()
        if (hit := _FRAGMENT_KAPITEL_RE.match(stripped)) is not None:
            chapter = hit.group(1)
            body = None
        elif (hit := _FRAGMENT_SECTION_RE.match(stripped)) is not None:
            body = []
            collected.append((chapter, hit.group(1), body))
        elif body is not None:
            # Keep the unstripped line so indentation inside the body survives.
            body.append(raw)
    return [
        {"kapitel": kap, "titel": title, "md": "\n".join(lines).strip()}
        for kap, title, lines in collected
    ]
|
|
|
|
|
|
def _split_chunks(chapters: list[dict], n: int) -> list[list[dict]]:
|
|
"""Teilt Kapitel in bis zu n zusammenhängende Chunks, balanciert nach Section-Anzahl."""
|
|
n = max(1, min(n, len(chapters)))
|
|
chunks: list[list[dict]] = []
|
|
current: list[dict] = []
|
|
count = 0
|
|
remaining_total = sum(len(c["nums"]) for c in chapters)
|
|
remaining_chunks = n
|
|
for ch in chapters:
|
|
current.append(ch)
|
|
count += len(ch["nums"])
|
|
if remaining_chunks > 1 and count >= remaining_total / remaining_chunks:
|
|
chunks.append(current)
|
|
remaining_total -= count
|
|
remaining_chunks -= 1
|
|
current = []
|
|
count = 0
|
|
if current:
|
|
chunks.append(current)
|
|
return chunks
|
|
|
|
|
|
def _zuteilung_text(chunk: list[dict], entries: dict[int, str]) -> str:
|
|
lines = []
|
|
for ch in chunk:
|
|
lines.append(f"KAPITEL: {ch['title']}")
|
|
lines.extend(f"- {entries[num]}" for num in ch["nums"])
|
|
return "\n".join(lines)
|