#!/usr/bin/env python3
import csv
import os
import re
import subprocess
import sys
import tempfile
from collections import Counter
from pathlib import Path
from typing import Dict, Optional, List, Tuple

import fitz  # PyMuPDF


# =========================
# CONFIG (RELATIVE TO THE SCRIPT)
# =========================
SCRIPT_DIR = Path(__file__).resolve().parent

INPUT_PDF = SCRIPT_DIR / "in" / "input.pdf"

# mapping.csv columns:
# SAP-Kundennummer;Vertragskonto;lima-Kundennummer;debitor
MAPPING_CSV = SCRIPT_DIR / "mapping.csv"

OUTPUT_DIR = SCRIPT_DIR / "out"
SPLIT_DIR = OUTPUT_DIR / "split"
ERRORS_DIR = OUTPUT_DIR / "errors"
OUTPUT_CSV = OUTPUT_DIR / "results.csv"
DEBUG_PAGES_CSV = OUTPUT_DIR / "debug_pages.csv"

OCR_LANG = "deu"  # language code passed to ocrmypdf --language
KEEP_OCR_PDF = False  # True: keep the OCR'd PDF at ./out/ocr_input.pdf
OCR_JOBS = max(1, min(4, (os.cpu_count() or 2)))  # parallel, when possible (capped at 4)

# OCR strategy:
# - "auto": OCR only when detection is probably insufficient
# - "always": always OCR
# - "never": never OCR
OCR_MODE = "auto"
# in AUTO: run OCR when the ID hit ratio is below this threshold
OCR_ID_HIT_RATIO_THRESHOLD = 0.60


# =========================
# HARD-CODED DATE + SUBJECT
# =========================
# Date used in output file names: YYMMDD (6 digits), e.g. 260113
HARDCODE_DATE_YYMMDD = "260101"
# Subject code used in output file names
HARDCODE_BETREFF_KUERZEL = "XVV"


# =========================
# REGEX / NORMALIZATION
# =========================
# Variant A:
#   Kundennummer: <SAP>;    and   Vertragskonto: <VK>;
REGEX_SAP_LABEL = re.compile(r"Kundennummer:\s*([0-9]{3,})\s*;", re.IGNORECASE)
REGEX_VK_LABEL = re.compile(r"Vertragskonto:\s*([0-9]{3,})\s*;", re.IGNORECASE)

# Variant B:
#   "<SAP> / <VK>"
REGEX_SAP_VK_PAIR = re.compile(r"\b([0-9]{3,})\s*/\s*([0-9]{3,})\b")

# First-page detector (to split equal IDs that follow each other directly)
REGEX_SEITE = re.compile(r"\bSeite\s+(\d{1,3})(?:\s*(?:von|/)\s*\d{1,3})?\b", re.IGNORECASE)

# Strip control characters (e.g. \x07)
CONTROL_CHARS = re.compile(r"[\x00-\x1F\x7F]")


def normalize_text(text: str) -> str:
    """Replace control characters and collapse all whitespace runs to single spaces."""
    without_controls = CONTROL_CHARS.sub(" ", text)
    collapsed = re.sub(r"\s+", " ", without_controls)
    return collapsed.strip()


def digits_only(s: str) -> str:
    """Strip every non-digit character from *s*; None or empty input yields ''."""
    return re.sub(r"\D+", "", s) if s else ""


def extract_ids_with_method(norm_text: str) -> Tuple[Optional[str], Optional[str], str]:
    """
    Extract (sap, vertragskonto, method) from a normalized page text.

    method is one of:
      - "labels_both" / "sap_label" / "vk_label"  (labelled variant A)
      - "pair"                                    ("<SAP> / <VK>" variant B)
      - "none"
    """
    sap_match = REGEX_SAP_LABEL.search(norm_text)
    vk_match = REGEX_VK_LABEL.search(norm_text)

    sap = digits_only(sap_match.group(1)) if sap_match else None
    vk = digits_only(vk_match.group(1)) if vk_match else None

    # Labelled hits take precedence over the pair variant.
    if sap and vk:
        return sap, vk, "labels_both"
    if sap:
        return sap, None, "sap_label"
    if vk:
        return None, vk, "vk_label"

    pair_match = REGEX_SAP_VK_PAIR.search(norm_text)
    if pair_match:
        pair_sap = digits_only(pair_match.group(1))
        pair_vk = digits_only(pair_match.group(2))
        return pair_sap or None, pair_vk or None, "pair"

    return None, None, "none"


def detect_page_no(norm_text: str) -> Optional[int]:
    """Return N from a 'Seite N [von M]' marker in the text, or None if absent."""
    match = REGEX_SEITE.search(norm_text)
    if match is None:
        return None
    try:
        return int(match.group(1))
    except ValueError:  # defensive; the pattern captures digits only
        return None


def looks_like_first_page(norm_text: str) -> bool:
    """True when the page text carries a 'Seite 1' marker."""
    page_no = detect_page_no(norm_text)
    return page_no == 1


def stable_text_fingerprint(norm_text: str) -> str:
    """Cheap fingerprint (lowercased, whitespace-folded, 2000-char cap) to spot a page scanned twice in a row."""
    lowered = (norm_text or "").strip().lower()
    folded = re.sub(r"\s+", " ", lowered)
    return folded[:2000]


def sanitize_filename_part(s: str) -> str:
    """Reduce *s* to [0-9A-Za-z_-]; runs of other characters become '_'; empty -> 'UNKNOWN'."""
    cleaned = re.sub(r"[^0-9A-Za-z_-]+", "_", (s or "").strip())
    return cleaned if cleaned else "UNKNOWN"


def ensure_unique_path(path: Path) -> Path:
    """Return *path* itself, or the first '<stem>_NN<suffix>' sibling that does not exist yet."""
    if not path.exists():
        return path
    parent, stem, suffix = path.parent, path.stem, path.suffix
    for counter in range(2, 1000):
        candidate = parent / f"{stem}_{counter:02d}{suffix}"
        if not candidate.exists():
            return candidate
    # Extremely unlikely: 998 collisions -> fall back to a PID-based name.
    return parent / f"{stem}_{os.getpid()}{suffix}"


# =========================
# CSV LOADING
# =========================
def _detect_delimiter(csv_path: Path) -> str:
    """Sniff the CSV delimiter from the first 4 KiB of the file; default to ';' on failure."""
    head = csv_path.read_text(encoding="utf-8-sig", errors="replace")[:4096]
    try:
        return csv.Sniffer().sniff(head, delimiters=";,\t|").delimiter
    except Exception:
        return ";"


def load_main_mapping(mapping_csv: Path) -> Dict[str, List[Tuple[str, str, str]]]:
    """
    Read mapping.csv (SAP-Kundennummer;Vertragskonto;lima-Kundennummer;debitor).

    Returns sap_digits -> list of (vk_digits, lima_raw, debitor_raw); a list
    because the same SAP number could in theory appear on several rows.
    Raises RuntimeError when the file is missing or the header is wrong.
    """
    if not (mapping_csv.exists() and mapping_csv.is_file()):
        raise RuntimeError(f"mapping.csv nicht gefunden: {mapping_csv}")

    delimiter = _detect_delimiter(mapping_csv)
    mapping: Dict[str, List[Tuple[str, str, str]]] = {}

    with mapping_csv.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle, delimiter=delimiter)
        required = {"SAP-Kundennummer", "Vertragskonto", "lima-Kundennummer", "debitor"}
        if not reader.fieldnames or not required.issubset(set(reader.fieldnames)):
            raise RuntimeError(
                f"mapping.csv hat nicht die erwarteten Spaltennamen. "
                f"Erwartet: {sorted(required)} | Gefunden: {reader.fieldnames}"
            )

        for row in reader:
            sap = digits_only((row.get("SAP-Kundennummer") or "").strip())
            if not sap:
                # Rows without a usable SAP key are skipped entirely.
                continue
            vk = digits_only((row.get("Vertragskonto") or "").strip())
            lima = (row.get("lima-Kundennummer") or "").strip()
            debitor = (row.get("debitor") or "").strip()
            mapping.setdefault(sap, []).append((vk, lima, debitor))

    return mapping


def find_mapping_row(
    main_map: Dict[str, List[Tuple[str, str, str]]],
    sap: str,
    vk: str,
) -> Optional[Tuple[str, str, str]]:
    """Return the (vk_digits, lima_raw, debitor_raw) row matching sap AND vk, else None."""
    target_vk = vk or ""
    for row in main_map.get(sap) or []:
        if (row[0] or "") == target_vk:
            return row
    return None


# =========================
# OCR (schnell -> fallback safe)
# =========================
def repair_pdf_with_pymupdf(input_pdf: Path, repaired_pdf: Path) -> None:
    """Rewrite the PDF via PyMuPDF (garbage-collect, deflate, clean) to repair structural damage."""
    doc = fitz.open(input_pdf)
    try:
        doc.save(repaired_pdf, garbage=4, deflate=True, clean=True)
    finally:
        doc.close()


def run_ocrmypdf(input_pdf: Path, output_pdf: Path, lang: str = "deu") -> None:
    """
    OCR *input_pdf* into *output_pdf* via the ocrmypdf CLI.

    First attempt runs in parallel (OCR_JOBS); when that fails, the PDF is
    repaired with PyMuPDF and OCR is retried single-threaded.
    Raises RuntimeError when ocrmypdf is missing or both attempts fail.
    """
    def _invoke(cmd: List[str]) -> None:
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        except FileNotFoundError:
            raise RuntimeError("ocrmypdf wurde nicht gefunden. Installiere es z.B. via: pip install ocrmypdf")
        except subprocess.CalledProcessError as e:
            msg = (e.stderr or e.stdout or "").strip()
            raise RuntimeError(f"OCR fehlgeschlagen: {msg}")

    base_cmd = ["ocrmypdf", "--skip-text", "--language", lang, "--output-type", "pdf", "--optimize", "0"]

    try:
        # Attempt 1: parallel OCR straight from the input.
        _invoke(base_cmd + ["--jobs", str(OCR_JOBS), str(input_pdf), str(output_pdf)])
    except RuntimeError:
        # Attempt 2: repair the PDF first, then OCR single-threaded.
        with tempfile.TemporaryDirectory() as tmp_dir:
            repaired = Path(tmp_dir) / "repaired.pdf"
            repair_pdf_with_pymupdf(input_pdf, repaired)
            _invoke(base_cmd + ["--jobs", "1", str(repaired), str(output_pdf)])


# =========================
# TEXT-PASS + SEGMENTIERUNG
# =========================
def get_page_texts(pdf_path: Path) -> List[str]:
    """Return the normalized text of every page, in page order (case preserved)."""
    with fitz.open(pdf_path) as doc:
        return [
            normalize_text(doc.load_page(page_idx).get_text("text") or "")
            for page_idx in range(doc.page_count)
        ]


def should_run_ocr(page_texts: List[str]) -> bool:
    """
    AUTO-mode heuristic: OCR when no IDs (SAP or VK) are detected at all,
    or when the per-page ID hit ratio is below OCR_ID_HIT_RATIO_THRESHOLD.
    """
    total = len(page_texts)
    if total == 0:
        return False

    # A page counts as a hit when either SAP or VK was extracted from it.
    hits = sum(1 for text in page_texts if any(extract_ids_with_method(text)[:2]))

    if hits == 0:
        return True
    return (hits / total) < OCR_ID_HIT_RATIO_THRESHOLD


def build_segments_from_page_texts(page_texts: List[str]) -> List[Tuple[Optional[str], Optional[str], int, int]]:
    """
    Build segments (sap_or_None, vk_or_None, start_page, end_page), end inclusive.

    Segment boundaries:
    - preferably on (SAP + Vertragskonto), as soon as both were recognized
    - a SAP change alone also starts a new segment (even when VK is missing)
    - double scan / restart with the SAME strong ID: split when the page says
      "Seite 1" or its text is identical to the immediately preceding page

    Fix: the FIRST segment is anchored at page 0, so leading pages without a
    detectable ID stay with the first document instead of being dropped from
    every output PDF (main() only patches a *trailing* coverage gap).
    """
    n = len(page_texts)
    segments: List[Tuple[Optional[str], Optional[str], int, int]] = []

    current_start: Optional[int] = None
    current_sap: Optional[str] = None
    current_vk: Optional[str] = None
    # current_strong_id is only set once SAP+VK were both seen in this segment
    current_strong_id: Optional[Tuple[str, str]] = None

    prev_fp: Optional[str] = None
    prev_strong_id: Optional[Tuple[str, str]] = None

    for i in range(n):
        norm = page_texts[i]
        sap, vk, _method = extract_ids_with_method(norm)
        fp = stable_text_fingerprint(norm) if norm else ""

        strong_id = (sap, vk) if sap and vk else None

        start_new = False

        if current_start is None:
            # Start the first segment as soon as anything is recognized; any
            # pages seen before that are folded into it below (start page 0).
            start_new = strong_id is not None or sap is not None
        else:
            # 1) Strong change (SAP+VK)
            if strong_id is not None:
                if current_strong_id is None:
                    # Inside a segment, but without VK so far. Same SAP:
                    # adopt the VK without splitting.
                    if current_sap == sap:
                        current_strong_id = strong_id
                        current_vk = vk
                    else:
                        start_new = True
                elif strong_id != current_strong_id:
                    start_new = True
                else:
                    # Same strong ID -> possibly a double scan / restart.
                    if looks_like_first_page(norm):
                        start_new = True
                    elif prev_strong_id == strong_id and prev_fp is not None and fp == prev_fp:
                        start_new = True

            # 2) No strong hit, but SAP present -> SAP change triggers a split
            elif sap is not None:
                start_new = current_sap is None or sap != current_sap

        if start_new:
            if current_start is not None:
                segments.append((current_sap, current_vk, current_start, i - 1))
                current_start = i
            else:
                # First segment: anchor at page 0 so that leading pages with
                # no detected ID are not lost.
                current_start = 0
            current_sap = sap
            current_vk = vk
            current_strong_id = strong_id  # may be None

        prev_fp = fp
        prev_strong_id = strong_id

    if current_start is not None:
        segments.append((current_sap, current_vk, current_start, n - 1))
    else:
        # Nothing was recognized at all -> one segment covering everything.
        segments.append((None, None, 0, n - 1))

    return segments


def identify_segment_ids(page_texts: List[str], start: int, end: int) -> Tuple[Optional[str], Optional[str], str]:
    """
    Pick the most likely SAP & VK for a segment.

    Priority: the most frequent (SAP, VK) pair if any page yielded both;
    otherwise independent majority votes for SAP and for VK.
    Returns (sap_best, vk_best, method_summary).
    """
    pair_counts: Counter = Counter()
    sap_counts: Counter = Counter()
    vk_counts: Counter = Counter()
    method_counts: Counter = Counter()

    for text in page_texts[start:end + 1]:
        sap, vk, method = extract_ids_with_method(text)
        method_counts[method] += 1
        if sap:
            sap_counts[sap] += 1
        if vk:
            vk_counts[vk] += 1
        if sap and vk:
            pair_counts[(sap, vk)] += 1

    if pair_counts:
        sap_best, vk_best = pair_counts.most_common(1)[0][0]
    else:
        sap_best = sap_counts.most_common(1)[0][0] if sap_counts else None
        vk_best = vk_counts.most_common(1)[0][0] if vk_counts else None

    method_summary = method_counts.most_common(1)[0][0] if method_counts else "none"
    return sap_best, vk_best, method_summary


def write_split_pdf_range(src_original: fitz.Document, start: int, end: int, out_path: Path) -> None:
    """Copy pages start..end (0-based, inclusive) from *src_original* into a new PDF at *out_path*."""
    out = fitz.open()
    try:
        out.insert_pdf(src_original, from_page=start, to_page=end)
        out.save(out_path)
    finally:
        # Close even when insert_pdf/save raises, so the in-memory document
        # is not leaked across the many segments of a big run.
        out.close()


def debitor_9(debitor_raw: str) -> str:
    """Debitor padded to 9 characters with leading zeros (digits only); '000000000' when empty."""
    digits = re.sub(r"\D+", "", debitor_raw or "")
    if not digits:
        return "000000000"
    return digits.zfill(9)


def lima_digits_only(lima_raw: str) -> str:
    """
    Lima customer number for file names: digits only, no '_' or other
    separators. Example: '147_871_891' -> '147871891'.
    """
    if not lima_raw:
        return ""
    return re.sub(r"\D+", "", lima_raw)


def write_debug_pages_csv(page_texts: List[str], main_map: Dict[str, List[Tuple[str, str, str]]], debug_csv: Path) -> None:
    """
    Write one diagnostic row per page (detected IDs, mapping hits, duplicate
    hints, text snippet) to *debug_csv* as a ';'-separated CSV.
    """
    debug_csv.parent.mkdir(parents=True, exist_ok=True)

    # Single source of truth for column order (dict construction + writer).
    fieldnames = [
        "page",
        "text_len",
        "sap_detected",
        "vertragskonto_detected",
        "method",
        "in_mapping_sap",
        "in_mapping_pair",
        "seite_no",
        "looks_like_first_page",
        "same_text_as_prev",
        "same_sap_as_prev",
        "same_vk_as_prev",
        "snippet_200",
    ]

    rows = []
    prev_fp: Optional[str] = None
    prev_sap: Optional[str] = None
    prev_vk: Optional[str] = None

    for page_1based, norm in enumerate(page_texts, start=1):
        sap, vk, method = extract_ids_with_method(norm)
        seite_no = detect_page_no(norm)
        fp = stable_text_fingerprint(norm)

        pair_in_mapping = bool(
            sap and vk and sap in main_map and find_mapping_row(main_map, sap, vk) is not None
        )

        snippet = ""
        if norm:
            snippet = norm[:200] + ("…" if len(norm) > 200 else "")

        rows.append({
            "page": page_1based,
            "text_len": len(norm or ""),
            "sap_detected": sap or "",
            "vertragskonto_detected": vk or "",
            "method": method,
            "in_mapping_sap": "1" if (sap and sap in main_map) else "0",
            "in_mapping_pair": "1" if pair_in_mapping else "0",
            "seite_no": seite_no if seite_no is not None else "",
            "looks_like_first_page": "1" if seite_no == 1 else "0",
            "same_text_as_prev": "1" if (prev_fp is not None and fp == prev_fp) else "0",
            "same_sap_as_prev": "1" if (prev_sap is not None and sap is not None and sap == prev_sap) else "0",
            "same_vk_as_prev": "1" if (prev_vk is not None and vk is not None and vk == prev_vk) else "0",
            "snippet_200": snippet,
        })

        prev_fp, prev_sap, prev_vk = fp, sap, vk

    with debug_csv.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=";")
        writer.writeheader()
        writer.writerows(rows)


# =========================
# MAIN
# =========================
def main() -> int:
    """
    Run the full pipeline: validate config, load mapping.csv, extract page
    texts (optionally OCRing the input first), segment the input PDF by
    detected SAP/Vertragskonto IDs, write one split PDF per segment
    (OK -> out/split, errors -> out/errors) plus results and debug CSVs.

    Returns:
        0 on success, 2 on any fatal configuration/IO error.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    SPLIT_DIR.mkdir(parents=True, exist_ok=True)
    ERRORS_DIR.mkdir(parents=True, exist_ok=True)

    if not INPUT_PDF.exists() or not INPUT_PDF.is_file():
        print(f"❌ INPUT_PDF nicht gefunden: {INPUT_PDF}", file=sys.stderr)
        return 2

    # Validate the hard-coded filename parts
    if not re.fullmatch(r"\d{6}", (HARDCODE_DATE_YYMMDD or "").strip()):
        print("❌ HARDCODE_DATE_YYMMDD muss exakt 6 Ziffern im Format YYMMDD sein (z.B. 260113).", file=sys.stderr)
        return 2
    if not (HARDCODE_BETREFF_KUERZEL or "").strip():
        print("❌ HARDCODE_BETREFF_KUERZEL darf nicht leer sein.", file=sys.stderr)
        return 2

    try:
        main_map = load_main_mapping(MAPPING_CSV)
    except Exception as e:
        print(f"❌ CSV-Fehler: {e}", file=sys.stderr)
        return 2

    input_pages = 0
    try:
        with fitz.open(INPUT_PDF) as d:
            input_pages = d.page_count
    except Exception as e:
        print(f"❌ Kann INPUT_PDF nicht öffnen: {e}", file=sys.stderr)
        return 2

    # Determine the text source (with/without OCR)
    ocr_tmp = None
    text_source = INPUT_PDF

    try:
        page_texts = get_page_texts(text_source)

        do_ocr = False
        if OCR_MODE.lower() == "always":
            do_ocr = True
        elif OCR_MODE.lower() == "never":
            do_ocr = False
        else:
            # "auto": OCR only when the ID hit rate looks too low
            do_ocr = should_run_ocr(page_texts)

        if do_ocr:
            if KEEP_OCR_PDF:
                ocr_pdf = OUTPUT_DIR / "ocr_input.pdf"
            else:
                ocr_tmp = tempfile.TemporaryDirectory()
                ocr_pdf = Path(ocr_tmp.name) / "ocr_input.pdf"

            run_ocrmypdf(INPUT_PDF, ocr_pdf, lang=OCR_LANG)

            text_source = ocr_pdf
            page_texts = get_page_texts(text_source)

        # Sanity: the text source must have the same page count as the input
        if len(page_texts) != input_pages:
            print(
                f"⚠️  WARN: Textquelle hat andere Seitenanzahl als Input "
                f"(text={len(page_texts)} vs input={input_pages}). Fallback auf Original-Textlayer.",
                file=sys.stderr,
            )
            text_source = INPUT_PDF
            page_texts = get_page_texts(text_source)

        # Always write the debug CSV
        write_debug_pages_csv(page_texts, main_map, DEBUG_PAGES_CSV)

        # Segmentation: prefers SAP+VK pairs
        segments = build_segments_from_page_texts(page_texts)

        # Fallback: if segments do not reach the last page, append the
        # missing trailing pages as an ERROR segment
        if segments:
            last_end = segments[-1][3]
            if last_end < input_pages - 1:
                segments.append((None, None, last_end + 1, input_pages - 1))
        else:
            segments = [(None, None, 0, input_pages - 1)]

    except Exception as e:
        # NOTE(review): this cleanup plus the one in `finally` runs twice on
        # the error path; TemporaryDirectory.cleanup() tolerates the second
        # call, so this is redundant but harmless.
        if ocr_tmp and not KEEP_OCR_PDF:
            ocr_tmp.cleanup()
        print(f"❌ Fehler bei Text/OCR/Splitting: {e}", file=sys.stderr)
        return 2
    finally:
        if ocr_tmp and not KEEP_OCR_PDF:
            ocr_tmp.cleanup()

    # Open the original PDF once (range splitting)
    try:
        src_original = fitz.open(INPUT_PDF)
    except Exception as e:
        print(f"❌ Kann INPUT_PDF nicht öffnen (Split): {e}", file=sys.stderr)
        return 2

    total = len(segments)
    results = []

    date_str = (HARDCODE_DATE_YYMMDD or "").strip()
    kuerzel = (HARDCODE_BETREFF_KUERZEL or "").strip()

    for idx, (_sap_seed, _vk_seed, start, end) in enumerate(segments, start=1):
        pct = (idx / total * 100) if total else 100.0
        progress = f"[{idx}/{total} | {pct:5.1f}%]"

        pages_count = max(0, end - start + 1)

        status = "OK"
        reason = ""
        out_path: Optional[Path] = None

        # Determine segment IDs robustly (majority over the segment's pages,
        # preferring the (SAP, VK) pair)
        sap_best, vk_best, method_best = identify_segment_ids(page_texts, start, end)
        sap = (sap_best or "").strip()
        vk = (vk_best or "").strip()

        lima_raw = ""
        lima_clean = ""
        debitor_raw = ""
        debitor_padded = ""

        try:
            # 1) Check SAP + VK and match against mapping.csv (same row!)
            if not sap:
                status = "ERROR"
                reason = "SAP_NOT_FOUND"
            elif sap not in main_map:
                status = "ERROR"
                reason = "FALSE-SAP"
            elif not vk:
                status = "ERROR"
                reason = "VK_NOT_FOUND"
            else:
                row = find_mapping_row(main_map, sap, vk)
                if row is None:
                    status = "ERROR"
                    reason = "SAP_VK_MISMATCH"
                else:
                    _vk_map, lima_raw, debitor_raw = row
                    lima_raw = (lima_raw or "").strip()
                    debitor_raw = (debitor_raw or "").strip()

                    lima_clean = lima_digits_only(lima_raw)
                    debitor_padded = debitor_9(debitor_raw)

                    if not lima_raw or not lima_clean:
                        status = "ERROR"
                        reason = "NO-LIMA"
                    # NOTE(review): when both lima and debitor are missing,
                    # this overwrites the "NO-LIMA" reason with "NO-DEBITOR" —
                    # confirm that is the intended precedence.
                    if debitor_raw.strip() == "":
                        status = "ERROR"
                        reason = "NO-DEBITOR"

            # 2) Target path + file name
            if status == "OK":
                # File name: Lima(no separators)_Date(YYMMDD)_SubjectCode_Debitor(9 digits).pdf
                lima_fn = sanitize_filename_part(lima_clean)
                date_fn = sanitize_filename_part(date_str)
                kuerzel_fn = sanitize_filename_part(kuerzel)
                deb_fn = sanitize_filename_part(debitor_padded)

                filename = f"{lima_fn}_{date_fn}_{kuerzel_fn}_{deb_fn}.pdf"
                out_path = ensure_unique_path(SPLIT_DIR / filename)
            else:
                # Error name: idx_reason_SAP[_VKxxxx].pdf
                base = sanitize_filename_part(sap) if sap else "UNKNOWN"
                if vk:
                    base = sanitize_filename_part(f"{base}_VK{vk}")
                reason_fn = sanitize_filename_part(reason)
                filename = f"{idx:03d}_{reason_fn}_{base}.pdf"
                out_path = ensure_unique_path(ERRORS_DIR / filename)

            # 3) Write the split (always from the ORIGINAL)
            write_split_pdf_range(src_original, start, end, out_path)

            print(
                f"{progress} {status:5} | {reason or '-':14} | {out_path.name} | "
                f"SAP={sap or '-'} | VK={vk or '-'} | meth={method_best} | Seiten={pages_count} | Range={start+1}-{end+1}"
            )

        except Exception as e:
            status = "ERROR"
            reason = f"EXCEPTION: {e}"
            print(f"{progress} ERROR | {reason}", file=sys.stderr)

        results.append({
            "input_file": str(INPUT_PDF),
            "text_source": str(text_source),
            "split_file": str(out_path) if out_path else "",
            "sap_kundennummer": sap,
            "vertragskonto": vk,
            "segment_method": method_best,
            "lima_kundennummer_raw": lima_raw,
            "lima_kundennummer_clean_digits": lima_clean,
            "debitor_raw": debitor_raw,
            "debitor_9": debitor_padded,
            "date_yymmdd": date_str,
            "betreff_kuerzel": kuerzel,
            "status": status,
            "reason": reason,
            "pages_count": pages_count,
            "pages_first": start + 1,
            "pages_last": end + 1,
        })

    src_original.close()

    # Log CSV
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_CSV.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "input_file",
                "text_source",
                "split_file",
                "sap_kundennummer",
                "vertragskonto",
                "segment_method",
                "lima_kundennummer_raw",
                "lima_kundennummer_clean_digits",
                "debitor_raw",
                "debitor_9",
                "date_yymmdd",
                "betreff_kuerzel",
                "status",
                "reason",
                "pages_count",
                "pages_first",
                "pages_last",
            ],
            delimiter=";",
        )
        writer.writeheader()
        writer.writerows(results)

    print("\n✅ Fertig.")
    print(f"✅ OK PDFs:         {SPLIT_DIR}")
    print(f"⚠️  Fehler PDFs:    {ERRORS_DIR}")
    print(f"✅ Log-CSV:         {OUTPUT_CSV}")
    print(f"🧾 Debug Pages CSV: {DEBUG_PAGES_CSV}")
    if KEEP_OCR_PDF and (OUTPUT_DIR / "ocr_input.pdf").exists():
        print(f"🧾 OCR-PDF:         {OUTPUT_DIR / 'ocr_input.pdf'}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
#!/usr/bin/env python3
import csv
import os
import re
import subprocess
import sys
import tempfile
from collections import Counter
from pathlib import Path
from typing import Dict, Optional, List, Tuple

import fitz  # PyMuPDF


# =========================
# CONFIG (RELATIVE TO THE SCRIPT)
# =========================
SCRIPT_DIR = Path(__file__).resolve().parent

INPUT_PDF = SCRIPT_DIR / "in" / "input.pdf"

# mapping.csv columns:
# SAP-Kundennummer;Vertragskonto;lima-Kundennummer;debitor
MAPPING_CSV = SCRIPT_DIR / "mapping.csv"

OUTPUT_DIR = SCRIPT_DIR / "out"
SPLIT_DIR = OUTPUT_DIR / "split"
ERRORS_DIR = OUTPUT_DIR / "errors"
OUTPUT_CSV = OUTPUT_DIR / "results.csv"
DEBUG_PAGES_CSV = OUTPUT_DIR / "debug_pages.csv"

OCR_LANG = "deu"  # language code passed to ocrmypdf --language
KEEP_OCR_PDF = False  # True: keep the OCR'd PDF at ./out/ocr_input.pdf
OCR_JOBS = max(1, min(4, (os.cpu_count() or 2)))  # parallel, when possible (capped at 4)

# OCR strategy:
# - "auto": OCR only when detection is probably insufficient
# - "always": always OCR
# - "never": never OCR
OCR_MODE = "auto"
# in AUTO: run OCR when the ID hit ratio is below this threshold
OCR_ID_HIT_RATIO_THRESHOLD = 0.60


# =========================
# HARD-CODED DATE + SUBJECT
# =========================
# Date used in output file names: YYMMDD (6 digits), e.g. 260113
HARDCODE_DATE_YYMMDD = "260101"
# Subject code used in output file names
HARDCODE_BETREFF_KUERZEL = "XVV"


# =========================
# REGEX / NORMALIZATION
# =========================
# Variant A:
#   Kundennummer: <SAP>;    and   Vertragskonto: <VK>;
REGEX_SAP_LABEL = re.compile(r"Kundennummer:\s*([0-9]{3,})\s*;", re.IGNORECASE)
REGEX_VK_LABEL = re.compile(r"Vertragskonto:\s*([0-9]{3,})\s*;", re.IGNORECASE)

# Variant B:
#   "<SAP> / <VK>"
REGEX_SAP_VK_PAIR = re.compile(r"\b([0-9]{3,})\s*/\s*([0-9]{3,})\b")

# First-page detector (to split equal IDs that follow each other directly)
REGEX_SEITE = re.compile(r"\bSeite\s+(\d{1,3})(?:\s*(?:von|/)\s*\d{1,3})?\b", re.IGNORECASE)

# Strip control characters (e.g. \x07)
CONTROL_CHARS = re.compile(r"[\x00-\x1F\x7F]")


def normalize_text(text: str) -> str:
    """Replace control characters and collapse all whitespace runs to single spaces."""
    without_controls = CONTROL_CHARS.sub(" ", text)
    collapsed = re.sub(r"\s+", " ", without_controls)
    return collapsed.strip()


def digits_only(s: str) -> str:
    """Strip every non-digit character from *s*; None or empty input yields ''."""
    return re.sub(r"\D+", "", s) if s else ""


def extract_ids_with_method(norm_text: str) -> Tuple[Optional[str], Optional[str], str]:
    """
    Extract (sap, vertragskonto, method) from a normalized page text.

    method is one of:
      - "labels_both" / "sap_label" / "vk_label"  (labelled variant A)
      - "pair"                                    ("<SAP> / <VK>" variant B)
      - "none"
    """
    sap_match = REGEX_SAP_LABEL.search(norm_text)
    vk_match = REGEX_VK_LABEL.search(norm_text)

    sap = digits_only(sap_match.group(1)) if sap_match else None
    vk = digits_only(vk_match.group(1)) if vk_match else None

    # Labelled hits take precedence over the pair variant.
    if sap and vk:
        return sap, vk, "labels_both"
    if sap:
        return sap, None, "sap_label"
    if vk:
        return None, vk, "vk_label"

    pair_match = REGEX_SAP_VK_PAIR.search(norm_text)
    if pair_match:
        pair_sap = digits_only(pair_match.group(1))
        pair_vk = digits_only(pair_match.group(2))
        return pair_sap or None, pair_vk or None, "pair"

    return None, None, "none"


def detect_page_no(norm_text: str) -> Optional[int]:
    """Return N from a 'Seite N [von M]' marker in the text, or None if absent."""
    match = REGEX_SEITE.search(norm_text)
    if match is None:
        return None
    try:
        return int(match.group(1))
    except ValueError:  # defensive; the pattern captures digits only
        return None


def looks_like_first_page(norm_text: str) -> bool:
    """True when the page text carries a 'Seite 1' marker."""
    page_no = detect_page_no(norm_text)
    return page_no == 1


def stable_text_fingerprint(norm_text: str) -> str:
    """Cheap fingerprint (lowercased, whitespace-folded, 2000-char cap) to spot a page scanned twice in a row."""
    lowered = (norm_text or "").strip().lower()
    folded = re.sub(r"\s+", " ", lowered)
    return folded[:2000]


def sanitize_filename_part(s: str) -> str:
    """Reduce *s* to [0-9A-Za-z_-]; runs of other characters become '_'; empty -> 'UNKNOWN'."""
    cleaned = re.sub(r"[^0-9A-Za-z_-]+", "_", (s or "").strip())
    return cleaned if cleaned else "UNKNOWN"


def ensure_unique_path(path: Path) -> Path:
    """Return *path* itself, or the first '<stem>_NN<suffix>' sibling that does not exist yet."""
    if not path.exists():
        return path
    parent, stem, suffix = path.parent, path.stem, path.suffix
    for counter in range(2, 1000):
        candidate = parent / f"{stem}_{counter:02d}{suffix}"
        if not candidate.exists():
            return candidate
    # Extremely unlikely: 998 collisions -> fall back to a PID-based name.
    return parent / f"{stem}_{os.getpid()}{suffix}"


# =========================
# CSV LOADING
# =========================
def _detect_delimiter(csv_path: Path) -> str:
    """Sniff the CSV delimiter from the first 4 KiB of the file; default to ';' on failure."""
    head = csv_path.read_text(encoding="utf-8-sig", errors="replace")[:4096]
    try:
        return csv.Sniffer().sniff(head, delimiters=";,\t|").delimiter
    except Exception:
        return ";"


def load_main_mapping(mapping_csv: Path) -> Dict[str, List[Tuple[str, str, str]]]:
    """
    Read mapping.csv (SAP-Kundennummer;Vertragskonto;lima-Kundennummer;debitor).

    Returns sap_digits -> list of (vk_digits, lima_raw, debitor_raw); a list
    because the same SAP number could in theory appear on several rows.
    Raises RuntimeError when the file is missing or the header is wrong.
    """
    if not (mapping_csv.exists() and mapping_csv.is_file()):
        raise RuntimeError(f"mapping.csv nicht gefunden: {mapping_csv}")

    delimiter = _detect_delimiter(mapping_csv)
    mapping: Dict[str, List[Tuple[str, str, str]]] = {}

    with mapping_csv.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle, delimiter=delimiter)
        required = {"SAP-Kundennummer", "Vertragskonto", "lima-Kundennummer", "debitor"}
        if not reader.fieldnames or not required.issubset(set(reader.fieldnames)):
            raise RuntimeError(
                f"mapping.csv hat nicht die erwarteten Spaltennamen. "
                f"Erwartet: {sorted(required)} | Gefunden: {reader.fieldnames}"
            )

        for row in reader:
            sap = digits_only((row.get("SAP-Kundennummer") or "").strip())
            if not sap:
                # Rows without a usable SAP key are skipped entirely.
                continue
            vk = digits_only((row.get("Vertragskonto") or "").strip())
            lima = (row.get("lima-Kundennummer") or "").strip()
            debitor = (row.get("debitor") or "").strip()
            mapping.setdefault(sap, []).append((vk, lima, debitor))

    return mapping


def find_mapping_row(
    main_map: Dict[str, List[Tuple[str, str, str]]],
    sap: str,
    vk: str,
) -> Optional[Tuple[str, str, str]]:
    """Return the (vk_digits, lima_raw, debitor_raw) row matching sap AND vk, else None."""
    target_vk = vk or ""
    for row in main_map.get(sap) or []:
        if (row[0] or "") == target_vk:
            return row
    return None


# =========================
# OCR (schnell -> fallback safe)
# =========================
def repair_pdf_with_pymupdf(input_pdf: Path, repaired_pdf: Path) -> None:
    """Rewrite a possibly damaged PDF via PyMuPDF (garbage collect, deflate, clean)."""
    doc = fitz.open(input_pdf)
    try:
        doc.save(repaired_pdf, garbage=4, deflate=True, clean=True)
    finally:
        doc.close()


def run_ocrmypdf(input_pdf: Path, output_pdf: Path, lang: str = "deu") -> None:
    """
    OCR input_pdf into output_pdf via the ocrmypdf CLI.

    Strategy: first try a parallel run on the original file; if that fails,
    repair the PDF with PyMuPDF and retry single-threaded. A missing
    ocrmypdf binary is NOT retried — repairing the PDF cannot fix that.

    Raises:
        RuntimeError: if ocrmypdf is not installed or both attempts fail.
    """
    def _run(cmd: List[str]) -> None:
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        except FileNotFoundError as exc:
            # chain the cause so callers can distinguish "binary missing"
            raise RuntimeError("ocrmypdf wurde nicht gefunden. Installiere es z.B. via: pip install ocrmypdf") from exc
        except subprocess.CalledProcessError as exc:
            msg = (exc.stderr or exc.stdout or "").strip()
            raise RuntimeError(f"OCR fehlgeschlagen: {msg}") from exc

    # --skip-text keeps existing text layers; --optimize 0 avoids slow recompression
    base = ["ocrmypdf", "--skip-text", "--language", lang, "--output-type", "pdf", "--optimize", "0"]

    # Attempt 1: parallel OCR on the original file
    try:
        _run(base + ["--jobs", str(OCR_JOBS), str(input_pdf), str(output_pdf)])
        return
    except RuntimeError as first_error:
        if isinstance(first_error.__cause__, FileNotFoundError):
            # ocrmypdf itself is missing -> a repaired PDF cannot help
            raise
        # Attempt 2: repair the PDF with PyMuPDF, then retry single-threaded
        with tempfile.TemporaryDirectory() as td:
            repaired = Path(td) / "repaired.pdf"
            repair_pdf_with_pymupdf(input_pdf, repaired)
            _run(base + ["--jobs", "1", str(repaired), str(output_pdf)])


# =========================
# TEXT-PASS + SEGMENTIERUNG
# =========================
def get_page_texts(pdf_path: Path) -> List[str]:
    """Return the normalized text of every page, in page order (case preserved)."""
    with fitz.open(pdf_path) as doc:
        return [normalize_text(page.get_text("text") or "") for page in doc]


def should_run_ocr(page_texts: List[str]) -> bool:
    """
    AUTO-mode heuristic: run OCR when no ids (SAP or VK) were recognized
    on any page, or when the fraction of pages with a recognized id is
    below OCR_ID_HIT_RATIO_THRESHOLD.
    """
    page_count = len(page_texts)
    if page_count == 0:
        return False

    # [:2] of the extraction result is (sap, vk); a page "hits" if either is set
    pages_with_id = sum(
        1 for text in page_texts if any(extract_ids_with_method(text)[:2])
    )

    if pages_with_id == 0:
        return True
    return (pages_with_id / page_count) < OCR_ID_HIT_RATIO_THRESHOLD


def build_segments_from_page_texts(page_texts: List[str]) -> List[Tuple[Optional[str], Optional[str], int, int]]:
    """
    Split the page sequence into document segments.

    Returns segments as (sap_or_None, vk_or_None, start_page, end_page);
    end_page is inclusive, page indices are 0-based.

    Segment boundaries:
    - preferably on (SAP + Vertragskonto), as soon as both are recognized
    - a SAP change also triggers a new segment (even if VK is currently missing)
    - duplicate scan / restart: with the same id -> split when the page looks
      like "Seite 1" or its text is identical to the previous page
    """
    n = len(page_texts)
    segments: List[Tuple[Optional[str], Optional[str], int, int]] = []

    current_start: Optional[int] = None
    current_sap: Optional[str] = None
    current_vk: Optional[str] = None
    # current_strong_id is only set once SAP+VK were both seen within the segment
    current_strong_id: Optional[Tuple[str, str]] = None

    prev_fp: Optional[str] = None
    prev_strong_id: Optional[Tuple[str, str]] = None

    for i in range(n):
        norm = page_texts[i]
        sap, vk, _method = extract_ids_with_method(norm)
        # fingerprint only for non-empty text; used to detect duplicated scans
        fp = stable_text_fingerprint(norm) if norm else ""

        # "strong" id = both SAP and VK recognized on this very page
        strong_id = (sap, vk) if sap and vk else None

        start_new = False

        if current_start is None:
            # Start a segment as soon as anything is recognized; otherwise keep waiting.
            if strong_id is not None or sap is not None:
                start_new = True
            else:
                # nothing recognized yet -> wait; if nothing ever appears, the
                # fallback after the loop emits a single (None, None) segment
                start_new = False
        else:
            # 1) strong change (SAP+VK)
            if strong_id is not None:
                if current_strong_id is None:
                    # We are inside a segment that so far lacked a VK. If the SAP
                    # matches, just adopt the VK without splitting.
                    if current_sap == sap:
                        current_strong_id = strong_id
                        current_vk = vk
                        start_new = False
                    else:
                        start_new = True
                elif strong_id != current_strong_id:
                    start_new = True
                else:
                    # same strong id -> possibly a duplicate scan / document restart
                    if looks_like_first_page(norm):
                        start_new = True
                    elif prev_strong_id == strong_id and prev_fp is not None and fp == prev_fp:
                        start_new = True

            # 2) no strong hit, but SAP present -> SAP change triggers a new segment
            elif sap is not None:
                if current_sap is None:
                    start_new = True
                elif sap != current_sap:
                    start_new = True
                else:
                    # same SAP, VK missing -> no split
                    start_new = False

        if start_new:
            if current_start is not None:
                # close the previous segment just before this page
                segments.append((current_sap, current_vk, current_start, i - 1))

            current_start = i if i != 0 else 0  # NOTE(review): equivalent to plain `i`
            current_sap = sap
            current_vk = vk
            current_strong_id = strong_id  # may be None

        prev_fp = fp
        prev_strong_id = strong_id

    if current_start is not None:
        # close the trailing segment at the last page
        segments.append((current_sap, current_vk, current_start, n - 1))
    else:
        # nothing was ever recognized -> one catch-all segment without ids
        segments.append((None, None, 0, n - 1))

    return segments


def identify_segment_ids(page_texts: List[str], start: int, end: int) -> Tuple[Optional[str], Optional[str], str]:
    """
    Determine the most likely SAP and VK ids for the segment [start, end].

    Priority:
      1) the most frequent (SAP, VK) pair, if any page yielded both
      2) otherwise an independent majority vote for SAP and for VK
    Returns (sap_best, vk_best, method_summary).
    """
    pairs: Counter = Counter()
    saps: Counter = Counter()
    vks: Counter = Counter()
    methods: Counter = Counter()

    for idx in range(start, end + 1):
        sap, vk, method = extract_ids_with_method(page_texts[idx])
        methods[method] += 1
        if sap:
            saps[sap] += 1
        if vk:
            vks[vk] += 1
        if sap and vk:
            pairs[(sap, vk)] += 1

    if pairs:
        sap_best, vk_best = pairs.most_common(1)[0][0]
    else:
        sap_best = saps.most_common(1)[0][0] if saps else None
        vk_best = vks.most_common(1)[0][0] if vks else None

    summary = methods.most_common(1)[0][0] if methods else "none"
    return sap_best, vk_best, summary


def write_split_pdf_range(src_original: fitz.Document, start: int, end: int, out_path: Path) -> None:
    """Copy pages start..end (inclusive) of src_original into a new PDF at out_path."""
    with fitz.open() as dest:
        dest.insert_pdf(src_original, from_page=start, to_page=end)
        dest.save(out_path)


def debitor_9(debitor_raw: str) -> str:
    """Pad the debitor number to 9 characters with leading zeros (digits only)."""
    digits = "".join(re.findall(r"\d", debitor_raw or ""))
    if not digits:
        # no digits at all -> all-zero placeholder
        return "000000000"
    return digits.zfill(9)


def lima_digits_only(lima_raw: str) -> str:
    """
    Lima customer number for the filename: digits only, with '_' and any
    other separators stripped. Example: '147_871_891' -> '147871891'.
    """
    return "".join(re.findall(r"\d", lima_raw or ""))


def write_debug_pages_csv(page_texts: List[str], main_map: Dict[str, List[Tuple[str, str, str]]], debug_csv: Path) -> None:
    """
    Write one diagnostic row per page to debug_csv (semicolon-delimited):
    detected ids, mapping hits, page-number heuristics and fingerprint
    comparisons against the previous page.
    """
    debug_csv.parent.mkdir(parents=True, exist_ok=True)

    columns = [
        "page",
        "text_len",
        "sap_detected",
        "vertragskonto_detected",
        "method",
        "in_mapping_sap",
        "in_mapping_pair",
        "seite_no",
        "looks_like_first_page",
        "same_text_as_prev",
        "same_sap_as_prev",
        "same_vk_as_prev",
        "snippet_200",
    ]

    records = []
    last_fp: Optional[str] = None
    last_sap: Optional[str] = None
    last_vk: Optional[str] = None

    for page_idx, norm in enumerate(page_texts, start=1):
        sap, vk, method = extract_ids_with_method(norm)
        seite_no = detect_page_no(norm)
        fp = stable_text_fingerprint(norm)

        sap_known = bool(sap and sap in main_map)
        # only consult the mapping row when all lookup preconditions hold
        pair_known = bool(
            sap and vk and sap in main_map
            and find_mapping_row(main_map, sap, vk) is not None
        )

        snippet = (norm[:200] + ("…" if len(norm) > 200 else "")) if norm else ""

        records.append({
            "page": page_idx,
            "text_len": len(norm or ""),
            "sap_detected": sap or "",
            "vertragskonto_detected": vk or "",
            "method": method,
            "in_mapping_sap": "1" if sap_known else "0",
            "in_mapping_pair": "1" if pair_known else "0",
            "seite_no": seite_no if seite_no is not None else "",
            "looks_like_first_page": "1" if seite_no == 1 else "0",
            "same_text_as_prev": "1" if (last_fp is not None and fp == last_fp) else "0",
            "same_sap_as_prev": "1" if (last_sap is not None and sap is not None and sap == last_sap) else "0",
            "same_vk_as_prev": "1" if (last_vk is not None and vk is not None and vk == last_vk) else "0",
            "snippet_200": snippet,
        })

        last_fp = fp
        last_sap = sap
        last_vk = vk

    with debug_csv.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns, delimiter=";")
        writer.writeheader()
        writer.writerows(records)


# =========================
# MAIN
# =========================
def main() -> int:
    """
    Pipeline entry point.

    Steps:
      1. Validate configuration and load mapping.csv.
      2. Extract per-page text from INPUT_PDF; run OCR depending on OCR_MODE.
      3. Segment the pages by detected SAP/Vertragskonto ids.
      4. Write one split PDF per segment plus results and debug CSVs.

    Returns:
        Process exit code: 0 on success, 2 on a fatal setup/processing error.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    SPLIT_DIR.mkdir(parents=True, exist_ok=True)
    ERRORS_DIR.mkdir(parents=True, exist_ok=True)

    if not INPUT_PDF.exists() or not INPUT_PDF.is_file():
        print(f"❌ INPUT_PDF nicht gefunden: {INPUT_PDF}", file=sys.stderr)
        return 2

    # Validate the hard-coded filename parts before doing any real work
    if not re.fullmatch(r"\d{6}", (HARDCODE_DATE_YYMMDD or "").strip()):
        print("❌ HARDCODE_DATE_YYMMDD muss exakt 6 Ziffern im Format YYMMDD sein (z.B. 260113).", file=sys.stderr)
        return 2
    if not (HARDCODE_BETREFF_KUERZEL or "").strip():
        print("❌ HARDCODE_BETREFF_KUERZEL darf nicht leer sein.", file=sys.stderr)
        return 2

    try:
        main_map = load_main_mapping(MAPPING_CSV)
    except Exception as e:
        print(f"❌ CSV-Fehler: {e}", file=sys.stderr)
        return 2

    input_pages = 0
    try:
        with fitz.open(INPUT_PDF) as d:
            input_pages = d.page_count
    except Exception as e:
        print(f"❌ Kann INPUT_PDF nicht öffnen: {e}", file=sys.stderr)
        return 2

    # Decide the text source (with/without OCR)
    ocr_tmp = None
    text_source = INPUT_PDF

    try:
        page_texts = get_page_texts(text_source)

        do_ocr = False
        if OCR_MODE.lower() == "always":
            do_ocr = True
        elif OCR_MODE.lower() == "never":
            do_ocr = False
        else:
            do_ocr = should_run_ocr(page_texts)

        if do_ocr:
            if KEEP_OCR_PDF:
                ocr_pdf = OUTPUT_DIR / "ocr_input.pdf"
            else:
                ocr_tmp = tempfile.TemporaryDirectory()
                ocr_pdf = Path(ocr_tmp.name) / "ocr_input.pdf"

            run_ocrmypdf(INPUT_PDF, ocr_pdf, lang=OCR_LANG)

            text_source = ocr_pdf
            page_texts = get_page_texts(text_source)

        # Sanity: the text source must have the same page count as the input
        if len(page_texts) != input_pages:
            print(
                f"⚠️  WARN: Textquelle hat andere Seitenanzahl als Input "
                f"(text={len(page_texts)} vs input={input_pages}). Fallback auf Original-Textlayer.",
                file=sys.stderr,
            )
            text_source = INPUT_PDF
            page_texts = get_page_texts(text_source)

        # Always write the per-page debug CSV
        write_debug_pages_csv(page_texts, main_map, DEBUG_PAGES_CSV)

        # Segmentation: prefers SAP+VK boundaries
        segments = build_segments_from_page_texts(page_texts)

        # Safety net: append an ERROR segment for any trailing pages the
        # segmentation did not cover
        if segments:
            last_end = segments[-1][3]
            if last_end < input_pages - 1:
                segments.append((None, None, last_end + 1, input_pages - 1))
        else:
            segments = [(None, None, 0, input_pages - 1)]

    except Exception as e:
        # Cleanup happens exactly once, in the finally below: calling
        # TemporaryDirectory.cleanup() a second time raises on the
        # already-removed directory and would mask the original error.
        print(f"❌ Fehler bei Text/OCR/Splitting: {e}", file=sys.stderr)
        return 2
    finally:
        if ocr_tmp and not KEEP_OCR_PDF:
            ocr_tmp.cleanup()

    # Open the original PDF once (range splitting always uses the original)
    try:
        src_original = fitz.open(INPUT_PDF)
    except Exception as e:
        print(f"❌ Kann INPUT_PDF nicht öffnen (Split): {e}", file=sys.stderr)
        return 2

    total = len(segments)
    results = []

    date_str = (HARDCODE_DATE_YYMMDD or "").strip()
    kuerzel = (HARDCODE_BETREFF_KUERZEL or "").strip()

    try:
        for idx, (_sap_seed, _vk_seed, start, end) in enumerate(segments, start=1):
            pct = (idx / total * 100) if total else 100.0
            progress = f"[{idx}/{total} | {pct:5.1f}%]"

            pages_count = max(0, end - start + 1)

            status = "OK"
            reason = ""
            out_path: Optional[Path] = None

            # Robustly determine segment ids (majority over segment pages, pair preferred)
            sap_best, vk_best, method_best = identify_segment_ids(page_texts, start, end)
            sap = (sap_best or "").strip()
            vk = (vk_best or "").strip()

            lima_raw = ""
            lima_clean = ""
            debitor_raw = ""
            debitor_padded = ""

            try:
                # 1) check SAP + VK and match against mapping.csv (same row!)
                if not sap:
                    status = "ERROR"
                    reason = "SAP_NOT_FOUND"
                elif sap not in main_map:
                    status = "ERROR"
                    reason = "FALSE-SAP"
                elif not vk:
                    status = "ERROR"
                    reason = "VK_NOT_FOUND"
                else:
                    row = find_mapping_row(main_map, sap, vk)
                    if row is None:
                        status = "ERROR"
                        reason = "SAP_VK_MISMATCH"
                    else:
                        _vk_map, lima_raw, debitor_raw = row
                        lima_raw = (lima_raw or "").strip()
                        debitor_raw = (debitor_raw or "").strip()

                        lima_clean = lima_digits_only(lima_raw)
                        debitor_padded = debitor_9(debitor_raw)

                        # NOTE: if both are missing, NO-DEBITOR wins (last write)
                        if not lima_raw or not lima_clean:
                            status = "ERROR"
                            reason = "NO-LIMA"
                        if debitor_raw.strip() == "":
                            status = "ERROR"
                            reason = "NO-DEBITOR"

                # 2) target directory + file name
                if status == "OK":
                    # Filename: Lima(no separators)_Date(YYMMDD)_Subject_Debitor(9 digits).pdf
                    lima_fn = sanitize_filename_part(lima_clean)
                    date_fn = sanitize_filename_part(date_str)
                    kuerzel_fn = sanitize_filename_part(kuerzel)
                    deb_fn = sanitize_filename_part(debitor_padded)

                    filename = f"{lima_fn}_{date_fn}_{kuerzel_fn}_{deb_fn}.pdf"
                    out_path = ensure_unique_path(SPLIT_DIR / filename)
                else:
                    # Error filename: idx_reason_SAP[_VKxxxx].pdf
                    base = sanitize_filename_part(sap) if sap else "UNKNOWN"
                    if vk:
                        base = sanitize_filename_part(f"{base}_VK{vk}")
                    reason_fn = sanitize_filename_part(reason)
                    filename = f"{idx:03d}_{reason_fn}_{base}.pdf"
                    out_path = ensure_unique_path(ERRORS_DIR / filename)

                # 3) write the split (always from the ORIGINAL)
                write_split_pdf_range(src_original, start, end, out_path)

                print(
                    f"{progress} {status:5} | {reason or '-':14} | {out_path.name} | "
                    f"SAP={sap or '-'} | VK={vk or '-'} | meth={method_best} | Seiten={pages_count} | Range={start+1}-{end+1}"
                )

            except Exception as e:
                # per-segment failures are logged and recorded, not fatal
                status = "ERROR"
                reason = f"EXCEPTION: {e}"
                print(f"{progress} ERROR | {reason}", file=sys.stderr)

            results.append({
                "input_file": str(INPUT_PDF),
                "text_source": str(text_source),
                "split_file": str(out_path) if out_path else "",
                "sap_kundennummer": sap,
                "vertragskonto": vk,
                "segment_method": method_best,
                "lima_kundennummer_raw": lima_raw,
                "lima_kundennummer_clean_digits": lima_clean,
                "debitor_raw": debitor_raw,
                "debitor_9": debitor_padded,
                "date_yymmdd": date_str,
                "betreff_kuerzel": kuerzel,
                "status": status,
                "reason": reason,
                "pages_count": pages_count,
                "pages_first": start + 1,
                "pages_last": end + 1,
            })
    finally:
        # close the source even if something raises outside the per-segment try
        src_original.close()

    # Log CSV
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_CSV.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "input_file",
                "text_source",
                "split_file",
                "sap_kundennummer",
                "vertragskonto",
                "segment_method",
                "lima_kundennummer_raw",
                "lima_kundennummer_clean_digits",
                "debitor_raw",
                "debitor_9",
                "date_yymmdd",
                "betreff_kuerzel",
                "status",
                "reason",
                "pages_count",
                "pages_first",
                "pages_last",
            ],
            delimiter=";",
        )
        writer.writeheader()
        writer.writerows(results)

    print("\n✅ Fertig.")
    print(f"✅ OK PDFs:         {SPLIT_DIR}")
    print(f"⚠️  Fehler PDFs:    {ERRORS_DIR}")
    print(f"✅ Log-CSV:         {OUTPUT_CSV}")
    print(f"🧾 Debug Pages CSV: {DEBUG_PAGES_CSV}")
    if KEEP_OCR_PDF and (OUTPUT_DIR / "ocr_input.pdf").exists():
        print(f"🧾 OCR-PDF:         {OUTPUT_DIR / 'ocr_input.pdf'}")
    return 0


if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return code, same as before
    sys.exit(main())
