# INSTRUMENT-AUSNAHME: Victor-Go — Holmes/Watson Mix, präzises Trimmen, keine Atmo, keine Fade-Ins
"""
Schnittregeln (Victor-Direktive):
  - Clip beginnt hart am Wortanfang, 25ms Sicherheitsluft davor
  - Clip endet mit natürlichem Tail: 200ms nach letztem Sprachsignal
  - Interrupt-Stellen (d07/d09/d11/d17): Tail nur 100ms, nächste Stimme überlappt 120ms
  - Keine künstlichen Fade-Ins
  - Keine Atmo
  - Keine dicken Pausen
"""
import subprocess, json
from pathlib import Path
from pydub import AudioSegment, effects

# Directory the source clips are read from; the finished mix is written into it as well.
CLIPS = Path("/Users/victorholland/Vibe Coding/dispatcher/cockpit/watson_demo_clips")
# Target file of the final export.
OUT = CLIPS.joinpath("holmes_watson_geplänkel.mp3")
# Plain-text run log that log() appends to.
LOG = Path("/tmp/watson_mix.log")

# Playback order as (clip id, voice suffix) pairs; each pair maps to the file
# "<id>_<suffix>.mp3" inside CLIPS.
# The sequence runs d01 … d20 and strictly alternates: odd ids use "watson",
# even ids use "holmes_trim". d07, d09, d11 and d17 are the interrupting lines.
DIALOG = [
    (f"d{number:02d}", "watson" if number % 2 else "holmes_trim")
    for number in range(1, 21)
]

# Clips that come in right after the previous voice (a real interruption).
INTERRUPT = {
    "d07",
    "d09",
    "d11",
    "d17",
}

# Timing parameters, all in milliseconds.

# Safety margin kept in front of the first word.
LEAD_MS = 25

# Natural tail kept after the end of a sentence.
# NOTE(review): the module docstring states 200 ms for this tail — confirm
# which of the two values is intended.
NORMAL_TAIL_MS = 210

# Shorter tail used for interrupt clips.
INTERRUPT_TAIL_MS = 100

# Overlap applied when an interrupt clip is joined (no fade, just a tight join).
INTERRUPT_XFADE = 120

def log(msg: str) -> None:
    """Print *msg* to stdout (flushed immediately) and append it to the LOG file.

    Args:
        msg: Text to emit; a newline is added when writing to the file.
    """
    print(msg, flush=True)
    # The messages written by this script contain non-ASCII characters
    # (arrows, en dashes, umlauts). Opening the file with an explicit UTF-8
    # encoding keeps the write from failing when the locale default is not UTF-8.
    with LOG.open("a", encoding="utf-8") as f:
        f.write(msg + "\n")

def detect_speech_bounds(path):
    """Locate the spoken part of an audio clip with ffmpeg's ``silencedetect``.

    ``ffprobe`` supplies the clip duration; ``ffmpeg`` is run with
    ``silencedetect=noise=-35dB:duration=0.04`` (threshold -35 dB, minimum
    silence length 40 ms) and its stderr report is parsed.

    * ``speech_start`` is the first reported silence end between 0.05 s and
      1.3 s (presumably the end of the leading SSML pause); 0.05 s when no
      such event exists.
    * ``speech_end`` is the start of the *last* silence that begins in the
      final 40 % of the clip; ``duration - 0.55`` when no such event exists.

    Args:
        path: Audio file to analyse; converted with ``str()`` for the
            command line.

    Returns:
        Tuple ``(speech_start, speech_end, duration)``, all in seconds.

    Raises:
        subprocess.CalledProcessError: If ``ffprobe`` exits with an error.
        ValueError: If ``ffprobe`` or ``silencedetect`` prints a value that
            is not a number.
    """
    probe_cmd = ["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
                 "-of", "csv=p=0", str(path)]
    duration = float(subprocess.check_output(probe_cmd, text=True).strip())

    detect_cmd = [
        "ffmpeg", "-i", str(path),
        "-af", "silencedetect=noise=-35dB:duration=0.04",
        "-f", "null", "-"
    ]
    # silencedetect reports on stderr. ffmpeg also echoes file metadata there,
    # so undecodable bytes are replaced instead of aborting the analysis.
    report = subprocess.run(
        detect_cmd, capture_output=True, text=True, errors="replace"
    ).stderr

    silence_starts = []
    silence_ends = []
    markers = (("silence_end:", silence_ends), ("silence_start:", silence_starts))
    for line in report.splitlines():
        for marker, bucket in markers:
            _, found, rest = line.partition(marker)
            fields = rest.split()
            # Skip lines where the marker is absent or carries no value.
            if found and fields:
                bucket.append(float(fields[0]))

    # speech_start: first silence end inside the 0.05–1.3 s window. Later
    # silence ends are ignored here — they are not the leading pause.
    head_candidates = [t for t in silence_ends if 0.05 < t < 1.3]
    speech_start = head_candidates[0] if head_candidates else 0.05

    # speech_end: only silences starting in the last 40 % are considered, so
    # pauses in the middle of the sentence are not mistaken for its end.
    # Of those, the LAST one is the trailing silence; taking the first would
    # cut the clip at any short pause late in the sentence.
    cutoff = duration * 0.60
    tail_candidates = [t for t in silence_starts if t >= cutoff]
    if tail_candidates:
        speech_end = tail_candidates[-1]
    else:
        # Fallback: clip end minus the expected trailing pause.
        speech_end = duration - 0.55

    # Sanity check: the end must lie clearly after the start.
    if speech_end <= speech_start + 0.15:
        speech_end = duration - 0.35

    # On very short clips the fallback above can still land before the start
    # (or below zero); keep everything up to the clip end in that case.
    if speech_end <= speech_start:
        speech_end = max(duration, speech_start)

    return speech_start, speech_end, duration

# Start every run with an empty log file.
LOG.write_text("")

# ── 1. Trim ──────────────────────────────────────────────────────────────────
log("=== Trimme Clips ===")
trimmed = []

for clip_id, voice in DIALOG:
    source_path = CLIPS / f"{clip_id}_{voice}.mp3"
    # Load the clip and normalise it, leaving 1.0 dB of headroom.
    levelled = effects.normalize(AudioSegment.from_mp3(str(source_path)), headroom=1.0)
    total_ms = len(levelled)

    onset_s, offset_s, length_s = detect_speech_bounds(source_path)
    onset_ms = int(onset_s * 1000)
    offset_ms = int(offset_s * 1000)

    # Interrupt clips keep a shorter tail than regular ones.
    if clip_id in INTERRUPT:
        keep_after_ms = INTERRUPT_TAIL_MS
    else:
        keep_after_ms = NORMAL_TAIL_MS

    # Cut points in milliseconds, clamped to the clip boundaries.
    begin_ms = max(0, onset_ms - LEAD_MS)
    stop_ms = min(total_ms, offset_ms + keep_after_ms)
    excerpt = levelled[begin_ms:stop_ms]

    # Results shorter than 400 ms are replaced by a coarse trim that removes
    # at most 200 ms from each side.
    if len(excerpt) < 400:
        begin_ms = min(200, onset_ms)
        stop_ms = max(total_ms - 200, total_ms // 2)
        excerpt = levelled[begin_ms:stop_ms]
        log(f"    ↳ Fallback-Trim: {begin_ms}–{stop_ms}ms  result:{len(excerpt)}ms")

    trimmed.append((clip_id, voice, excerpt))

    log(f"  {clip_id} ({voice:12s})  "
        f"raw:{length_s:.2f}s  speech:{onset_s:.2f}–{offset_s:.2f}s  "
        f"→ trim:{begin_ms}–{stop_ms}ms  result:{len(excerpt)}ms")

# ── 2. Assemble ──────────────────────────────────────────────────────────────
log("\n=== Zusammensetzen ===")
result = trimmed[0][2]

for did, sp, seg in trimmed[1:]:
    if did in INTERRUPT:
        # NOTE(review): the overlap is applied where the interrupt clip itself
        # starts (on top of the previous clip's tail) — confirm this matches
        # the intent described in the module docstring.
        # Clamp the overlap so it fits inside both segments with 10 ms to
        # spare, and never goes negative.
        xfade = max(0, min(INTERRUPT_XFADE, len(seg) - 10, len(result) - 10))
        if xfade:
            # append(crossfade=...) would fade the previous clip out and the
            # incoming clip in, softening the hard word onset. The cutting
            # rules ask for an overlap without fades, so the two ends are
            # mixed unchanged with overlay(). The resulting length is the
            # same as with a crossfade: len(result) + len(seg) - xfade.
            overlap = result[-xfade:].overlay(seg[:xfade])
            result = (result[:-xfade]
                      .append(overlap, crossfade=0)
                      .append(seg[xfade:], crossfade=0))
        else:
            result = result.append(seg, crossfade=0)
        log(f"  + {did} INTERRUPT  {xfade}ms overlap")
    else:
        # Regular join: butt the clips together. The pause between them comes
        # from the tail kept on the previous clip.
        result = result.append(seg, crossfade=0)
        log(f"  + {did} direkt")

log(f"\nGesamt: {len(result)/1000:.1f}s")

# ── 3. Export ────────────────────────────────────────────────────────────────
log(f"Exportiere → {OUT}")

# Metadata tags written into the exported MP3.
mp3_tags = {
    "title": "Holmes / Watson — Geplänkel",
    "artist": "Watson Demo",
}
result.export(str(OUT), format="mp3", bitrate="320k", tags=mp3_tags)

# Size of the written file in whole kilobytes, reported in the log and on stdout.
size_kb = OUT.stat().st_size // 1024
log(f"Fertig: {size_kb} KB")
print(f"\n✓ {OUT.name}  {size_kb} KB  {len(result)/1000:.1f}s")
