#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
llm_double_take.py — 「書かせて→もう一度読ませて→直す」日本語原稿レビュー（ローカル/LM Studio対応）

MIT License

概要:
  - LM Studio / Ollama / OpenAI など OpenAI互換APIに接続し、*.txt / *.md の原稿を一括レビュー
  - LLMに「違和感の指摘」と「修正後全文（fixed_text）」をJSONで返させ、
    問題があるファイルだけ .review.json / .diff.txt / 修正本文 を出力します（--only-problem）。
  - 事前ヒント（軽い正規表現プリスキャン）で見落としを減らし、strictモードではチェック項目のpass/failを可視化。
"""

from __future__ import annotations
import argparse, difflib, json, os, re, sys, time
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional

import requests

# Default opening and closing stock phrases. They are the argparse defaults of
# --start_prefix / --closing and are only placed into the user prompt when
# template enforcement is enabled.
DEFAULT_START_PREFIX = "春日部つむぎのIT用語講座へようこそ！　今日のキーワードは"
DEFAULT_CLOSING = "春日部つむぎのIT用語講座、また聞いてね"

# Upper-case tokens that pre_scan() does NOT report as "all_caps" suspects
# (common technical acronyms). Membership is tested with the exact token text.
WHITELIST_UPPER = {
    "API","HTTP","HTTPS","TCP","UDP","DNS","DHCP","SSH","TLS","SSL","JSON","XML","HTML","CSS",
    "CPU","GPU","WAF","NAS","RAID","VM","AI","OIDC","SAML","JWT","URL","URI","GET","POST",
    "SMTP","IMAP","POP3","SQL","DB","UI","UX"
}

# System prompt template sent to the model. build_system_prompt() fills the
# single `{json_spec}` placeholder via str.format(), so any other literal
# brace added to this text would have to be doubled ("{{" / "}}").
SYSTEM_PROMPT_BASE = """あなたは日本語ライティングの監査官かつ校正者です。用途は技術系記事（日本語）の原稿チェックです。
以下の観点で【指摘】し、必要であれば【修正】を加えてください。意味改変は最小限、冗長化しない。

[重点チェック（必須）]
1) カタカナ+英字が1語に混在する表記（例: ポリモorphism）→ 一般的表記へ統一
2) 全角/半角の混在（ＡＩ/AＩ 等）→ 半角英数に統一
3) 半角ｶﾀｶﾅ → 全角カタカナへ
4) 文末の過度な記号（！？多重）や体裁の乱れ → 抑制
5) 不自然な大文字語（ALL CAPS）の誤用。ただし一般略語(API/HTTP/JSON/CPU/GPU等)は許容
6) 誤変換や日本語としての違和感
7) 開始/締めの定型句の有無（必要な場合のみ自然に補う）
8) 固有名詞・専門用語の表記正規化（例: Transformer → トランスフォーマー(Transformer)）

[日本語のゆらぎの扱い]
- ら抜き言葉は原則正す（例: 見れる→見られる／見ることができる）。
- い抜き（〜してる等）は、文体がフォーマルなら「〜している」に統一。
- 二重敬語（おっしゃられる 等）は単純敬語へ。
- 重ね言葉（違和感を感じる 等）は自然な表現へ。
- ただし引用/会話や意図的な口語は、文脈上自然なら尊重する。

[スタイル]
- 文体は指定トーンに合わせる（formal=です・ます調 / casual=口語許容）。
- 箇条書き・見出し・改行を尊重し、構成は維持。
- Markdownのコードフェンス（``` で囲まれた範囲）は内容・改行・インデントを変更しない。

[出力形式（JSON, 日本語, 厳守）]
{json_spec}
出力は**必ず**有効なJSONのみ。JSON以外の文字を付け加えないこと。
"""

# Output-format description used in strict mode: in addition to "issues",
# "fixed_text" and "notes", the model is asked for a "checks" array with one
# pass/fail entry per category. Inserted into SYSTEM_PROMPT_BASE as the
# `json_spec` value (it is a format argument, so its braces need no escaping).
STRICT_JSON_SPEC = """{
  "checks": [
    {"category":"mixed_kana_english","passed":true,"details":"指摘なし/軽微"},
    {"category":"zenhan_mixed","passed":true,"details":"指摘なし/軽微"},
    {"category":"halfwidth_katakana","passed":true,"details":"指摘なし/軽微"},
    {"category":"punct_marks","passed":true,"details":"指摘なし/軽微"},
    {"category":"all_caps_misuse","passed":true,"details":"指摘なし/軽微"},
    {"category":"weird_wording","passed":true,"details":"指摘なし/軽微"},
    {"category":"missing_prefix","passed":true,"details":"満たしている/不要"},
    {"category":"missing_closing","passed":true,"details":"満たしている/不要"},
    {"category":"term_normalize","passed":true,"details":"適切"}
  ],
  "issues": [],
  "fixed_text": "（修正後の全文。入力と同じ形式で）",
  "notes": "全体方針や曖昧箇所に関する補足（必要なら）。"
}
空のときでも "checks" は必ず各カテゴリ1件ずつ含め、"issues" は空配列にしてください。
"""

# Output-format description used when strict mode is off: only "issues",
# "fixed_text" and "notes" are requested (no per-category "checks" array).
RELAXED_JSON_SPEC = """{
  "issues": [],
  "fixed_text": "（修正後の全文。入力と同じ形式で）",
  "notes": "全体方針や曖昧箇所に関する補足（必要なら）。"
}
空のときは "issues": [] とし、"fixed_text" は入力全文をそのまま返してください。
"""

def build_system_prompt(strict: bool, enforce_templates: bool, tone: str) -> str:
    """Assemble the system prompt for one review request.

    Args:
        strict: Use the strict JSON spec (with "checks") instead of the
            relaxed one.
        enforce_templates: When False, a note is appended telling the model
            that the opening/closing stock-phrase check is not needed.
        tone: "formal" appends the polite-style note; any other value appends
            the casual-style note.

    Returns:
        The base prompt with the JSON spec substituted, followed by the
        optional template note and exactly one tone note.
    """
    if strict:
        json_spec = STRICT_JSON_SPEC
    else:
        json_spec = RELAXED_JSON_SPEC

    sections = [SYSTEM_PROMPT_BASE.format(json_spec=json_spec)]

    if not enforce_templates:
        sections.append("\n※今回は開始/締めの定型句チェックは不要です。")

    tone_note = (
        "\n※文体: です・ます調を基本。口語は必要最小限。"
        if tone == "formal"
        else "\n※文体: カジュアル許容。ただし誤用は正す。"
    )
    sections.append(tone_note)

    return "".join(sections)

def build_user_prompt(text: str, start_prefix: str, closing: str,
                      suspects: Dict[str, List[str]], enforce_templates: bool) -> str:
    """Build the user message: instructions, optional sections, then the manuscript.

    Args:
        text: Manuscript body, appended verbatim after the manuscript marker.
        start_prefix: Expected opening phrase (used only with enforce_templates).
        closing: Expected closing phrase (used only with enforce_templates).
        suspects: Mapping of category name to matched snippets. At most the
            first ten snippets of each category are listed, and " ..." marks
            a truncated list. Categories whose joined sample is empty are
            skipped.
        enforce_templates: Include the expected opening/closing phrases.

    Returns:
        The assembled prompt, with the template section before the hint section.
    """
    pieces = [
        "以下は日本語原稿です。日本語の自然さ/表記統一/専門用語の正規化/必要に応じた定型句の補完を行い、"
        "必要最小限の修正にとどめてください。\n"
    ]

    if enforce_templates:
        pieces.append(
            f"\n[開始定型句（期待）]: {start_prefix}\n[締め定型句（期待）]: {closing}\n"
        )

    if suspects:
        hint_lines = ["[事前ヒント: 機械抽出した要確認候補]"]
        for category, found in suspects.items():
            preview = ", ".join(found[:10])
            if not preview:
                continue
            suffix = " ..." if len(found) > 10 else ""
            hint_lines.append(f"- {category}: {preview}{suffix}")
        pieces.append("\n" + "\n".join(hint_lines) + "\n")

    pieces.append("\n【原稿】\n")
    pieces.append(text)
    return "".join(pieces)

def pre_scan(text: str) -> Dict[str, List[str]]:
    """Collect cheap regex-based hints about likely problems in *text*.

    The result is only used as a hint list for the model; the patterns are
    heuristics and may produce false positives.

    Args:
        text: Manuscript text to scan.

    Returns:
        Mapping of category name to matched snippets. Categories with no hit
        are omitted. "halfwidth_katakana" and "zenhan_mixed" hold a single
        fixed marker string rather than the matched text.
    """
    suspects: Dict[str, List[str]] = {
        "mixed_kana_english": [],
        "halfwidth_katakana": [],
        "punct_marks": [],
        "all_caps": [],
        "zenhan_mixed": [],
        "ra_nuki": [],
        "i_nuki": [],
        "niju_keigo": [],
        "kasane_kotoba": [],
    }

    # Katakana directly followed or preceded by ASCII letters in one run.
    for m in re.finditer(r"(?:[ァ-ヴー]+[A-Za-z]+|[A-Za-z]+[ァ-ヴー]+)", text):
        suspects["mixed_kana_english"].append(m.group(0))

    # Any half-width katakana character.
    if re.search(r"[ｦ-ﾟ]", text):
        suspects["halfwidth_katakana"].append("半角ｶﾀｶﾅあり")

    # Runs of three or more exclamation/question marks (half- or full-width).
    for m in re.finditer(r"[!?！？]{3,}", text):
        suspects["punct_marks"].append(m.group(0))

    # re.ASCII is required: for str patterns `\b` is Unicode-aware by default,
    # and kana/kanji count as word characters, so an acronym written directly
    # next to Japanese text (no spaces) would never sit on a word boundary and
    # would go undetected.
    for m in re.finditer(r"\b[A-Z]{2,}(?:[0-9]+)?\b", text, re.ASCII):
        tok = m.group(0)
        if tok not in WHITELIST_UPPER:
            suspects["all_caps"].append(tok)

    # Full-width and half-width alphanumerics both present somewhere in the text.
    if re.search(r"[Ａ-Ｚａ-ｚ０-９]", text) and re.search(r"[A-Za-z0-9]", text):
        suspects["zenhan_mixed"].append("全角/半角が混在")

    # Candidate "ra-nuki" verb forms (kanji stem + れる / れない).
    for m in re.finditer(r"(見|来|着|起|出|入|寝|乗|降|読|書|食|飲|使|買|売|言|行|聞|待|払|押|変|決)れ(る|ない)", text):
        suspects["ra_nuki"].append(m.group(0))

    # Candidate "i-nuki" contractions.
    for m in re.finditer(r"(してる|なってる|いってる|きてる|できてる|思ってる)", text):
        suspects["i_nuki"].append(m.group(0))

    # Candidate double-honorific expressions.
    for m in re.finditer(r"(おっしゃられ|ご覧になられ|お伺いさせて|拝見させて|ご説明させて)", text):
        suspects["niju_keigo"].append(m.group(0))

    # Candidate redundant (tautological) expressions.
    for m in re.finditer(r"(違和感を感じ|頭痛が痛|後で後悔|過半数の半分|十分満足|一番最初)", text):
        suspects["kasane_kotoba"].append(m.group(0))

    return {k: v for k, v in suspects.items() if v}

def _extract_json_loose(s: str) -> Dict[str, Any]:
    start = s.find('{')
    end = s.rfind('}')
    if start == -1 or end == -1 or end <= start:
        raise ValueError("JSONを抽出できませんでした")
    snippet = s[start:end+1]
    return json.loads(snippet)

def call_chat(api_base: str, api_key: str, model: str, system_prompt: str, user_prompt: str,
              temperature: float=0.2, max_tokens: int=2048, timeout: int=120,
              use_response_format: bool=True) -> Dict[str, Any]:
    """POST one chat-completion request and return the model's JSON reply.

    Args:
        api_base: Base URL of the API; "/chat/completions" is appended.
        api_key: Bearer token; no Authorization header is sent when empty.
        model: Model identifier placed in the request payload.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        temperature: Sampling temperature placed in the payload.
        max_tokens: Completion token limit placed in the payload.
        timeout: Timeout passed to the HTTP request.
        use_response_format: When True, send
            response_format={"type": "json_object"}.

    Returns:
        The JSON object decoded from the first choice's message content.

    Raises:
        RuntimeError: On an HTTP error status (with the first 500 characters
            of the response body) or when the response carries no usable
            message content.
        ValueError: When the content cannot be decoded into a JSON object.
    """
    url = api_base.rstrip("/") + "/chat/completions"
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    payload: Dict[str, Any] = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    if use_response_format:
        payload["response_format"] = {"type": "json_object"}

    # Use the session as a context manager so its connections are released
    # after every call instead of being left to garbage collection.
    with requests.Session() as session:
        resp = session.post(url, headers=headers, data=json.dumps(payload), timeout=timeout)
        try:
            resp.raise_for_status()
        except requests.HTTPError as e:
            raise RuntimeError(f"{e} | details: {resp.text[:500]}") from e
        data = resp.json()

    # Walk the expected structure defensively: an empty "choices" list, a null
    # "message" or a non-dict body all mean "unexpected response".
    try:
        content = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        content = None
    if not content or not isinstance(content, str):
        raise RuntimeError(f"Unexpected API response: {data}")

    try:
        parsed = json.loads(content)
    except ValueError:
        # The model wrapped the JSON in extra text (e.g. a code fence).
        parsed = _extract_json_loose(content)
    if not isinstance(parsed, dict):
        raise ValueError("LLM response content is not a JSON object")
    return parsed

def process_file(path: Path, out_dir: Path, log: Path,
                 api_base: str, api_key: str, model: str,
                 dry_run: bool, start_prefix: str, closing: str,
                 use_response_format: bool, strict: bool, only_problem: bool,
                 enforce_templates: bool, tone: str, notes: bool,
                 retries: int=3) -> Tuple[int, int]:
    """Review one manuscript file with the LLM and write the result files.

    Args:
        path: Input file (read as UTF-8).
        out_dir: Directory for <stem>.review.json, <stem>.diff.txt, the fixed
            text (same file name as the input) and optionally <stem>.review.md.
        log: Log file; a section for this file is appended.
        api_base, api_key, model: Forwarded to call_chat().
        dry_run: When True, the fixed text itself is not written.
        start_prefix, closing: Expected stock phrases for the user prompt.
        use_response_format: Forwarded to call_chat().
        strict: Require a "checks" array and treat any check with
            passed == False as a problem.
        only_problem: When True and nothing was found, only the log is
            written.
        enforce_templates: Forwarded to the prompt builders.
        tone: Forwarded to build_system_prompt().
        notes: Also write a Markdown checklist of the issues.
        retries: Maximum number of request attempts (at least one is made).

    Returns:
        (1, number_of_issues); (1, 0) for a clean file in only_problem mode.

    Raises:
        RuntimeError: When no valid response was obtained within the allowed
            attempts.
    """
    raw = path.read_text(encoding="utf-8")
    suspects = pre_scan(raw)
    system_prompt = build_system_prompt(strict, enforce_templates, tone)
    user_prompt = build_user_prompt(raw, start_prefix, closing, suspects, enforce_templates)

    result: Optional[Dict[str, Any]] = None
    last_err: Optional[Exception] = None

    attempts = max(1, retries)
    for attempt in range(attempts):
        try:
            candidate = call_chat(api_base, api_key, model, system_prompt, user_prompt,
                                  use_response_format=use_response_format)
            if strict and "checks" not in candidate:
                raise ValueError("strictモード: JSONに 'checks' がありません")
            if "issues" not in candidate or "fixed_text" not in candidate:
                raise ValueError("JSONに必要なキー(issues, fixed_text)がありません")
            if (not isinstance(candidate["fixed_text"], str)
                    or not isinstance(candidate["issues"] or [], list)):
                raise ValueError("JSONの issues / fixed_text の型が不正です")
            # Only a response that passed every check may become the result;
            # otherwise a rejected response would be used once retries run out.
            result = candidate
            break
        except Exception as e:
            last_err = e
            # No point in waiting after the final attempt.
            if attempt + 1 < attempts:
                time.sleep(1.0)

    if result is None:
        raise RuntimeError(f"LLM応答エラー: {last_err}")

    # Model output is untrusted: wrap non-dict issue entries so the .get()
    # calls below cannot fail, while keeping the issue count unchanged.
    issues: List[Dict[str, Any]] = [
        it if isinstance(it, dict) else {"explanation": str(it)}
        for it in (result.get("issues") or [])
    ]
    fixed_text: str = result["fixed_text"]

    raw_checks = result.get("checks")
    checks: List[Dict[str, Any]] = (
        [c for c in raw_checks if isinstance(c, dict)] if isinstance(raw_checks, list) else []
    )

    has_fail = strict and any(c.get("passed") is False for c in checks)
    had_problem = bool(issues) or has_fail

    prescan_line = ""
    if suspects:
        counts = {k: len(v) for k, v in suspects.items()}
        prescan_line = f"- prescan: {counts}\n"

    if only_problem and not had_problem:
        with log.open("a", encoding="utf-8") as lf:
            lf.write(f"\n## {path.name}\n")
            lf.write(prescan_line)
            lf.write("- clean: no issues found\n")
        return 1, 0

    out_dir.mkdir(parents=True, exist_ok=True)

    review_json = out_dir / f"{path.stem}.review.json"
    review_json.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")

    diff_text = "".join(difflib.unified_diff(
        raw.splitlines(keepends=True),
        fixed_text.splitlines(keepends=True),
        fromfile=f"{path.name}:original",
        tofile=f"{path.name}:fixed",
        n=2
    ))
    (out_dir / f"{path.stem}.diff.txt").write_text(diff_text, encoding="utf-8")

    if not dry_run:
        (out_dir / path.name).write_text(fixed_text, encoding="utf-8")

    if notes:
        rows = []
        for it in issues:
            # Escape pipes so the text cannot break the Markdown table cells.
            before = str(it.get('before') or it.get('original') or '').replace('|', '\\|')
            after = str(it.get('after') or it.get('suggestion') or '').replace('|', '\\|')
            rows.append(f"| [ ] | {it.get('line','-')} | {it.get('category', it.get('type','-'))} | {before} | {after} |")
        md = (
            "# Review Notes\n\n"
            f"- File: **{path.name}**\n"
            f"- Issues: **{len(issues)}**\n\n"
            "| Done | Line | Category | Before | After |\n|---|---:|---|---|---|\n"
            + "\n".join(rows) + "\n"
        )
        (out_dir / f"{path.stem}.review.md").write_text(md, encoding="utf-8")

    with log.open("a", encoding="utf-8") as lf:
        lf.write(f"\n## {path.name}\n")
        lf.write(prescan_line)
        lf.write(f"- issues: {len(issues)}\n")
        if strict and "checks" in result:
            passes = sum(1 for c in checks if c.get("passed") is True)
            fails = sum(1 for c in checks if c.get("passed") is False)
            lf.write(f"- checks: pass={passes}, fail={fails}\n")
        for it in issues:
            line = it.get("line", "-")
            cat = it.get("category", it.get("type", "-"))
            bf = it.get("before", it.get("original", ""))
            af = it.get("after", it.get("suggestion", ""))
            exp = it.get("explanation", "")
            lf.write(f"  - line={line} cat={cat} '{bf}' -> '{af}' | {exp}\n")

    return 1, len(issues)

def main() -> None:
    """Command-line entry point: review every matching file in the input directory.

    Parses the CLI options, resets the log file, collects the files whose
    names match the requested extensions and runs process_file() on each one.
    A failure on one file is logged and reported on stderr, and processing
    continues with the next file. Exits with status 1 when the input
    directory does not exist and with status 0 when no file matches.
    """
    ap = argparse.ArgumentParser(description="LLM二度見レビュー（ローカル/LM Studio対応）")
    ap.add_argument("--in", dest="indir", default="scripts", help="入力ディレクトリ（*.txt / *.md）")
    ap.add_argument("--out", dest="outdir", default="reviewed", help="出力先ディレクトリ")
    ap.add_argument("--log", dest="logfile", default="logs/review.log", help="ログファイル")
    ap.add_argument("--ext", default=".txt,.md", help="対象拡張子カンマ区切り（例: .txt,.md）")
    ap.add_argument("--provider", choices=["openai","lmstudio","ollama"], default="lmstudio")
    ap.add_argument("--base_url", default=None, help="APIベースURL（未指定ならprovider既定値）")
    ap.add_argument("--model", default="gpt-oss-20b", help="使用モデル名（LM StudioのモデルIDに合わせる）")
    ap.add_argument("--api_key", default=None, help="APIキー（OPENAI_API_KEY が既定）")
    ap.add_argument("--start_prefix", default=DEFAULT_START_PREFIX, help="開始定型句（必要時のみ使用）")
    ap.add_argument("--closing", default=DEFAULT_CLOSING, help="締め定型句（必要時のみ使用）")
    ap.add_argument("--tone", choices=["formal","casual"], default="formal", help="文体トーン")
    ap.add_argument("--dry-run", action="store_true", help="修正本文を書き出さない（JSON/diffのみ）")
    ap.add_argument("--no-jsonfmt", action="store_true", help="response_format=json_object を送らない（LM Studioでの400回避用）")
    ap.add_argument("--strict", action="store_true", help="各カテゴリの pass/fail を 'checks' で可視化")
    ap.add_argument("--only-problem", action="store_true", help="問題があるファイルだけ出力（クリーンはログのみ）")
    ap.add_argument("--no-templates", action="store_true", help="開始/締めの定型句チェックを無効化")
    ap.add_argument("--notes", action="store_true", help="問題があった場合に <name>.review.md を併せて出力")
    ap.add_argument("--retries", type=int, default=3, help="JSON整形や軽微な応答エラーのリトライ回数")
    args = ap.parse_args()

    indir = Path(args.indir)
    outdir = Path(args.outdir)
    logfile = Path(args.logfile)

    # Validate the input directory before touching the log, so that a mistyped
    # --in does not wipe the log of the previous run.
    if not indir.exists():
        print(f"[error] 入力ディレクトリが存在しません: {indir}", file=sys.stderr)
        sys.exit(1)

    logfile.parent.mkdir(parents=True, exist_ok=True)
    logfile.write_text("# llm double-take review log\n", encoding="utf-8")

    if args.base_url is None:
        # argparse `choices` guarantees the provider is one of these keys.
        default_base_urls = {
            "openai": "https://api.openai.com/v1",
            "lmstudio": "http://localhost:1234/v1",
            "ollama": "http://localhost:11434/v1",
        }
        args.base_url = default_base_urls[args.provider]

    api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "")

    exts = [e.strip() for e in args.ext.split(",") if e.strip().startswith(".")]
    files: List[Path] = []
    seen = set()
    for ext in exts:
        for candidate in sorted(indir.glob(f"*{ext}")):
            # Skip directories that happen to match, and files already queued
            # through an overlapping extension (e.g. ".md" and ".review.md").
            if candidate in seen or not candidate.is_file():
                continue
            seen.add(candidate)
            files.append(candidate)

    if not files:
        print(f"[info] 対象ファイルが見つかりません: {indir} / ext={exts}")
        sys.exit(0)

    total_files = 0
    total_issues = 0
    for p in files:
        try:
            f, n = process_file(
                p, outdir, logfile,
                api_base=args.base_url, api_key=api_key, model=args.model,
                dry_run=args.dry_run, start_prefix=args.start_prefix, closing=args.closing,
                use_response_format=not args.no_jsonfmt, strict=args.strict, only_problem=args.only_problem,
                enforce_templates=not args.no_templates, tone=args.tone, notes=args.notes,
                retries=max(1, args.retries)
            )
            total_files += f
            total_issues += n
        except Exception as e:
            # Per-file boundary: record the failure and keep going.
            with logfile.open("a", encoding="utf-8") as lf:
                lf.write(f"\n## {p.name}\n- ERROR: {e}\n")
            print(f"[warn] {p.name}: {e}", file=sys.stderr)

    print(f"[done] files={total_files}, issues={total_issues}")
    print(f" -> reviewed dir: {outdir} ; log: {logfile}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
