# X-Financial/server/src/app/services/user_agent_knowledge_helpers.py

from __future__ import annotations

import re
from typing import Any

from app.services.user_agent_knowledge_constants import (
    KNOWLEDGE_ARTICLE_PATTERN,
    KNOWLEDGE_LIST_ITEM_PATTERN,
    KNOWLEDGE_NUMBERED_ITEM_PATTERN,
    KNOWLEDGE_QUERY_STOPWORDS,
    KNOWLEDGE_SECTION_HEADING_PATTERN,
    MAX_KNOWLEDGE_MODEL_HITS,
    MAX_KNOWLEDGE_QUERY_TERMS,
)


class UserAgentKnowledgeHelpersMixin:
    """Mixin with helpers for ranking, splitting and summarising knowledge hits.

    The methods work on plain dicts and strings: they score knowledge hits and
    evidence segments against query terms, split numbered or tabular text, and
    build short textual summaries.
    """

    # Title terms that do not earn the extra long-term title bonus in
    # `_score_knowledge_model_hit`.
    GENERIC_KNOWLEDGE_TITLE_TERMS = {"远光软件", "股份有限", "有限公司"}
    # Phrases that `_extract_knowledge_query_terms` records as query terms
    # whenever they occur inside a block of the question.
    KNOWLEDGE_QUERY_ANCHOR_TERMS = (
        "财务基础知识手册",
        "基础知识手册",
        "会计科目",
        "常用会计科目",
        "财务报表",
        "主要税种",
        "税种",
        "标准",
        "清单",
        "明细",
        "流程",
    )

    @staticmethod
    def _select_knowledge_model_hits(
        tool_payload: dict[str, Any],
        *,
        question: str | None = None,
    ) -> list[dict[str, Any]]:
        raw_hits = [
            item
            for item in list(tool_payload.get("hits") or [])
            if isinstance(item, dict)
        ][: max(MAX_KNOWLEDGE_MODEL_HITS + 3, 8)]
        if not raw_hits:
            return []

        query_terms = UserAgentKnowledgeHelpersMixin._extract_knowledge_query_terms(question or "")
        if not query_terms:
            return raw_hits[:MAX_KNOWLEDGE_MODEL_HITS]

        ranked_hits = sorted(
            enumerate(raw_hits),
            key=lambda value: (
                UserAgentKnowledgeHelpersMixin._score_knowledge_model_hit(
                    value[1],
                    query_terms=query_terms,
                    rank_index=value[0],
                ),
                -value[0],
            ),
            reverse=True,
        )
        return [item for _, item in ranked_hits[:MAX_KNOWLEDGE_MODEL_HITS]]


    @staticmethod
    def _score_knowledge_model_hit(
        item: dict[str, Any],
        *,
        query_terms: list[str],
        rank_index: int,
    ) -> int:
        title = str(item.get("title") or item.get("document_name") or "").lower()
        excerpt = str(item.get("excerpt") or "").lower()
        content = str(item.get("content") or "").lower()
        haystack = "\n".join([title, excerpt, content[:1400]])

        matched_terms = [term for term in query_terms if term in haystack]
        score = max(1, 48 - rank_index * 4)
        score += len(matched_terms) * 10
        score += sum(max(0, len(term) - 4) * 8 for term in matched_terms)
        score += sum(1 for term in matched_terms if term in title) * 8
        score += sum(max(0, len(term) - 4) * 6 for term in matched_terms if term in title)
        score += sum(
            (len(term) - 3) * 10
            for term in matched_terms
            if len(term) >= 4
            and term in title
            and term not in UserAgentKnowledgeHelpersMixin.GENERIC_KNOWLEDGE_TITLE_TERMS
        )

        leading_marker = UserAgentKnowledgeHelpersMixin._leading_knowledge_appendix_marker(content)
        if leading_marker == "# 章节导航":
            score -= 22
        elif leading_marker == "# 问答线索补充":
            score += 6 if matched_terms else -8
        elif leading_marker == "# 重点章节摘录":
            score += 4 if matched_terms else -4
        elif leading_marker == "# 结构化表格补充":
            score += 8 if matched_terms else -3

        if matched_terms and "|" in content:
            score += 8
        if matched_terms and any(marker in content for marker in ("：", ":")):
            score += 10
        if matched_terms and "\n" in content:
            score += 4
        if matched_terms and any(marker in content for marker in ("附表", "第", "条")):
            score += 4
        if matched_terms and any(marker in content for marker in ("第", "条", "：", "-", "•")):
            score += 4
        if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
            score -= 12
        return score


    @staticmethod
    def _leading_knowledge_appendix_marker(content: str) -> str:
        normalized = str(content or "").lstrip()
        for marker in ("# 章节导航", "# 重点章节摘录", "# 问答线索补充", "# 结构化表格补充"):
            index = normalized.find(marker)
            if 0 <= index <= 220:
                return marker
        return ""


    def _prioritize_knowledge_evidence_items(
        self,
        question: str,
        evidence_items: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        if not evidence_items or not self._question_requires_explicit_condition(question):
            return evidence_items

        for preferred_kind in ("table", "kv", "clause", "list"):
            for index, item in enumerate(evidence_items):
                if str(item.get("kind") or "") != preferred_kind:
                    continue
                return [item, *evidence_items[:index], *evidence_items[index + 1 :]]

        for index, item in enumerate(evidence_items):
            if re.search(r"\d", str(item.get("content") or "")):
                return [item, *evidence_items[:index], *evidence_items[index + 1 :]]

        return evidence_items


    @staticmethod
    def _is_knowledge_lead_in_segment(item: dict[str, str]) -> bool:
        kind = str(item.get("kind") or "").strip()
        content = str(item.get("content") or "").strip()
        return kind in {"kv", "list", "clause"} and content.endswith(("：", ":"))


    @staticmethod
    def _extract_knowledge_marker_family(content: str) -> str:
        normalized = str(content or "").strip()
        if not normalized:
            return ""
        if KNOWLEDGE_ARTICLE_PATTERN.match(normalized):
            return "article"
        if re.match(r"^\d+[.)、]\s*", normalized):
            return "arabic"
        if re.match(r"^[（(][一二三四五六七八九十百零0-9]+[)）]\s*", normalized):
            return "paren"
        if re.match(r"^[①②③④⑤⑥⑦⑧⑨⑩]\s*", normalized):
            return "circled"
        if KNOWLEDGE_LIST_ITEM_PATTERN.match(normalized):
            return "bullet"
        return ""


    @staticmethod
    def _knowledge_list_marker_sort_key(content: str) -> int:
        normalized = str(content or "").strip()
        match = re.match(r"^[（(]([一二三四五六七八九十百零0-9]+)[)）]", normalized)
        if not match:
            return 999
        marker = match.group(1)
        if marker.isdigit():
            return int(marker)
        values = {
            "零": 0,
            "一": 1,
            "二": 2,
            "三": 3,
            "四": 4,
            "五": 5,
            "六": 6,
            "七": 7,
            "八": 8,
            "九": 9,
            "十": 10,
        }
        if marker in values:
            return values[marker]
        if marker.startswith("十") and len(marker) == 2:
            return 10 + values.get(marker[1], 0)
        if marker.endswith("十") and len(marker) == 2:
            return values.get(marker[0], 0) * 10
        if "十" in marker:
            left, right = marker.split("十", 1)
            return values.get(left, 1) * 10 + values.get(right, 0)
        return 999


    @staticmethod
    def _format_knowledge_heading_label(heading: str) -> str:
        parts = [item.strip() for item in str(heading or "").split(">") if item.strip()]
        return " / ".join(parts)


    @staticmethod
    def _has_inline_numbered_knowledge_items(content: str) -> bool:
        return len(
            re.findall(
                r"[（(][一二三四五六七八九十百零0-9]+[)）]",
                str(content or ""),
            )
        ) >= 2


    @staticmethod
    def _split_inline_numbered_knowledge_items(content: str) -> list[str]:
        normalized = str(content or "").strip()
        if not UserAgentKnowledgeHelpersMixin._has_inline_numbered_knowledge_items(normalized):
            return [normalized] if normalized else []

        marker_pattern = r"[（(][一二三四五六七八九十百零0-9]+[)）]"
        first_marker = re.search(marker_pattern, normalized)
        if first_marker is None:
            return [normalized] if normalized else []

        prefix = normalized[: first_marker.start()].strip(" ：:")
        tail = normalized[first_marker.start() :].strip()
        item_pattern = (
            r"([（(][一二三四五六七八九十百零0-9]+[)）]\s*.*?"
            r"(?=\s*[（(][一二三四五六七八九十百零0-9]+[)）]|\s*$))"
        )
        items = [item.strip() for item in re.findall(item_pattern, tail) if item.strip()]
        if prefix:
            return [prefix, *items]
        return items or [normalized]


    @staticmethod
    def _focus_knowledge_segment_content(content: str, query_terms: list[str]) -> str:
        normalized = re.sub(r"\s+", " ", str(content or "").strip())
        if not normalized:
            return ""

        anchor_terms = sorted(
            {
                str(term or "").strip()
                for term in query_terms
                if len(str(term or "").strip()) >= 3
            },
            key=len,
            reverse=True,
        )
        anchor_index = -1
        for term in anchor_terms:
            anchor_index = normalized.lower().find(term.lower())
            if anchor_index >= 0:
                break
        if anchor_index < 0:
            return normalized

        prefix_window = normalized[max(0, anchor_index - 40) : anchor_index]
        marker_match = None
        for match in re.finditer(
            r"(?:第[一二三四五六七八九十百零0-9]+[部分章节条]|[一二三四五六七八九十]+、|[（(][一二三四五六七八九十百零0-9]+[)）])",
            prefix_window,
        ):
            marker_match = match
        start = anchor_index
        if marker_match is not None:
            start = max(0, anchor_index - len(prefix_window) + marker_match.start())

        return normalized[start : start + 700].strip()


    @staticmethod
    def _split_markdown_table_cells(line: str) -> list[str]:
        stripped = str(line or "").strip()
        if stripped.startswith("|"):
            stripped = stripped[1:]
        if stripped.endswith("|"):
            stripped = stripped[:-1]
        return [
            re.sub(r"\s+", " ", cell.replace("**", "").strip())
            for cell in stripped.split("|")
        ]


    @classmethod
    def _summarize_knowledge_table_preview(cls, preview: str) -> str:
        rows: list[list[str]] = []
        for line in str(preview or "").splitlines():
            if line.count("|") < 2:
                continue
            cells = cls._split_markdown_table_cells(line)
            if not cells or all(re.fullmatch(r":?-{2,}:?", cell.replace(" ", "")) for cell in cells):
                continue
            rows.append(cells)

        if len(rows) < 2:
            return "可直接参考的标准表如下。"

        header = rows[0]
        data_rows = [row for row in rows[1:] if len(row) == len(header)]
        if len(data_rows) == 1 and len(header) >= 2:
            row = data_rows[0]
            subject = row[0] or "该项目"
            pairs = [
                f"{label}：{value}"
                for label, value in zip(header[1:], row[1:])
                if label and value and value not in {"-", "—"}
            ]
            if pairs:
                return f"{subject}的标准为：{'；'.join(pairs)}。"

        return "相关标准项如下，请按表头和行内容对应使用。"


    def _summarize_knowledge_lines_conclusion(
        self,
        lines: list[str],
        *,
        heading: str = "",
    ) -> str:
        clean_lines = [
            self._clean_knowledge_segment_text(line)
            for line in lines
            if self._clean_knowledge_segment_text(line)
        ]
        if not clean_lines:
            return ""

        clean_heading = str(heading or "").strip()
        if not clean_heading and clean_lines and "：" not in clean_lines[0] and ":" not in clean_lines[0]:
            clean_heading = clean_lines[0]
        clean_heading = re.sub(
            r"^[一二三四五六七八九十百零0-9]+、\s*",
            "",
            clean_heading,
        )
        item_labels: list[str] = []
        for line in clean_lines:
            if "：" not in line and ":" not in line:
                continue
            label = re.split(r"[：:]", line, maxsplit=1)[0].strip()
            if 1 <= len(label) <= 24:
                item_labels.append(label)

        if clean_heading and len(item_labels) >= 2:
            return f"{clean_heading}包括：{'、'.join(item_labels[:6])}。"
        if item_labels:
            return f"{item_labels[0]}：{clean_lines[0].split('：', 1)[-1].strip()}"
        return clean_lines[0]


    @staticmethod
    def _knowledge_lines_have_multiple_labeled_items(lines: list[str]) -> bool:
        labeled_count = 0
        for line in lines:
            normalized = str(line or "").strip()
            if "：" not in normalized and ":" not in normalized:
                continue
            label = re.split(r"[：:]", normalized, maxsplit=1)[0].strip()
            if 1 <= len(label) <= 24:
                labeled_count += 1
        return labeled_count >= 2


    def _score_knowledge_evidence_candidate(
        self,
        item: dict[str, str],
        query_terms: list[str],
    ) -> int:
        heading = str(item.get("heading") or "").lower()
        content = str(item.get("content") or "").lower()
        kind = str(item.get("kind") or "").strip()
        haystack = "\n".join([heading, content])

        matched_terms = [term for term in query_terms if term in haystack]
        score = len(matched_terms) * 10
        score += sum(max(0, len(term) - 4) * 8 for term in matched_terms)
        score += sum(1 for term in matched_terms if term in heading) * 6
        score += sum(max(0, len(term) - 4) * 6 for term in matched_terms if term in heading)

        if kind == "table":
            score += 10
            if content.count("\n") < 2:
                score -= 24
        elif kind in {"kv", "clause", "list"}:
            score += 8
        elif kind == "paragraph":
            score += 4

        if "问答线索补充" in heading or "重点章节摘录" in heading:
            score += 8
        if "结构化表格补充" in heading:
            score += 10
        if "章节导航" in heading or "目录" in heading:
            score -= 16
        if re.search(r"[.。…]{6,}", content):
            score -= 12
        if any(hint in content for hint in ("应", "需", "不得", "可以", "标准", "条件", "材料", "审批", "流程", "包括")):
            score += 3

        content_length = len(content)
        if content_length > 220:
            score -= min(8, (content_length - 220) // 40)
        return score


    @staticmethod
    def _extract_knowledge_query_terms(question: str) -> list[str]:
        normalized_question = str(question or "").strip().lower()
        if not normalized_question:
            return []

        terms: list[str] = []
        seen: set[str] = set()

        def remember(term: str) -> None:
            normalized = str(term or "").strip().lower()
            if (
                not normalized
                or normalized in seen
                or normalized in KNOWLEDGE_QUERY_STOPWORDS
            ):
                return
            seen.add(normalized)
            terms.append(normalized)

        for item in re.findall(r"[a-z0-9][a-z0-9_\-]{1,}", normalized_question):
            remember(item)

        for block in re.findall(r"[\u4e00-\u9fff]{2,20}", normalized_question):
            remember(block)
            if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
                return terms
            for marker in ("标准", "金额", "限额", "额度"):
                marker_index = block.find(marker)
                if marker_index <= 0:
                    continue
                subject = block[:marker_index]
                for width in (6, 4, 3, 2):
                    remember(subject[-width:])
            for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS:
                if anchor in block:
                    remember(anchor)
            tail = block[-14:]
            for size in (8, 7, 6, 5, 4):
                for start in range(0, len(tail) - size + 1):
                    piece = tail[start : start + size]
                    if any(
                        anchor in piece
                        for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS
                    ):
                        remember(piece)
                        if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
                            return terms
            if len(block) <= 4:
                remember(block)
                continue
            for size in (4, 3, 2):
                for start in range(0, len(block) - size + 1):
                    remember(block[start : start + size])
                    if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
                        return terms

        return terms[:MAX_KNOWLEDGE_QUERY_TERMS]


    @staticmethod
    def _clean_knowledge_segment_text(content: str) -> str:
        normalized = str(content or "").strip()
        normalized = re.sub(r"^[-*•]\s*", "", normalized)
        normalized = re.sub(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*", "", normalized)
        normalized = re.sub(r"^[（(][一二三四五六七八九十百零0-9]+[)）]\s*", "", normalized)
        normalized = re.sub(r"\s+", " ", normalized)
        if len(normalized) <= 180:
            return normalized
        return f"{normalized[:177].rstrip()}..."


    @staticmethod
    def _normalize_knowledge_line(content: str, *, preserve_marker: bool) -> str:
        normalized = str(content or "").strip()
        normalized = re.sub(r"^[-*•]\s*", "", normalized)
        if not preserve_marker:
            normalized = re.sub(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*", "", normalized)
            normalized = re.sub(r"^[（(][一二三四五六七八九十百零0-9]+[)）]\s*", "", normalized)
        normalized = re.sub(r"\s+", " ", normalized)
        return normalized


    def _split_clean_knowledge_lines(
        self,
        content: str,
        *,
        preserve_marker: bool,
    ) -> list[str]:
        return [
            line
            for line in (
                self._normalize_knowledge_line(item, preserve_marker=preserve_marker)
                for item in str(content or "").splitlines()
            )
            if line
        ]


    @staticmethod
    def _extract_relevant_table_preview(
        content: str,
        query_terms: list[str],
        *,
        preferred_terms: list[str] | None = None,
        max_rows: int = 3,
        fallback_rows: int = 2,
    ) -> str:
        lines = [line.strip() for line in str(content or "").splitlines() if line.strip()]
        if len(lines) <= 3:
            return "\n".join(lines)

        header = lines[0]
        divider = lines[1] if len(lines) > 1 else ""
        body = lines[2:] if divider.count("|") >= 2 else lines[1:]

        preferred = [
            str(term or "").strip().lower()
            for term in list(preferred_terms or [])
            if str(term or "").strip()
        ]
        base_terms = preferred + [
            str(term or "").strip().lower()
            for term in query_terms
            if str(term or "").strip().lower() not in preferred
        ]
        derived_terms: list[str] = []
        for term in base_terms:
            for marker in ("标准", "金额", "限额", "额度", "是多少"):
                marker_index = term.find(marker)
                if marker_index <= 0:
                    continue
                subject = term[:marker_index].strip()
                if len(subject) < 2:
                    continue
                for width in (6, 4, 3, 2):
                    derived_terms.append(subject[-width:])

        search_terms: list[str] = []
        for term in [*preferred, *derived_terms, *base_terms]:
            if term and term not in search_terms:
                search_terms.append(term)

        matched_rows = [
            row
            for row in body
            if any(term in row.lower() for term in search_terms)
        ]
        selected_rows = matched_rows[:max_rows] or body[:fallback_rows]
        preview_lines = [header]
        if divider:
            preview_lines.append(divider)
        preview_lines.extend(selected_rows)
        return "\n".join(preview_lines).strip()


    @staticmethod
    def _question_requests_broad_knowledge_table(question: str) -> bool:
        normalized = str(question or "").strip()
        if not normalized:
            return False
        broad_hints = ("有哪些", "是什么", "介绍", "说明", "列表", "清单", "全部", "完整")
        table_subject_hints = ("科目", "目录", "清单", "列表", "表", "明细")
        return any(hint in normalized for hint in broad_hints) and any(
            hint in normalized for hint in table_subject_hints
        )


    @staticmethod
    def _question_requires_explicit_condition(question: str) -> bool:
        normalized = str(question or "").strip()
        return any(keyword in normalized for keyword in ("多少", "金额", "上限", "限额", "标准", "条件", "需要"))


    @staticmethod
    def _answer_evidence_has_numeric_or_condition(evidence_items: list[dict[str, Any]]) -> bool:
        for item in evidence_items:
            content = str(item.get("content") or "")
            if re.search(r"\d", content):
                return True
            if any(
                keyword in content
                for keyword in ("应", "需", "不得", "可以", "条件", "材料", "审批", "流程", "标准", "适用")
            ):
                return True
        return False