feat: 增强规则资产管理与审计页面运行时调试

后端新增规则资产版本管理和规则文件 CRUD 接口，优化风险规则生成模板执行和员工数据模型字段，知识库 RAG 增强本地回退和文档提取能力，清理旧风险规则文件统一由生成引擎管理，前端审计页面增加运行时调试面板和规则资产编辑交互，补充单元测试覆盖。
2026-05-24 21:44:17 +08:00
parent 575f093c74
commit 50b1c3f9a9
113 changed files with 13896 additions and 5044 deletions
--- a/server/src/app/services/user_agent_knowledge_helpers.py
+++ b/server/src/app/services/user_agent_knowledge_helpers.py
@@ -15,6 +15,20 @@ from app.services.user_agent_knowledge_constants import (


 class UserAgentKnowledgeHelpersMixin:
+    GENERIC_KNOWLEDGE_TITLE_TERMS = {"远光软件", "股份有限", "有限公司"}
+    KNOWLEDGE_QUERY_ANCHOR_TERMS = (
+        "财务基础知识手册",
+        "基础知识手册",
+        "会计科目",
+        "常用会计科目",
+        "财务报表",
+        "主要税种",
+        "税种",
+        "标准",
+        "清单",
+        "明细",
+        "流程",
+    )

    @staticmethod
    def _select_knowledge_model_hits(
@@ -26,7 +40,7 @@ class UserAgentKnowledgeHelpersMixin:
            item
            for item in list(tool_payload.get("hits") or [])
            if isinstance(item, dict)
-        ][: max(MAX_KNOWLEDGE_MODEL_HITS + 1, 6)]
+        ][: max(MAX_KNOWLEDGE_MODEL_HITS + 3, 8)]
        if not raw_hits:
            return []

@@ -64,7 +78,16 @@ class UserAgentKnowledgeHelpersMixin:
        matched_terms = [term for term in query_terms if term in haystack]
        score = max(1, 48 - rank_index * 4)
        score += len(matched_terms) * 10
+        score += sum(max(0, len(term) - 4) * 8 for term in matched_terms)
        score += sum(1 for term in matched_terms if term in title) * 8
+        score += sum(max(0, len(term) - 4) * 6 for term in matched_terms if term in title)
+        score += sum(
+            (len(term) - 3) * 10
+            for term in matched_terms
+            if len(term) >= 4
+            and term in title
+            and term not in UserAgentKnowledgeHelpersMixin.GENERIC_KNOWLEDGE_TITLE_TERMS
+        )

        leading_marker = UserAgentKnowledgeHelpersMixin._leading_knowledge_appendix_marker(content)
        if leading_marker == "# 章节导航":
@@ -149,6 +172,40 @@ class UserAgentKnowledgeHelpersMixin:
        return ""


+    @staticmethod
+    def _knowledge_list_marker_sort_key(content: str) -> int:
+        """Return the ordinal of a leading ``（N）``/``(N)`` marker, else 999.
+
+        ``N`` may be ASCII digits or a Chinese numeral written with 零-九, 十
+        and 百 (``十二``, ``一百零五``). 999 is the fallback for content with no
+        leading marker or with a marker that is not a well-formed number.
+        """
+        match = re.match(r"^[（(]([一二三四五六七八九十百零0-9]+)[)）]", str(content or "").strip())
+        if not match:
+            return 999
+        marker = match.group(1)
+        if marker.isdigit():
+            return int(marker)
+
+        digits = {glyph: value for value, glyph in enumerate("零一二三四五六七八九")}
+        units = {"十": 10, "百": 100}
+        total, pending, last_unit = 0, None, 1000
+        for char in marker:
+            if char == "零" and pending is None and total:
+                continue  # placeholder zero between positions, as in 一百零五
+            if char in units:
+                # Units must shrink left to right; a bare unit counts once (十二).
+                if units[char] >= last_unit or pending == 0:
+                    return 999
+                total += (1 if pending is None else pending) * units[char]
+                pending, last_unit = None, units[char]
+            elif char in digits and pending is None:
+                pending = digits[char]
+            else:
+                return 999  # mixed ASCII/Chinese digits or two digits in a row
+        return total + (pending or 0)
+
+

    @staticmethod
    def _format_knowledge_heading_label(heading: str) -> str:
@@ -156,6 +213,169 @@ class UserAgentKnowledgeHelpersMixin:
        return " / ".join(parts)


+    @staticmethod
+    def _has_inline_numbered_knowledge_items(content: str) -> bool:
+        """Tell whether ``content`` holds two or more ``（N）``/``(N)`` markers."""
+        marker_pattern = r"[（(][一二三四五六七八九十百零0-9]+[)）]"
+        marker_count = 0
+        for _ in re.finditer(marker_pattern, str(content or "")):
+            marker_count += 1
+        return marker_count >= 2
+
+
+    @staticmethod
+    def _split_inline_numbered_knowledge_items(content: str) -> list[str]:
+        """Split ``（一）…（二）…`` running text into one entry per numbered item.
+
+        Non-empty text before the first marker leads the list; content with fewer
+        than two markers is returned as a single entry (or ``[]`` when blank).
+        """
+        text = str(content or "").strip()
+        marker = r"[（(][一二三四五六七八九十百零0-9]+[)）]"
+        opening = re.search(marker, text)
+        has_items = UserAgentKnowledgeHelpersMixin._has_inline_numbered_knowledge_items(text)
+        if opening is None or not has_items:
+            return [text] if text else []
+        lead_in = text[: opening.start()].strip(" ：:")
+        item_pattern = (
+            r"([（(][一二三四五六七八九十百零0-9]+[)）]\s*.*?"
+            r"(?=\s*[（(][一二三四五六七八九十百零0-9]+[)）]|\s*$))"
+        )
+        found = re.findall(item_pattern, text[opening.start() :].strip())
+        pieces = [piece.strip() for piece in found if piece.strip()]
+        return [lead_in, *pieces] if lead_in else pieces or [text]
+
+
+    @staticmethod
+    def _focus_knowledge_segment_content(content: str, query_terms: list[str]) -> str:
+        """Trim ``content`` to at most 700 characters starting near a query term.
+
+        Whitespace runs are collapsed first. Terms of three or more characters
+        are tried longest first (ties keep ``query_terms`` order, so the result
+        is reproducible); matching ignores case. When a section or list marker
+        sits within the 40 characters before the hit, the excerpt starts at the
+        nearest such marker. Without any hit the collapsed text is returned whole.
+        """
+        normalized = re.sub(r"\s+", " ", str(content or "").strip())
+        if not normalized:
+            return ""
+
+        # dict.fromkeys de-duplicates in input order; the stable sort then keeps
+        # that order among equal lengths (a set would tie-break by hash seed).
+        candidates = dict.fromkeys(str(term or "").strip() for term in query_terms)
+        anchor_terms = sorted(
+            (term for term in candidates if len(term) >= 3), key=len, reverse=True
+        )
+        lowered = normalized.lower()
+        hits = (lowered.find(term.lower()) for term in anchor_terms)
+        anchor_index = next((hit for hit in hits if hit >= 0), -1)
+        if anchor_index < 0:
+            return normalized
+
+        window_start = max(0, anchor_index - 40)
+        start = anchor_index
+        for marker in re.finditer(
+            r"(?:第[一二三四五六七八九十百零0-9]+[部分章节条]|[一二三四五六七八九十]+、|[（(][一二三四五六七八九十百零0-9]+[)）])",
+            normalized[window_start:anchor_index],
+        ):
+            start = window_start + marker.start()  # the last marker wins
+
+        return normalized[start : start + 700].strip()
+
+
+    @staticmethod
+    def _split_markdown_table_cells(line: str) -> list[str]:
+        """Split one Markdown table row into trimmed cell texts.
+
+        One leading and one trailing pipe are dropped, ``**`` bold markers are
+        removed and whitespace runs inside a cell collapse to a single space.
+        """
+        row = str(line or "").strip()
+        row = row[1:] if row.startswith("|") else row
+        row = row[:-1] if row.endswith("|") else row
+        return [re.sub(r"\s+", " ", cell.replace("**", "").strip()) for cell in row.split("|")]
+
+
+    @classmethod
+    def _summarize_knowledge_table_preview(cls, preview: str) -> str:
+        """Write a one-sentence lead-in for a Markdown table preview.
+
+        A preview holding exactly one data row as wide as the header is spelled
+        out as ``label：value`` pairs; anything else gets a generic sentence.
+        """
+        rows: list[list[str]] = []
+        for line in str(preview or "").splitlines():
+            if line.count("|") >= 2:
+                cells = cls._split_markdown_table_cells(line)
+                # Drop the ``---|:--:`` alignment row; keep header and data rows.
+                if not all(re.fullmatch(r":?-{2,}:?", cell.replace(" ", "")) for cell in cells):
+                    rows.append(cells)
+        if len(rows) < 2:
+            return "可直接参考的标准表如下。"
+
+        header, *body = rows
+        data_rows = [row for row in body if len(row) == len(header)]
+        is_single_row = len(data_rows) == 1 and len(header) >= 2
+        pairs = [
+            f"{label}：{value}"
+            for label, value in zip(header[1:], data_rows[0][1:] if is_single_row else [])
+            if label and value and value not in {"-", "—"}
+        ]
+        if not pairs:
+            return "相关标准项如下，请按表头和行内容对应使用。"
+        return f"{data_rows[0][0] or '该项目'}的标准为：{'；'.join(pairs)}。"
+
+
+    def _summarize_knowledge_lines_conclusion(
+        self,
+        lines: list[str],
+        *,
+        heading: str = "",
+    ) -> str:
+        """Condense cleaned evidence lines into a one-sentence conclusion.
+
+        With a heading (given, or taken from a colon-free first line) and two
+        or more ``label：value`` lines, up to six labels are enumerated. Failing
+        that, the first labelled line is echoed as ``label：value``, then the
+        first cleaned line. Returns "" when nothing survives cleaning.
+        """
+        cleaned = (self._clean_knowledge_segment_text(line) for line in lines)
+        clean_lines = [line for line in cleaned if line]
+        if not clean_lines:
+            return ""
+
+        clean_heading = str(heading or "").strip()
+        if not clean_heading and not re.search(r"[：:]", clean_lines[0]):
+            clean_heading = clean_lines[0]
+        clean_heading = re.sub(r"^[一二三四五六七八九十百零0-9]+、\s*", "", clean_heading)
+        # Keep each label with its own value so the single-item answer is
+        # built from the line the label came from, whichever colon it uses.
+        labelled: list[tuple[str, str]] = []
+        for line in clean_lines:
+            parts = re.split(r"[：:]", line, maxsplit=1)
+            if len(parts) == 2 and 1 <= len(parts[0].strip()) <= 24:
+                labelled.append((parts[0].strip(), parts[1].strip()))
+
+        if clean_heading and len(labelled) >= 2:
+            return f"{clean_heading}包括：{'、'.join(label for label, _ in labelled[:6])}。"
+        if labelled:
+            return "：".join(labelled[0])
+        return clean_lines[0]
+
+
+    @staticmethod
+    def _knowledge_lines_have_multiple_labeled_items(lines: list[str]) -> bool:
+        """Report whether at least two lines carry a 1-24 character label."""
+        labelled = 0
+        for raw_line in lines:
+            # The label is whatever precedes the first full- or half-width
+            # colon; a line without any colon yields no ``rest`` and is skipped.
+            label, *rest = re.split(r"[：:]", str(raw_line or "").strip(), maxsplit=1)
+            if rest and 1 <= len(label.strip()) <= 24:
+                labelled += 1
+        return labelled >= 2
+
+

    def _score_knowledge_evidence_candidate(
        self,
@@ -169,10 +389,14 @@ class UserAgentKnowledgeHelpersMixin:

        matched_terms = [term for term in query_terms if term in haystack]
        score = len(matched_terms) * 10
+        score += sum(max(0, len(term) - 4) * 8 for term in matched_terms)
        score += sum(1 for term in matched_terms if term in heading) * 6
+        score += sum(max(0, len(term) - 4) * 6 for term in matched_terms if term in heading)

        if kind == "table":
            score += 10
+            if content.count("\n") < 2:
+                score -= 24
        elif kind in {"kv", "clause", "list"}:
            score += 8
        elif kind == "paragraph":
@@ -220,6 +444,30 @@ class UserAgentKnowledgeHelpersMixin:
            remember(item)

        for block in re.findall(r"[\u4e00-\u9fff]{2,20}", normalized_question):
+            remember(block)
+            if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
+                return terms
+            for marker in ("标准", "金额", "限额", "额度"):
+                marker_index = block.find(marker)
+                if marker_index <= 0:
+                    continue
+                subject = block[:marker_index]
+                for width in (6, 4, 3, 2):
+                    remember(subject[-width:])
+            for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS:
+                if anchor in block:
+                    remember(anchor)
+            tail = block[-14:]
+            for size in (8, 7, 6, 5, 4):
+                for start in range(0, len(tail) - size + 1):
+                    piece = tail[start : start + size]
+                    if any(
+                        anchor in piece
+                        for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS
+                    ):
+                        remember(piece)
+                        if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
+                            return terms
            if len(block) <= 4:
                remember(block)
                continue
@@ -276,7 +524,14 @@ class UserAgentKnowledgeHelpersMixin:


    @staticmethod
-    def _extract_relevant_table_preview(content: str, query_terms: list[str]) -> str:
+    def _extract_relevant_table_preview(
+        content: str,
+        query_terms: list[str],
+        *,
+        preferred_terms: list[str] | None = None,
+        max_rows: int = 3,
+        fallback_rows: int = 2,
+    ) -> str:
        lines = [line.strip() for line in str(content or "").splitlines() if line.strip()]
        if len(lines) <= 3:
            return "\n".join(lines)
@@ -285,12 +540,39 @@ class UserAgentKnowledgeHelpersMixin:
        divider = lines[1] if len(lines) > 1 else ""
        body = lines[2:] if divider.count("|") >= 2 else lines[1:]

+        preferred = [
+            str(term or "").strip().lower()
+            for term in list(preferred_terms or [])
+            if str(term or "").strip()
+        ]
+        base_terms = preferred + [
+            str(term or "").strip().lower()
+            for term in query_terms
+            if str(term or "").strip().lower() not in preferred
+        ]
+        derived_terms: list[str] = []
+        for term in base_terms:
+            for marker in ("标准", "金额", "限额", "额度", "是多少"):
+                marker_index = term.find(marker)
+                if marker_index <= 0:
+                    continue
+                subject = term[:marker_index].strip()
+                if len(subject) < 2:
+                    continue
+                for width in (6, 4, 3, 2):
+                    derived_terms.append(subject[-width:])
+
+        search_terms: list[str] = []
+        for term in [*preferred, *derived_terms, *base_terms]:
+            if term and term not in search_terms:
+                search_terms.append(term)
+
        matched_rows = [
            row
            for row in body
-            if any(term in row.lower() for term in query_terms)
+            if any(term in row.lower() for term in search_terms)
        ]
-        selected_rows = matched_rows[:3] or body[:2]
+        selected_rows = matched_rows[:max_rows] or body[:fallback_rows]
        preview_lines = [header]
        if divider:
            preview_lines.append(divider)
@@ -298,6 +580,18 @@ class UserAgentKnowledgeHelpersMixin:
        return "\n".join(preview_lines).strip()


+    @staticmethod
+    def _question_requests_broad_knowledge_table(question: str) -> bool:
+        """Check a question for both a breadth cue and a table-like subject.
+
+        Cues are plain substrings; a word present in both tuples satisfies both.
+        """
+        text = str(question or "").strip()
+        breadth_cues = ("有哪些", "是什么", "介绍", "说明", "列表", "清单", "全部", "完整")
+        subject_cues = ("科目", "目录", "清单", "列表", "表", "明细")
+        return all(any(cue in text for cue in cues) for cues in (breadth_cues, subject_cues))
+
+

    @staticmethod
    def _question_requires_explicit_condition(question: str) -> bool: