feat: 重构知识库系统，移除Hermes集成，增强RAG和同步功能

主要变更:
- 移除Hermes智能体及相关回调服务
- 新增知识库RAG、同步、调度、规范化和索引任务服务
- 重构orchestrator服务，增强运行时聊天功能
- 更新前端聊天、政策制度、设置等页面样式和逻辑
- 更新expense_claims和document_intelligence服务
- 删除llm_wiki相关服务和测试文件
- 更新docker-compose配置和启动脚本
2026-05-17 08:38:41 +00:00
parent 212c935308
commit 68f663f2f4
308 changed files with 83729 additions and 13588 deletions
--- a/server/src/app/services/knowledge_normalizer.py
+++ b/server/src/app/services/knowledge_normalizer.py
@@ -0,0 +1,414 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+from sqlalchemy.orm import Session
+
+from app.core.logging import get_logger
+from app.services.runtime_chat import RuntimeChatService
+
+# Module-level logger for this service.
+logger = get_logger("app.services.knowledge_normalizer")
+
+# A table marker: the character "表", optional whitespace, then a number (captured).
+TABLE_MARKER_PATTERN = re.compile(r"表\s*(\d+)")
+# A whole line shaped like a heading: "第N章/节 ...", "一、...", "（一）..." or "(一)...".
+SECTION_HEADING_PATTERN = re.compile(
+    r"^(第[一二三四五六七八九十百零0-9]+[章节]\s*.*|[一二三四五六七八九十]+、.*|（[一二三四五六七八九十]+）.*|\([一二三四五六七八九十]+\).*)$"
+)
+# A bullet line introduced by "-", "*" or "•" followed by whitespace.
+LIST_ITEM_PATTERN = re.compile(r"^[-*•]\s+.+$")
+# A numbered line: digits followed by ".", ")" or "、", or a circled number ①-⑩.
+NUMBERED_ITEM_PATTERN = re.compile(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*.+$")
+# A line that starts with an article number such as "第三条" (captured).
+ARTICLE_PATTERN = re.compile(r"^(第[一二三四五六七八九十百零0-9]+条)\s*.*$")
+# A "key: value" line whose key is at most 41 characters and contains no colon.
+KEY_VALUE_PATTERN = re.compile(r"^[^：:\s][^：:]{0,40}[：:]\s*.+$")
+# Size, in characters, of the text window cut out for one table candidate.
+MAX_TABLE_WINDOW_CHARS = 1800
+# Upper bound on table candidates collected from one document.
+MAX_TABLES_PER_DOCUMENT = 8
+# Upper bound on section titles listed in the outline.
+MAX_SECTION_OUTLINE_ITEMS = 12
+# Upper bound on sections whose excerpt is rendered.
+MAX_SECTION_SNIPPETS = 8
+# Maximum length of one section excerpt, including the trailing "...".
+MAX_SECTION_SNIPPET_CHARS = 220
+# Upper bound on fact-like clues taken from a single section.
+MAX_SECTION_QA_CLUES = 4
+# Upper bound on clues kept for the whole document.
+MAX_TOTAL_QA_CLUES = 24
+# Maximum length of one clue, including the trailing "...".
+MAX_QA_CLUE_CHARS = 180
+# Substrings whose presence makes a line count as fact-like.
+FACT_KEYWORDS = (
+    "适用",
+    "标准",
+    "条件",
+    "流程",
+    "审批",
+    "提交",
+    "附件",
+    "材料",
+    "票据",
+    "报销",
+    "限额",
+    "金额",
+    "比例",
+    "范围",
+    "对象",
+    "人员",
+    "时限",
+    "工作日",
+    "不得",
+    "可以",
+    "应当",
+    "应",
+    "需",
+)
+
+
+@dataclass(frozen=True, slots=True)
+class TableCandidate:
+    """A window of source text that may contain a numbered table."""
+
+    # Heading for the table: the first line of the excerpt or the bare marker.
+    title: str
+    # Raw text window that starts on the line holding the table marker.
+    excerpt: str
+
+
+@dataclass(frozen=True, slots=True)
+class SectionCandidate:
+    """A heading plus the body text that follows it, up to the next heading."""
+
+    # The heading line, stripped of surrounding whitespace.
+    title: str
+    # Short preview built from the first non-blank body lines.
+    excerpt: str
+    # Every body line under the heading; blank lines are kept as "".
+    body_lines: tuple[str, ...]
+
+
+class KnowledgeNormalizationService:
+    def __init__(self, db: Session) -> None:
+        """Create the service together with the chat client it delegates to.
+
+        Args:
+            db: SQLAlchemy session handed to ``RuntimeChatService``.
+        """
+        self.runtime_chat_service = RuntimeChatService(db)
+
+    def build_enriched_text(self, raw_text: str) -> str:
+        normalized_text = str(raw_text or "").strip()
+        if not normalized_text:
+            return ""
+
+        section_appendix = self._build_section_appendix(normalized_text)
+        answer_clue_appendix = self._build_answer_clue_appendix(normalized_text)
+        normalized_tables: list[str] = []
+        for candidate in self._extract_table_candidates(normalized_text):
+            rendered = self._normalize_table_candidate(candidate)
+            if rendered:
+                normalized_tables.append(f"## {candidate.title}\n\n{rendered}")
+
+        parts: list[str] = []
+        if section_appendix:
+            parts.append(section_appendix)
+        if answer_clue_appendix:
+            parts.append(answer_clue_appendix)
+        if normalized_tables:
+            appendix = "\n\n".join(normalized_tables)
+            parts.append(
+                "# 结构化表格补充\n\n"
+                "以下表格由知识归纳阶段依据原文重新整理，供问答检索时优先理解行列关系。\n\n"
+                f"{appendix}"
+            )
+
+        if not parts:
+            return normalized_text
+
+        parts.append(f"# 原文\n\n{normalized_text}")
+        return "\n\n".join(parts)
+
+    @staticmethod
+    def _extract_table_candidates(text: str) -> list[TableCandidate]:
+        candidates: list[TableCandidate] = []
+        occupied_ranges: list[tuple[int, int]] = []
+
+        for match in TABLE_MARKER_PATTERN.finditer(text):
+            if len(candidates) >= MAX_TABLES_PER_DOCUMENT:
+                break
+
+            start = text.rfind("\n", 0, match.start())
+            start = 0 if start < 0 else start + 1
+            end = min(len(text), start + MAX_TABLE_WINDOW_CHARS)
+            if any(start < existing_end and end > existing_start for existing_start, existing_end in occupied_ranges):
+                continue
+
+            excerpt = text[start:end].strip()
+            head = excerpt[:360]
+            if "单位：" not in head and "标准" not in head:
+                continue
+            if excerpt.count("\n") < 6 or sum(char.isdigit() for char in excerpt) < 4:
+                continue
+
+            marker = match.group(0).replace(" ", "")
+            first_line = next((line.strip() for line in excerpt.splitlines() if line.strip()), marker)
+            title = first_line if first_line.startswith(marker) else marker
+            candidates.append(TableCandidate(title=title, excerpt=excerpt))
+            occupied_ranges.append((start, end))
+
+        return candidates
+
+    def _normalize_table_candidate(self, candidate: TableCandidate) -> str:
+        messages = [
+            {
+                "role": "system",
+                "content": (
+                    "你是制度文档结构化助手。"
+                    "只依据用户提供的原文，提炼其中的表格为清晰 Markdown。"
+                    "必须严格按照表头从左到右对齐每个数值，不能猜测、不能改列顺序、不能擅自补全。"
+                    "只输出一张 Markdown 表格本身，不要输出标题、说明、注释、脚注或正文解释。"
+                    "如果原文不足以确认表格关系，只回复“无法确认”。"
+                    "不要输出思考过程，不要复述原文，不要添加制度之外的新事实。"
+                ),
+            },
+            {
+                "role": "user",
+                "content": (
+                    f"请仅整理下面这段制度表格，标题为《{candidate.title}》。\n\n"
+                    f"{candidate.excerpt}"
+                ),
+            },
+        ]
+        answer = self.runtime_chat_service.complete(
+            messages,
+            max_tokens=900,
+            temperature=0.0,
+        )
+        cleaned = self._sanitize_answer(answer)
+        if not cleaned or cleaned == "无法确认":
+            return ""
+        if cleaned.count("|") < 6:
+            logger.info("Skip non-tabular normalization candidate title=%s", candidate.title)
+            return ""
+        return cleaned
+
+    @staticmethod
+    def _build_section_appendix(text: str) -> str:
+        candidates = KnowledgeNormalizationService._extract_section_candidates(text)
+        if len(candidates) < 2:
+            return ""
+
+        outline = "\n".join(
+            f"- {item.title}"
+            for item in candidates[:MAX_SECTION_OUTLINE_ITEMS]
+        )
+        snippets = "\n\n".join(
+            [
+                f"## {item.title}\n\n{item.excerpt}"
+                for item in candidates[:MAX_SECTION_SNIPPETS]
+                if item.excerpt
+            ]
+        )
+        if not snippets:
+            return ""
+
+        return (
+            "# 章节导航\n\n"
+            "以下内容由入库阶段从制度原文中提取，供检索时优先理解制度层级、条目和标准所在章节。\n\n"
+            f"{outline}\n\n"
+            "# 重点章节摘录\n\n"
+            f"{snippets}"
+        )
+
+    @staticmethod
+    def _build_answer_clue_appendix(text: str) -> str:
+        candidates = KnowledgeNormalizationService._extract_section_candidates(text)
+        clue_lines: list[str] = []
+
+        if candidates:
+            for candidate in candidates:
+                clue_lines.extend(
+                    KnowledgeNormalizationService._extract_section_clues(candidate)
+                )
+        else:
+            clue_lines.extend(KnowledgeNormalizationService._extract_freeform_clues(text))
+
+        deduped: list[str] = []
+        seen: set[str] = set()
+        for item in clue_lines:
+            normalized = re.sub(r"\s+", " ", str(item or "")).strip()
+            if not normalized or normalized in seen:
+                continue
+            seen.add(normalized)
+            deduped.append(normalized)
+            if len(deduped) >= MAX_TOTAL_QA_CLUES:
+                break
+
+        if len(deduped) < 2:
+            return ""
+
+        return (
+            "# 问答线索补充\n\n"
+            "以下内容由入库阶段根据章节标题、条款、列表、键值对与相邻正文提炼，"
+            "供问答检索时优先命中更短、更直接的制度依据。\n\n"
+            + "\n".join(f"- {item}" for item in deduped)
+        )
+
+    @staticmethod
+    def _extract_section_candidates(text: str) -> list[SectionCandidate]:
+        lines = [line.rstrip() for line in str(text or "").splitlines()]
+        sections: list[SectionCandidate] = []
+        current_title = ""
+        current_body: list[str] = []
+
+        def flush() -> None:
+            nonlocal current_title, current_body
+            if not current_title:
+                current_body = []
+                return
+            excerpt = KnowledgeNormalizationService._build_section_excerpt(current_body)
+            if excerpt:
+                sections.append(
+                    SectionCandidate(
+                        title=current_title,
+                        excerpt=excerpt,
+                        body_lines=tuple(current_body),
+                    )
+                )
+            current_title = ""
+            current_body = []
+
+        for raw_line in lines:
+            line = raw_line.strip()
+            if not line:
+                if current_body:
+                    current_body.append("")
+                continue
+
+            if SECTION_HEADING_PATTERN.match(line) and len(line) <= 80:
+                flush()
+                current_title = line
+                continue
+
+            if current_title:
+                current_body.append(line)
+
+        flush()
+        return sections
+
+    @staticmethod
+    def _build_section_excerpt(lines: list[str]) -> str:
+        cleaned_lines = [line.strip() for line in lines if line.strip()]
+        if not cleaned_lines:
+            return ""
+        excerpt = "；".join(cleaned_lines[:3]).strip()
+        if len(excerpt) <= MAX_SECTION_SNIPPET_CHARS:
+            return excerpt
+        return f"{excerpt[: MAX_SECTION_SNIPPET_CHARS - 3].rstrip()}..."
+
+    @staticmethod
+    def _extract_section_clues(candidate: SectionCandidate) -> list[str]:
+        clues: list[str] = []
+        fallback: list[str] = []
+
+        for raw_line in candidate.body_lines:
+            normalized_line = KnowledgeNormalizationService._normalize_fact_line(raw_line)
+            if not normalized_line or KnowledgeNormalizationService._is_table_like_line(normalized_line):
+                continue
+
+            fact_units = KnowledgeNormalizationService._split_fact_units(normalized_line)
+            for unit in fact_units:
+                rendered = KnowledgeNormalizationService._render_clue(candidate.title, unit)
+                if not rendered:
+                    continue
+                if KnowledgeNormalizationService._looks_like_fact_line(unit):
+                    clues.append(rendered)
+                elif len(fallback) < 2:
+                    fallback.append(rendered)
+
+                if len(clues) >= MAX_SECTION_QA_CLUES:
+                    return clues[:MAX_SECTION_QA_CLUES]
+
+        return clues[:MAX_SECTION_QA_CLUES] or fallback[:2]
+
+    @staticmethod
+    def _extract_freeform_clues(text: str) -> list[str]:
+        clues: list[str] = []
+        for raw_line in str(text or "").splitlines():
+            normalized_line = KnowledgeNormalizationService._normalize_fact_line(raw_line)
+            if (
+                not normalized_line
+                or SECTION_HEADING_PATTERN.match(normalized_line)
+                or KnowledgeNormalizationService._is_table_like_line(normalized_line)
+                or not KnowledgeNormalizationService._looks_like_fact_line(normalized_line)
+            ):
+                continue
+
+            for unit in KnowledgeNormalizationService._split_fact_units(normalized_line):
+                rendered = KnowledgeNormalizationService._render_clue("正文", unit)
+                if rendered:
+                    clues.append(rendered)
+                if len(clues) >= MAX_TOTAL_QA_CLUES:
+                    return clues
+        return clues
+
+    @staticmethod
+    def _split_fact_units(line: str) -> list[str]:
+        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
+        if not normalized:
+            return []
+        if len(normalized) <= MAX_QA_CLUE_CHARS and all(mark not in normalized for mark in ("；", ";", "。")):
+            return [normalized]
+
+        units: list[str] = []
+        for part in re.split(r"[；;。]\s*", normalized):
+            cleaned = KnowledgeNormalizationService._normalize_fact_line(part)
+            if not cleaned:
+                continue
+            units.append(cleaned)
+        return units or [KnowledgeNormalizationService._truncate_clue(normalized)]
+
+    @staticmethod
+    def _normalize_fact_line(line: str) -> str:
+        normalized = str(line or "").strip()
+        normalized = re.sub(r"\s+", " ", normalized)
+        return normalized.strip(" -")
+
+    @staticmethod
+    def _is_table_like_line(line: str) -> bool:
+        normalized = str(line or "").strip()
+        if not normalized:
+            return False
+        if normalized.count("|") >= 2:
+            return True
+        if normalized.count("\t") >= 2:
+            return True
+        number_tokens = re.findall(r"\d+(?:[.][0-9]+)?", normalized)
+        if len(number_tokens) >= 3 and len(normalized.split()) >= 4 and not any(
+            punct in normalized for punct in ("。", "；", ";", "：", ":")
+        ):
+            return True
+        return "单位：" in normalized and sum(char.isdigit() for char in normalized) >= 3
+
+    @staticmethod
+    def _looks_like_fact_line(line: str) -> bool:
+        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
+        if len(normalized) < 6:
+            return False
+        if TABLE_MARKER_PATTERN.search(normalized) or normalized.startswith(("单位：", "单位:")):
+            return False
+        if (
+            ARTICLE_PATTERN.match(normalized)
+            or LIST_ITEM_PATTERN.match(normalized)
+            or NUMBERED_ITEM_PATTERN.match(normalized)
+            or KEY_VALUE_PATTERN.match(normalized)
+        ):
+            return True
+        if any(keyword in normalized for keyword in FACT_KEYWORDS):
+            return True
+        return any(char.isdigit() for char in normalized)
+
+    @staticmethod
+    def _render_clue(section_title: str, line: str) -> str:
+        normalized_line = KnowledgeNormalizationService._truncate_clue(line)
+        if not normalized_line:
+            return ""
+        normalized_title = str(section_title or "").strip()
+        if not normalized_title:
+            return normalized_line
+        return f"{normalized_title}：{normalized_line}"
+
+    @staticmethod
+    def _truncate_clue(line: str) -> str:
+        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
+        if len(normalized) <= MAX_QA_CLUE_CHARS:
+            return normalized
+        return f"{normalized[: MAX_QA_CLUE_CHARS - 3].rstrip()}..."
+
+    @staticmethod
+    def _sanitize_answer(answer: str | None) -> str:
+        cleaned = re.sub(r"<think>.*?</think>", "", str(answer or ""), flags=re.DOTALL | re.IGNORECASE)
+        lines = [line.rstrip() for line in cleaned.strip().splitlines()]
+        table_lines: list[str] = []
+        for line in lines:
+            normalized = line.strip()
+            if "|" not in normalized:
+                if table_lines:
+                    break
+                continue
+            table_lines.append(normalized)
+        return "\n".join(table_lines).strip()