from __future__ import annotations import re from dataclasses import dataclass from sqlalchemy.orm import Session from app.core.logging import get_logger from app.services.runtime_chat import RuntimeChatService logger = get_logger("app.services.knowledge_normalizer") TABLE_MARKER_PATTERN = re.compile(r"表\s*(\d+)") SECTION_HEADING_PATTERN = re.compile( r"^(第[一二三四五六七八九十百零0-9]+[章节]\s*.*|[一二三四五六七八九十]+、.*|([一二三四五六七八九十]+).*|\([一二三四五六七八九十]+\).*)$" ) LIST_ITEM_PATTERN = re.compile(r"^[-*•]\s+.+$") NUMBERED_ITEM_PATTERN = re.compile(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*.+$") ARTICLE_PATTERN = re.compile(r"^(第[一二三四五六七八九十百零0-9]+条)\s*.*$") KEY_VALUE_PATTERN = re.compile(r"^[^::\s][^::]{0,40}[::]\s*.+$") MAX_TABLE_WINDOW_CHARS = 1800 MAX_TABLES_PER_DOCUMENT = 8 MAX_SECTION_OUTLINE_ITEMS = 12 MAX_SECTION_SNIPPETS = 8 MAX_SECTION_SNIPPET_CHARS = 220 MAX_SECTION_QA_CLUES = 4 MAX_TOTAL_QA_CLUES = 24 MAX_QA_CLUE_CHARS = 180 FACT_KEYWORDS = ( "适用", "标准", "条件", "流程", "审批", "提交", "附件", "材料", "票据", "报销", "限额", "金额", "比例", "范围", "对象", "人员", "时限", "工作日", "不得", "可以", "应当", "应", "需", ) @dataclass(frozen=True, slots=True) class TableCandidate: title: str excerpt: str @dataclass(frozen=True, slots=True) class SectionCandidate: title: str excerpt: str body_lines: tuple[str, ...] 
class KnowledgeNormalizationService:
    """Enrich raw policy text with appendices that help retrieval.

    Three kinds of appendix are produced: a section outline with short
    excerpts, a list of short "answer clues", and Markdown tables that a chat
    model re-renders from table-like excerpts of the raw text.
    """

    # NOTE: full-width punctuation is written as escapes (U+FF1A = full-width
    # colon, U+FF1B = full-width semicolon) so a text normalization pass over
    # this file cannot fold it into the ASCII mark and drop the distinction.

    # Marks that separate independent clauses or sentences inside one line.
    _CLAUSE_BREAK_MARKS = (";", "\uff1b", "。")
    # Punctuation whose presence means a number-heavy line is still a sentence.
    _SENTENCE_PUNCTUATION = ("。", ";", "\uff1b", ":", "\uff1a")
    # "unit:" labels that usually sit next to a table header.
    _UNIT_LABELS = ("单位:", "单位\uff1a")

    # Reasoning block that some chat models emit before the actual answer.
    # The tag name is interpolated instead of written literally so that tools
    # which strip such blocks from text cannot blank this pattern.
    _REASONING_TAG = "think"
    _REASONING_BLOCK_PATTERN = re.compile(
        rf"<{_REASONING_TAG}>.*?</{_REASONING_TAG}>",
        flags=re.DOTALL | re.IGNORECASE,
    )

    def __init__(self, db: Session) -> None:
        """Create the service and the chat client used for table rendering.

        Args:
            db: Session handed to ``RuntimeChatService``.
        """
        self.runtime_chat_service = RuntimeChatService(db)

    def build_enriched_text(self, raw_text: str) -> str:
        """Return ``raw_text`` followed by every appendix that could be built.

        Args:
            raw_text: Document text; ``None`` or blank input is tolerated.

        Returns:
            An empty string for blank input, the stripped text alone when no
            appendix applies, otherwise the stripped text and the appendices
            joined by blank lines.
        """
        normalized_text = str(raw_text or "").strip()
        if not normalized_text:
            return ""

        section_appendix = self._build_section_appendix(normalized_text)
        answer_clue_appendix = self._build_answer_clue_appendix(normalized_text)

        normalized_tables: list[str] = []
        for candidate in self._extract_table_candidates(normalized_text):
            rendered = self._normalize_table_candidate(candidate)
            if rendered:
                normalized_tables.append(f"## {candidate.title}\n\n{rendered}")

        appendix_parts: list[str] = []
        if section_appendix:
            appendix_parts.append(section_appendix)
        if answer_clue_appendix:
            appendix_parts.append(answer_clue_appendix)
        if normalized_tables:
            appendix = "\n\n".join(normalized_tables)
            appendix_parts.append(
                "# 结构化表格补充\n\n"
                "以下表格由知识归纳阶段依据原文重新整理,供问答检索时优先理解行列关系。\n\n"
                f"{appendix}"
            )

        if not appendix_parts:
            return normalized_text
        return "\n\n".join([normalized_text, *appendix_parts])

    @staticmethod
    def _extract_table_candidates(text: str) -> list[TableCandidate]:
        """Find non-overlapping text windows that look like tables.

        A window starts at the line containing a table marker and spans at
        most ``MAX_TABLE_WINDOW_CHARS`` characters. It is kept only when its
        head mentions a unit label or "标准", and it has at least 6 line
        breaks and 4 digits.

        Args:
            text: Document text to scan.

        Returns:
            At most ``MAX_TABLES_PER_DOCUMENT`` candidates in document order.
        """
        candidates: list[TableCandidate] = []
        occupied_ranges: list[tuple[int, int]] = []
        for match in TABLE_MARKER_PATTERN.finditer(text):
            if len(candidates) >= MAX_TABLES_PER_DOCUMENT:
                break

            # Rewind to the beginning of the line that holds the marker.
            start = text.rfind("\n", 0, match.start())
            start = 0 if start < 0 else start + 1
            end = min(len(text), start + MAX_TABLE_WINDOW_CHARS)
            overlaps_existing = any(
                start < existing_end and end > existing_start
                for existing_start, existing_end in occupied_ranges
            )
            if overlaps_existing:
                continue

            excerpt = text[start:end].strip()
            head = excerpt[:360]
            has_unit_label = any(
                label in head for label in KnowledgeNormalizationService._UNIT_LABELS
            )
            if not has_unit_label and "标准" not in head:
                continue
            if excerpt.count("\n") < 6 or sum(char.isdigit() for char in excerpt) < 4:
                continue

            marker = match.group(0).replace(" ", "")
            first_line = next(
                (line.strip() for line in excerpt.splitlines() if line.strip()),
                marker,
            )
            # Use the caption line as title only when it begins with the marker.
            title = first_line if first_line.startswith(marker) else marker
            candidates.append(TableCandidate(title=title, excerpt=excerpt))
            occupied_ranges.append((start, end))
        return candidates

    def _normalize_table_candidate(self, candidate: TableCandidate) -> str:
        """Ask the chat model to render the candidate as one Markdown table.

        Args:
            candidate: Excerpt and title of the suspected table.

        Returns:
            The Markdown table, or an empty string when the model declines
            ("无法确认") or the reply holds fewer than 6 pipe characters.
        """
        messages = [
            {
                "role": "system",
                "content": (
                    "你是制度文档结构化助手。"
                    "只依据用户提供的原文,提炼其中的表格为清晰 Markdown。"
                    "必须严格按照表头从左到右对齐每个数值,不能猜测、不能改列顺序、不能擅自补全。"
                    "只输出一张 Markdown 表格本身,不要输出标题、说明、注释、脚注或正文解释。"
                    "如果原文不足以确认表格关系,只回复“无法确认”。"
                    "不要输出思考过程,不要复述原文,不要添加制度之外的新事实。"
                ),
            },
            {
                "role": "user",
                "content": (
                    f"请仅整理下面这段制度表格,标题为《{candidate.title}》。\n\n"
                    f"{candidate.excerpt}"
                ),
            },
        ]
        answer = self.runtime_chat_service.complete(
            messages,
            max_tokens=900,
            temperature=0.0,
        )
        cleaned = self._sanitize_answer(answer)
        if not cleaned or cleaned == "无法确认":
            return ""
        if cleaned.count("|") < 6:
            logger.info("Skip non-tabular normalization candidate title=%s", candidate.title)
            return ""
        return cleaned

    @staticmethod
    def _build_section_appendix(text: str) -> str:
        """Build the outline-plus-excerpts appendix.

        Args:
            text: Document text.

        Returns:
            The appendix, or an empty string when fewer than two sections are
            found or no section has an excerpt.
        """
        candidates = KnowledgeNormalizationService._extract_section_candidates(text)
        if len(candidates) < 2:
            return ""

        outline = "\n".join(
            f"- {item.title}" for item in candidates[:MAX_SECTION_OUTLINE_ITEMS]
        )
        snippets = "\n\n".join(
            [
                f"## {item.title}\n\n{item.excerpt}"
                for item in candidates[:MAX_SECTION_SNIPPETS]
                if item.excerpt
            ]
        )
        if not snippets:
            return ""
        return (
            "# 章节导航\n\n"
            "以下内容由入库阶段从制度原文中提取,供检索时优先理解制度层级、条目和标准所在章节。\n\n"
            f"{outline}\n\n"
            "# 重点章节摘录\n\n"
            f"{snippets}"
        )

    @staticmethod
    def _build_answer_clue_appendix(text: str) -> str:
        """Build the bullet list of short answer clues.

        Clues come from the sections when any are detected, otherwise from
        the free-form text. Duplicates are removed after whitespace is
        collapsed.

        Args:
            text: Document text.

        Returns:
            The appendix with at most ``MAX_TOTAL_QA_CLUES`` bullets, or an
            empty string when fewer than two distinct clues remain.
        """
        candidates = KnowledgeNormalizationService._extract_section_candidates(text)
        clue_lines: list[str] = []
        if candidates:
            for candidate in candidates:
                clue_lines.extend(
                    KnowledgeNormalizationService._extract_section_clues(candidate)
                )
        else:
            clue_lines.extend(KnowledgeNormalizationService._extract_freeform_clues(text))

        deduped: list[str] = []
        seen: set[str] = set()
        for item in clue_lines:
            normalized = re.sub(r"\s+", " ", str(item or "")).strip()
            if not normalized or normalized in seen:
                continue
            seen.add(normalized)
            deduped.append(normalized)
            if len(deduped) >= MAX_TOTAL_QA_CLUES:
                break

        if len(deduped) < 2:
            return ""
        return (
            "# 问答线索补充\n\n"
            "以下内容由入库阶段根据章节标题、条款、列表、键值对与相邻正文提炼,"
            "供问答检索时优先命中更短、更直接的制度依据。\n\n"
            + "\n".join(f"- {item}" for item in deduped)
        )

    @staticmethod
    def _extract_section_candidates(text: str) -> list[SectionCandidate]:
        """Split the text into sections at heading lines.

        A heading is a stripped line of at most 80 characters that matches
        ``SECTION_HEADING_PATTERN``. Text before the first heading is ignored
        and sections whose body is empty are dropped.

        Args:
            text: Document text; ``None`` is tolerated.

        Returns:
            Sections in document order.
        """
        lines = [line.rstrip() for line in str(text or "").splitlines()]
        sections: list[SectionCandidate] = []
        current_title = ""
        current_body: list[str] = []

        def flush() -> None:
            """Store the section being collected, then reset the collectors."""
            nonlocal current_title, current_body
            if not current_title:
                current_body = []
                return
            excerpt = KnowledgeNormalizationService._build_section_excerpt(current_body)
            if excerpt:
                sections.append(
                    SectionCandidate(
                        title=current_title,
                        excerpt=excerpt,
                        body_lines=tuple(current_body),
                    )
                )
            current_title = ""
            current_body = []

        for raw_line in lines:
            line = raw_line.strip()
            if not line:
                # Keep blank separators, but never as the first body line.
                if current_body:
                    current_body.append("")
                continue
            if SECTION_HEADING_PATTERN.match(line) and len(line) <= 80:
                flush()
                current_title = line
                continue
            if current_title:
                current_body.append(line)
        flush()
        return sections

    @staticmethod
    def _build_section_excerpt(lines: list[str]) -> str:
        """Join the first three non-empty lines into a bounded preview.

        Args:
            lines: Body lines of one section.

        Returns:
            The preview, cut to ``MAX_SECTION_SNIPPET_CHARS`` characters with a
            trailing "..." when it was too long; empty when no line has text.
        """
        cleaned_lines = [line.strip() for line in lines if line.strip()]
        if not cleaned_lines:
            return ""
        excerpt = ";".join(cleaned_lines[:3]).strip()
        if len(excerpt) <= MAX_SECTION_SNIPPET_CHARS:
            return excerpt
        return f"{excerpt[: MAX_SECTION_SNIPPET_CHARS - 3].rstrip()}..."

    @staticmethod
    def _extract_section_clues(candidate: SectionCandidate) -> list[str]:
        """Collect up to ``MAX_SECTION_QA_CLUES`` clues from one section.

        Args:
            candidate: Section to mine.

        Returns:
            Clues rendered as "title:fact". When no unit looks like a fact,
            the first (at most two) other units are returned instead.
        """
        clues: list[str] = []
        fallback: list[str] = []
        for raw_line in candidate.body_lines:
            normalized_line = KnowledgeNormalizationService._normalize_fact_line(raw_line)
            if not normalized_line or KnowledgeNormalizationService._is_table_like_line(
                normalized_line
            ):
                continue
            fact_units = KnowledgeNormalizationService._split_fact_units(normalized_line)
            for unit in fact_units:
                rendered = KnowledgeNormalizationService._render_clue(candidate.title, unit)
                if not rendered:
                    continue
                if KnowledgeNormalizationService._looks_like_fact_line(unit):
                    clues.append(rendered)
                elif len(fallback) < 2:
                    fallback.append(rendered)
                if len(clues) >= MAX_SECTION_QA_CLUES:
                    return clues[:MAX_SECTION_QA_CLUES]
        return clues[:MAX_SECTION_QA_CLUES] or fallback[:2]

    @staticmethod
    def _extract_freeform_clues(text: str) -> list[str]:
        """Collect clues from text in which no section was detected.

        Args:
            text: Document text; ``None`` is tolerated.

        Returns:
            Clues rendered with the generic title "正文", stopping once
            ``MAX_TOTAL_QA_CLUES`` have been collected.
        """
        clues: list[str] = []
        for raw_line in str(text or "").splitlines():
            normalized_line = KnowledgeNormalizationService._normalize_fact_line(raw_line)
            if (
                not normalized_line
                or SECTION_HEADING_PATTERN.match(normalized_line)
                or KnowledgeNormalizationService._is_table_like_line(normalized_line)
                or not KnowledgeNormalizationService._looks_like_fact_line(normalized_line)
            ):
                continue
            for unit in KnowledgeNormalizationService._split_fact_units(normalized_line):
                rendered = KnowledgeNormalizationService._render_clue("正文", unit)
                if rendered:
                    clues.append(rendered)
                if len(clues) >= MAX_TOTAL_QA_CLUES:
                    return clues
        return clues

    @staticmethod
    def _split_fact_units(line: str) -> list[str]:
        """Split one line into clause-sized units.

        The line is split at ASCII or full-width semicolons and at the
        Chinese full stop. A short line without any of these marks is
        returned as a single unit.

        Args:
            line: Raw or normalized line.

        Returns:
            Non-empty normalized units; empty list for blank input.
        """
        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
        if not normalized:
            return []
        has_break_mark = any(
            mark in normalized
            for mark in KnowledgeNormalizationService._CLAUSE_BREAK_MARKS
        )
        if len(normalized) <= MAX_QA_CLUE_CHARS and not has_break_mark:
            return [normalized]

        units: list[str] = []
        for part in re.split(r"[;\uff1b。]\s*", normalized):
            cleaned = KnowledgeNormalizationService._normalize_fact_line(part)
            if not cleaned:
                continue
            units.append(cleaned)
        # A line made only of break marks yields no unit; fall back to the line.
        return units or [KnowledgeNormalizationService._truncate_clue(normalized)]

    @staticmethod
    def _normalize_fact_line(line: str) -> str:
        """Collapse whitespace runs and trim spaces and hyphens at both ends.

        Args:
            line: Raw line; ``None`` is tolerated.

        Returns:
            The cleaned line, possibly empty.
        """
        normalized = str(line or "").strip()
        normalized = re.sub(r"\s+", " ", normalized)
        return normalized.strip(" -")

    @staticmethod
    def _is_table_like_line(line: str) -> bool:
        """Tell whether a line looks like a table row rather than a sentence.

        A line counts as table-like when it has two or more pipes or tabs,
        when it holds at least three numbers in at least four
        whitespace-separated tokens without sentence punctuation, or when it
        carries a unit label together with at least three digits.

        Args:
            line: Line to inspect; ``None`` is tolerated.

        Returns:
            ``True`` for table-like lines, ``False`` otherwise.
        """
        normalized = str(line or "").strip()
        if not normalized:
            return False
        if normalized.count("|") >= 2:
            return True
        if normalized.count("\t") >= 2:
            return True

        number_tokens = re.findall(r"\d+(?:[.][0-9]+)?", normalized)
        has_sentence_punctuation = any(
            punct in normalized
            for punct in KnowledgeNormalizationService._SENTENCE_PUNCTUATION
        )
        if (
            len(number_tokens) >= 3
            and len(normalized.split()) >= 4
            and not has_sentence_punctuation
        ):
            return True

        has_unit_label = any(
            label in normalized for label in KnowledgeNormalizationService._UNIT_LABELS
        )
        return has_unit_label and sum(char.isdigit() for char in normalized) >= 3

    @staticmethod
    def _looks_like_fact_line(line: str) -> bool:
        """Tell whether a line is likely to state a rule or a fact.

        Lines shorter than 6 characters, lines with a table marker and lines
        starting with a unit label are rejected. Otherwise the line qualifies
        when it is an article, list item, numbered item or key-value line,
        contains a fact keyword, or contains a digit.

        Args:
            line: Line to inspect.

        Returns:
            ``True`` when the line qualifies.
        """
        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
        if len(normalized) < 6:
            return False
        if TABLE_MARKER_PATTERN.search(normalized) or normalized.startswith(
            KnowledgeNormalizationService._UNIT_LABELS
        ):
            return False
        if (
            ARTICLE_PATTERN.match(normalized)
            or LIST_ITEM_PATTERN.match(normalized)
            or NUMBERED_ITEM_PATTERN.match(normalized)
            or KEY_VALUE_PATTERN.match(normalized)
        ):
            return True
        if any(keyword in normalized for keyword in FACT_KEYWORDS):
            return True
        return any(char.isdigit() for char in normalized)

    @staticmethod
    def _render_clue(section_title: str, line: str) -> str:
        """Format one clue as "title:fact".

        Args:
            section_title: Title to prefix; a blank title yields the bare fact.
            line: Fact text; it is normalized and truncated first.

        Returns:
            The rendered clue, or an empty string when the fact is blank.
        """
        normalized_line = KnowledgeNormalizationService._truncate_clue(line)
        if not normalized_line:
            return ""
        normalized_title = str(section_title or "").strip()
        if not normalized_title:
            return normalized_line
        return f"{normalized_title}:{normalized_line}"

    @staticmethod
    def _truncate_clue(line: str) -> str:
        """Normalize a line and cap it at ``MAX_QA_CLUE_CHARS`` characters.

        Args:
            line: Fact text.

        Returns:
            The normalized line, ending in "..." when it had to be cut.
        """
        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
        if len(normalized) <= MAX_QA_CLUE_CHARS:
            return normalized
        return f"{normalized[: MAX_QA_CLUE_CHARS - 3].rstrip()}..."

    @staticmethod
    def _sanitize_answer(answer: str | None) -> str:
        """Extract the first Markdown table from a chat model reply.

        Reasoning blocks are removed first so that pipe characters inside
        them are not mistaken for table rows. The result is the first
        contiguous run of lines that contain a pipe character.

        Args:
            answer: Raw model reply; ``None`` is tolerated.

        Returns:
            The table lines joined by newlines, or an empty string when the
            reply has no line with a pipe character.
        """
        cleaned = KnowledgeNormalizationService._REASONING_BLOCK_PATTERN.sub(
            "", str(answer or "")
        )
        lines = [line.rstrip() for line in cleaned.strip().splitlines()]
        table_lines: list[str] = []
        for line in lines:
            normalized = line.strip()
            if "|" not in normalized:
                if table_lines:
                    # The table ended; ignore whatever follows it.
                    break
                continue
            table_lines.append(normalized)
        return "\n".join(table_lines).strip()