Files
X-Financial/server/src/app/services/knowledge_normalizer.py

414 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import re
from dataclasses import dataclass
from sqlalchemy.orm import Session
from app.core.logging import get_logger
from app.services.runtime_chat import RuntimeChatService
logger = get_logger("app.services.knowledge_normalizer")
# Locates candidate table markers: optional whitespace followed by a captured
# run of digits.
# NOTE(review): as written this matches ANY digit run. The file carries an
# "ambiguous Unicode characters" warning, so a literal prefix may have been
# lost from the pattern -- confirm against the upstream file.
TABLE_MARKER_PATTERN = re.compile(r"\s*(\d+)")
# Whole-line section headings. Alternatives, in order:
#   1. "第<numeral>章" / "第<numeral>节" followed by anything,
#   2. Chinese numeral(s) followed by "、",
#   3. Chinese numeral(s) followed by anything,
#   4. Chinese numeral(s) wrapped in ASCII parentheses.
# NOTE(review): alternative 3 accepts every line that merely starts with a
# Chinese numeral and makes alternative 2 redundant; a delimiter character may
# be missing -- confirm against the upstream file.
SECTION_HEADING_PATTERN = re.compile(
r"^(第[一二三四五六七八九十百零0-9]+[章节]\s*.*|[一二三四五六七八九十]+、.*|[一二三四五六七八九十]+.*|\([一二三四五六七八九十]+\).*)$"
)
# Bullet list item: "-", "*" or "•", then whitespace, then text.
LIST_ITEM_PATTERN = re.compile(r"^[-*•]\s+.+$")
# Numbered list item: digits followed by ".", ")" or "、", or a circled
# numeral (① to ⑩), then text.
NUMBERED_ITEM_PATTERN = re.compile(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*.+$")
# Clause heading of the form "第<numeral>条", optionally followed by text.
ARTICLE_PATTERN = re.compile(r"^(第[一二三四五六七八九十百零0-9]+条)\s*.*$")
# "key: value" line: a key of at most 41 characters containing no ASCII colon
# (and not starting with whitespace), an ASCII colon, then a non-empty value.
KEY_VALUE_PATTERN = re.compile(r"^[^:\s][^:]{0,40}[:]\s*.+$")
# Size, in characters, of the text window cut out after a table marker.
MAX_TABLE_WINDOW_CHARS = 1800
# Upper bound on table candidates extracted from one document.
MAX_TABLES_PER_DOCUMENT = 8
# Number of section titles listed in the outline appendix.
MAX_SECTION_OUTLINE_ITEMS = 12
# Number of sections whose excerpt is included in the appendix.
MAX_SECTION_SNIPPETS = 8
# Maximum length of one section excerpt (longer excerpts end with "...").
MAX_SECTION_SNIPPET_CHARS = 220
# Maximum number of answer clues taken from a single section.
MAX_SECTION_QA_CLUES = 4
# Maximum number of answer clues emitted for the whole document.
MAX_TOTAL_QA_CLUES = 24
# Maximum length of one answer clue (longer clues end with "...").
MAX_QA_CLUE_CHARS = 180
# Substrings that mark a line as likely to state a rule or fact.
# Every entry must be non-empty: the keywords are tested with
# ``keyword in line`` and the empty string is a substring of every string, so
# a single "" entry would make that test succeed for every line.
# NOTE(review): the two trailing entries were empty strings in this copy of the
# file (probably characters lost in transit) and have been dropped; restore the
# real keywords if the upstream file has them.
FACT_KEYWORDS = (
    "适用",
    "标准",
    "条件",
    "流程",
    "审批",
    "提交",
    "附件",
    "材料",
    "票据",
    "报销",
    "限额",
    "金额",
    "比例",
    "范围",
    "对象",
    "人员",
    "时限",
    "工作日",
    "不得",
    "可以",
    "应当",
)
@dataclass(frozen=True, slots=True)
class TableCandidate:
title: str
excerpt: str
@dataclass(frozen=True, slots=True)
class SectionCandidate:
title: str
excerpt: str
body_lines: tuple[str, ...]
class KnowledgeNormalizationService:
    """Append retrieval-oriented appendices to raw policy-document text.

    ``build_enriched_text`` is the entry point. The original text is kept and
    up to three Markdown appendices are appended to it:

    * a section outline with short excerpts (text heuristics only),
    * a list of short "answer clue" lines (text heuristics only),
    * tables re-rendered as Markdown by the runtime chat service.
    """

    # Clause separators used to break one line into individual fact units.
    # NOTE(review): the full-width semicolon was added while replacing the
    # empty-string entries of the former separator tuple -- confirm that it
    # matches the upstream file.
    _FACT_UNIT_SEPARATOR = re.compile(r"[;;。]\s*")

    # Punctuation whose presence marks a line as running prose rather than a
    # whitespace-separated table row. Entries must be non-empty: ``"" in s`` is
    # true for every string and would disable the heuristic that uses them.
    # NOTE(review): the CJK marks are a presumed restoration of entries that
    # were empty strings in this copy of the file -- confirm against upstream.
    _PROSE_PUNCTUATION = ("。", ",", ";", ";", ":", ":")

    def __init__(self, db: Session) -> None:
        """Create the service.

        Args:
            db: Session passed on to ``RuntimeChatService``.
        """
        self.runtime_chat_service = RuntimeChatService(db)

    def build_enriched_text(self, raw_text: str) -> str:
        """Return ``raw_text`` followed by every appendix that could be built.

        Args:
            raw_text: Document text; a falsy value is treated as empty.

        Returns:
            ``""`` for blank input. Otherwise the stripped text followed by the
            non-empty appendices (sections, answer clues, tables, in that
            order), separated by blank lines.
        """
        normalized_text = str(raw_text or "").strip()
        if not normalized_text:
            return ""

        # Parse the section structure once and share it between both
        # heuristic appendices instead of re-parsing inside each builder.
        sections = self._extract_section_candidates(normalized_text)
        appendices = (
            self._build_section_appendix(normalized_text, sections),
            self._build_answer_clue_appendix(normalized_text, sections),
            self._build_table_appendix(normalized_text),
        )
        return "\n\n".join(
            [normalized_text, *(part for part in appendices if part)]
        )

    def _build_table_appendix(self, text: str) -> str:
        """Render every table candidate in ``text`` and wrap them in an appendix.

        Returns ``""`` when no candidate produced a usable Markdown table.
        """
        rendered_tables: list[str] = []
        for candidate in self._extract_table_candidates(text):
            rendered = self._normalize_table_candidate(candidate)
            if rendered:
                rendered_tables.append(f"## {candidate.title}\n\n{rendered}")
        if not rendered_tables:
            return ""
        return (
            "# 结构化表格补充\n\n"
            "以下表格由知识归纳阶段依据原文重新整理,供问答检索时优先理解行列关系。\n\n"
            + "\n\n".join(rendered_tables)
        )

    @staticmethod
    def _extract_table_candidates(text: str) -> list[TableCandidate]:
        """Cut out text windows that look like they contain a table.

        For each ``TABLE_MARKER_PATTERN`` match, a window of at most
        ``MAX_TABLE_WINDOW_CHARS`` characters is taken from the start of the
        line containing the match. A window is kept only when it does not
        overlap an accepted one, mentions "单位:" or "标准" within its first 360
        characters, spans at least seven lines and contains at least four
        digits. At most ``MAX_TABLES_PER_DOCUMENT`` candidates are returned.
        """
        candidates: list[TableCandidate] = []
        occupied_ranges: list[tuple[int, int]] = []
        for match in TABLE_MARKER_PATTERN.finditer(text):
            if len(candidates) >= MAX_TABLES_PER_DOCUMENT:
                break

            # Rewind to the beginning of the line that holds the marker.
            line_break = text.rfind("\n", 0, match.start())
            start = 0 if line_break < 0 else line_break + 1
            end = min(len(text), start + MAX_TABLE_WINDOW_CHARS)

            overlaps = any(
                start < taken_end and end > taken_start
                for taken_start, taken_end in occupied_ranges
            )
            if overlaps:
                continue

            excerpt = text[start:end].strip()
            head = excerpt[:360]
            if "单位:" not in head and "标准" not in head:
                continue
            digit_count = sum(char.isdigit() for char in excerpt)
            if excerpt.count("\n") < 6 or digit_count < 4:
                continue

            marker = match.group(0).replace(" ", "")
            first_line = next(
                (line.strip() for line in excerpt.splitlines() if line.strip()),
                marker,
            )
            title = first_line if first_line.startswith(marker) else marker
            candidates.append(TableCandidate(title=title, excerpt=excerpt))
            occupied_ranges.append((start, end))
        return candidates

    def _normalize_table_candidate(self, candidate: TableCandidate) -> str:
        """Ask the runtime chat service to re-render one candidate as Markdown.

        Returns:
            The Markdown table, or ``""`` when the reply contains no table or
            fewer than six ``|`` characters. Exceptions raised by the chat
            service are not caught here.
        """
        messages = [
            {
                "role": "system",
                "content": (
                    "你是制度文档结构化助手。"
                    "只依据用户提供的原文,提炼其中的表格为清晰 Markdown。"
                    "必须严格按照表头从左到右对齐每个数值,不能猜测、不能改列顺序、不能擅自补全。"
                    "只输出一张 Markdown 表格本身,不要输出标题、说明、注释、脚注或正文解释。"
                    "如果原文不足以确认表格关系,只回复“无法确认”。"
                    "不要输出思考过程,不要复述原文,不要添加制度之外的新事实。"
                ),
            },
            {
                "role": "user",
                "content": (
                    f"请仅整理下面这段制度表格,标题为《{candidate.title}》。\n\n"
                    f"{candidate.excerpt}"
                ),
            },
        ]
        answer = self.runtime_chat_service.complete(
            messages,
            max_tokens=900,
            temperature=0.0,
        )
        cleaned = self._sanitize_answer(answer)
        # ``_sanitize_answer`` keeps only lines containing "|", so a bare
        # refusal already collapses to ""; the comparison is a cheap guard.
        if not cleaned or cleaned == "无法确认":
            return ""
        if cleaned.count("|") < 6:
            logger.info("Skip non-tabular normalization candidate title=%s", candidate.title)
            return ""
        return cleaned

    @staticmethod
    def _build_section_appendix(
        text: str,
        candidates: list[SectionCandidate] | None = None,
    ) -> str:
        """Build the outline-plus-excerpts appendix.

        Args:
            text: Document text, parsed only when ``candidates`` is ``None``.
            candidates: Optional pre-parsed sections, to avoid parsing twice.

        Returns:
            The appendix, or ``""`` when fewer than two sections were found or
            no excerpt is available.
        """
        if candidates is None:
            candidates = KnowledgeNormalizationService._extract_section_candidates(text)
        if len(candidates) < 2:
            return ""

        outline = "\n".join(
            f"- {item.title}" for item in candidates[:MAX_SECTION_OUTLINE_ITEMS]
        )
        snippets = "\n\n".join(
            f"## {item.title}\n\n{item.excerpt}"
            for item in candidates[:MAX_SECTION_SNIPPETS]
            if item.excerpt
        )
        if not snippets:
            return ""
        return (
            "# 章节导航\n\n"
            "以下内容由入库阶段从制度原文中提取,供检索时优先理解制度层级、条目和标准所在章节。\n\n"
            f"{outline}\n\n"
            "# 重点章节摘录\n\n"
            f"{snippets}"
        )

    @staticmethod
    def _build_answer_clue_appendix(
        text: str,
        candidates: list[SectionCandidate] | None = None,
    ) -> str:
        """Build the appendix of short, de-duplicated answer clues.

        Clues come from the parsed sections when there are any, otherwise from
        a line-by-line scan of the whole text.

        Args:
            text: Document text.
            candidates: Optional pre-parsed sections, to avoid parsing twice.

        Returns:
            The appendix, or ``""`` when fewer than two distinct clues exist.
        """
        service = KnowledgeNormalizationService
        if candidates is None:
            candidates = service._extract_section_candidates(text)

        clue_lines: list[str] = []
        if candidates:
            for candidate in candidates:
                clue_lines.extend(service._extract_section_clues(candidate))
        else:
            clue_lines.extend(service._extract_freeform_clues(text))

        # Whitespace-normalise, drop duplicates (first occurrence wins) and
        # stop at the document-wide cap.
        deduped: list[str] = []
        seen: set[str] = set()
        for item in clue_lines:
            normalized = re.sub(r"\s+", " ", str(item or "")).strip()
            if not normalized or normalized in seen:
                continue
            seen.add(normalized)
            deduped.append(normalized)
            if len(deduped) >= MAX_TOTAL_QA_CLUES:
                break

        if len(deduped) < 2:
            return ""
        return (
            "# 问答线索补充\n\n"
            "以下内容由入库阶段根据章节标题、条款、列表、键值对与相邻正文提炼,"
            "供问答检索时优先命中更短、更直接的制度依据。\n\n"
            + "\n".join(f"- {item}" for item in deduped)
        )

    @staticmethod
    def _extract_section_candidates(text: str) -> list[SectionCandidate]:
        """Split ``text`` into sections keyed by heading lines.

        A line is a heading when it matches ``SECTION_HEADING_PATTERN`` and is
        at most 80 characters long. Text before the first heading is ignored,
        and a section without any non-blank body line is dropped.
        """
        sections: list[SectionCandidate] = []
        current_title = ""
        current_body: list[str] = []

        def flush() -> None:
            """Store the section collected so far (if usable) and reset state."""
            nonlocal current_title, current_body
            if current_title:
                excerpt = KnowledgeNormalizationService._build_section_excerpt(current_body)
                if excerpt:
                    sections.append(
                        SectionCandidate(
                            title=current_title,
                            excerpt=excerpt,
                            body_lines=tuple(current_body),
                        )
                    )
            current_title = ""
            current_body = []

        for raw_line in str(text or "").splitlines():
            line = raw_line.strip()
            if not line:
                # Keep paragraph breaks, but never as the first body entry.
                if current_body:
                    current_body.append("")
                continue
            if SECTION_HEADING_PATTERN.match(line) and len(line) <= 80:
                flush()
                current_title = line
                continue
            if current_title:
                current_body.append(line)
        flush()
        return sections

    @staticmethod
    def _build_section_excerpt(lines: list[str]) -> str:
        """Join the first three non-blank lines and cap the result's length.

        Returns ``""`` when every line is blank. Results longer than
        ``MAX_SECTION_SNIPPET_CHARS`` are cut and end with ``"..."``.
        """
        non_blank = [line.strip() for line in lines if line.strip()]
        if not non_blank:
            return ""
        excerpt = "".join(non_blank[:3]).strip()
        if len(excerpt) <= MAX_SECTION_SNIPPET_CHARS:
            return excerpt
        return f"{excerpt[: MAX_SECTION_SNIPPET_CHARS - 3].rstrip()}..."

    @staticmethod
    def _extract_section_clues(candidate: SectionCandidate) -> list[str]:
        """Return up to ``MAX_SECTION_QA_CLUES`` clues for one section.

        Table-like lines are skipped. Units that look like facts are
        preferred; when the section has none, up to two other units are
        returned as a fallback.
        """
        service = KnowledgeNormalizationService
        clues: list[str] = []
        fallback: list[str] = []
        for raw_line in candidate.body_lines:
            line = service._normalize_fact_line(raw_line)
            if not line or service._is_table_like_line(line):
                continue
            for unit in service._split_fact_units(line):
                rendered = service._render_clue(candidate.title, unit)
                if not rendered:
                    continue
                if service._looks_like_fact_line(unit):
                    clues.append(rendered)
                elif len(fallback) < 2:
                    fallback.append(rendered)
            if len(clues) >= MAX_SECTION_QA_CLUES:
                break
        return clues[:MAX_SECTION_QA_CLUES] or fallback[:2]

    @staticmethod
    def _extract_freeform_clues(text: str) -> list[str]:
        """Collect clues from text in which no section structure was found.

        Headings, table-like lines and lines that do not look like facts are
        skipped; every clue is labelled with the pseudo-title "正文". At most
        ``MAX_TOTAL_QA_CLUES`` clues are returned.
        """
        service = KnowledgeNormalizationService
        clues: list[str] = []
        for raw_line in str(text or "").splitlines():
            line = service._normalize_fact_line(raw_line)
            if not line:
                continue
            if SECTION_HEADING_PATTERN.match(line):
                continue
            if service._is_table_like_line(line):
                continue
            if not service._looks_like_fact_line(line):
                continue
            for unit in service._split_fact_units(line):
                rendered = service._render_clue("正文", unit)
                if rendered:
                    clues.append(rendered)
                if len(clues) >= MAX_TOTAL_QA_CLUES:
                    return clues
        return clues

    @staticmethod
    def _split_fact_units(line: str) -> list[str]:
        """Split one line into clause-sized fact units.

        A line that is short enough and contains no clause separator is
        returned unchanged as a single unit. Otherwise it is split on the
        separators and empty pieces are dropped. If nothing survives, the
        truncated line is returned as the only unit.
        """
        service = KnowledgeNormalizationService
        normalized = service._normalize_fact_line(line)
        if not normalized:
            return []

        separator = service._FACT_UNIT_SEPARATOR
        # The separators tested here are exactly the ones used for splitting
        # below, so the shortcut can never disagree with the split.
        if len(normalized) <= MAX_QA_CLUE_CHARS and not separator.search(normalized):
            return [normalized]

        pieces = (service._normalize_fact_line(part) for part in separator.split(normalized))
        units = [piece for piece in pieces if piece]
        return units or [service._truncate_clue(normalized)]

    @staticmethod
    def _normalize_fact_line(line: str) -> str:
        """Collapse whitespace runs and strip surrounding spaces and hyphens."""
        collapsed = re.sub(r"\s+", " ", str(line or "").strip())
        return collapsed.strip(" -")

    @staticmethod
    def _is_table_like_line(line: str) -> bool:
        """Return ``True`` when ``line`` looks like a table row, not prose.

        A line counts as table-like when any of these holds:

        * it contains at least two ``|`` or at least two tab characters;
        * it has at least three numbers, at least four whitespace-separated
          tokens and no prose punctuation;
        * it contains "单位:" together with at least three digits.
        """
        normalized = str(line or "").strip()
        if not normalized:
            return False
        if normalized.count("|") >= 2 or normalized.count("\t") >= 2:
            return True

        number_tokens = re.findall(r"\d+(?:[.][0-9]+)?", normalized)
        has_prose_punctuation = any(
            mark in normalized
            for mark in KnowledgeNormalizationService._PROSE_PUNCTUATION
        )
        if (
            len(number_tokens) >= 3
            and len(normalized.split()) >= 4
            and not has_prose_punctuation
        ):
            return True

        digit_count = sum(char.isdigit() for char in normalized)
        return "单位:" in normalized and digit_count >= 3

    @staticmethod
    def _looks_like_fact_line(line: str) -> bool:
        """Heuristically decide whether ``line`` states a rule or fact.

        Lines shorter than six characters, lines matching
        ``TABLE_MARKER_PATTERN`` and lines starting with a "单位" unit label
        are rejected. Articles, list items, numbered items, key-value lines,
        lines containing a fact keyword and lines containing a digit are
        accepted.
        """
        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
        if len(normalized) < 6:
            return False
        if TABLE_MARKER_PATTERN.search(normalized):
            return False
        # The original tuple listed the full-width form twice; the second entry
        # now covers the ASCII-colon spelling.
        # NOTE(review): confirm which second variant the upstream file uses.
        if normalized.startswith(("单位:", "单位:")):
            return False

        structural_patterns = (
            ARTICLE_PATTERN,
            LIST_ITEM_PATTERN,
            NUMBERED_ITEM_PATTERN,
            KEY_VALUE_PATTERN,
        )
        if any(pattern.match(normalized) for pattern in structural_patterns):
            return True
        if any(keyword in normalized for keyword in FACT_KEYWORDS):
            return True
        return any(char.isdigit() for char in normalized)

    @staticmethod
    def _render_clue(section_title: str, line: str) -> str:
        """Prefix a truncated clue with its section title.

        Returns ``""`` when the clue is empty, and the bare clue when the
        title is empty.
        """
        clue = KnowledgeNormalizationService._truncate_clue(line)
        if not clue:
            return ""
        title = str(section_title or "").strip()
        if not title:
            return clue
        # NOTE(review): title and clue are joined with no separator; confirm
        # whether a delimiter was intended here.
        return f"{title}{clue}"

    @staticmethod
    def _truncate_clue(line: str) -> str:
        """Normalise ``line`` and cap it at ``MAX_QA_CLUE_CHARS`` characters.

        Over-long results are cut and end with ``"..."``.
        """
        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
        if len(normalized) <= MAX_QA_CLUE_CHARS:
            return normalized
        return f"{normalized[: MAX_QA_CLUE_CHARS - 3].rstrip()}..."

    @staticmethod
    def _sanitize_answer(answer: str | None) -> str:
        """Extract the first contiguous Markdown table from a model reply.

        ``<think>...</think>`` spans are removed first. Lines without ``|``
        are skipped until the table starts, and the first such line after it
        ends the table.

        Returns:
            The table lines joined with newlines, or ``""`` if none exist.
        """
        without_thoughts = re.sub(
            r"<think>.*?</think>",
            "",
            str(answer or ""),
            flags=re.DOTALL | re.IGNORECASE,
        )
        table_lines: list[str] = []
        for raw_line in without_thoughts.strip().splitlines():
            row = raw_line.strip()
            if "|" in row:
                table_lines.append(row)
            elif table_lines:
                # First non-table line after the table: stop collecting.
                break
        return "\n".join(table_lines).strip()