2026-05-17 08:38:41 +00:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
|
|
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
|
|
|
|
|
|
|
|
from app.core.logging import get_logger
|
|
|
|
|
|
from app.services.runtime_chat import RuntimeChatService
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger for the knowledge normalization service.
logger = get_logger("app.services.knowledge_normalizer")
|
|
|
|
|
|
|
|
|
|
|
|
# Table caption marker such as "表1" / "表 12"; group 1 is the table number.
TABLE_MARKER_PATTERN = re.compile(r"表\s*(\d+)")

# A whole line that looks like a section heading:
#   * "第三章 ..." / "第2节 ..."
#   * "一、..."
#   * "(一)..." or the full-width form (U+FF08 / U+FF09)
# The parentheses are literal characters here. Writing them unescaped turns the
# alternative into a capturing group that accepts any line beginning with a
# Chinese numeral, so ordinary sentences would be mistaken for headings.
SECTION_HEADING_PATTERN = re.compile(
    r"^("
    r"第[一二三四五六七八九十百零0-9]+[章节]\s*.*"
    r"|[一二三四五六七八九十]+、.*"
    r"|[(\uFF08][一二三四五六七八九十]+[)\uFF09].*"
    r")$"
)

# Bullet list item: "- x", "* x", "• x".
LIST_ITEM_PATTERN = re.compile(r"^[-*•]\s+.+$")

# Numbered list item: "1. x", "2) x", "3、x" or a circled digit.
NUMBERED_ITEM_PATTERN = re.compile(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*.+$")

# Article clause such as "第十二条 ...".
ARTICLE_PATTERN = re.compile(r"^(第[一二三四五六七八九十百零0-9]+条)\s*.*$")

# "key: value" line with a key of at most 41 characters. Both the ASCII colon
# and the full-width colon (U+FF1A) are accepted as the separator.
KEY_VALUE_PATTERN = re.compile(r"^[^:\uFF1A\s][^:\uFF1A]{0,40}[:\uFF1A]\s*.+$")

# Size limits applied while extracting tables, sections and QA clues.
MAX_TABLE_WINDOW_CHARS = 1800
MAX_TABLES_PER_DOCUMENT = 8
MAX_SECTION_OUTLINE_ITEMS = 12
MAX_SECTION_SNIPPETS = 8
MAX_SECTION_SNIPPET_CHARS = 220
MAX_SECTION_QA_CLUES = 4
MAX_TOTAL_QA_CLUES = 24
MAX_QA_CLUE_CHARS = 180

# Substrings whose presence marks a line as a likely policy fact.
FACT_KEYWORDS = (
    "适用",
    "标准",
    "条件",
    "流程",
    "审批",
    "提交",
    "附件",
    "材料",
    "票据",
    "报销",
    "限额",
    "金额",
    "比例",
    "范围",
    "对象",
    "人员",
    "时限",
    "工作日",
    "不得",
    "可以",
    "应当",
    "应",
    "需",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass(frozen=True, slots=True)
class TableCandidate:
    """Immutable text window that may contain a table worth normalizing."""

    # Heading used for the table in the appendix and in the model prompt.
    title: str
    # Raw text window taken from the document for this table.
    excerpt: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass(frozen=True, slots=True)
class SectionCandidate:
    """Immutable section made of a heading line and the lines that follow it."""

    # The heading line of the section.
    title: str
    # Short excerpt built from the section body.
    excerpt: str
    # Body lines collected under the heading, kept as an immutable tuple.
    body_lines: tuple[str, ...]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class KnowledgeNormalizationService:
    """Enrich raw policy text with appendices that make retrieval easier.

    The service appends up to three generated parts to the original text: a
    section outline with excerpts, a list of short question-answering clues,
    and Markdown tables re-rendered by the runtime chat service.
    """

    # Lines longer than this are treated as body text even when they start
    # like a heading.
    _MAX_SECTION_HEADING_CHARS = 80

    def __init__(self, db: Session) -> None:
        """Create the runtime chat service used to normalize tables.

        Args:
            db: Database session handed to ``RuntimeChatService``.
        """
        self.runtime_chat_service = RuntimeChatService(db)

    def build_enriched_text(self, raw_text: str) -> str:
        """Return ``raw_text`` followed by every appendix that could be built.

        Args:
            raw_text: Source document text; ``None`` and blank input are
                treated as empty.

        Returns:
            An empty string for blank input, the stripped text when no
            appendix applies, otherwise the stripped text and the appendices
            joined by blank lines.
        """
        normalized_text = str(raw_text or "").strip()
        if not normalized_text:
            return ""

        section_appendix = self._build_section_appendix(normalized_text)
        answer_clue_appendix = self._build_answer_clue_appendix(normalized_text)
        normalized_tables: list[str] = []
        for candidate in self._extract_table_candidates(normalized_text):
            rendered = self._normalize_table_candidate(candidate)
            if rendered:
                normalized_tables.append(f"## {candidate.title}\n\n{rendered}")

        appendix_parts: list[str] = []
        if section_appendix:
            appendix_parts.append(section_appendix)
        if answer_clue_appendix:
            appendix_parts.append(answer_clue_appendix)
        if normalized_tables:
            appendix = "\n\n".join(normalized_tables)
            appendix_parts.append(
                "# 结构化表格补充\n\n"
                "以下表格由知识归纳阶段依据原文重新整理,供问答检索时优先理解行列关系。\n\n"
                f"{appendix}"
            )

        if not appendix_parts:
            return normalized_text

        return "\n\n".join([normalized_text, *appendix_parts])

    @staticmethod
    def _extract_table_candidates(text: str) -> list[TableCandidate]:
        """Find text windows that start at a table marker and look tabular.

        A window starts at the beginning of the line holding the marker and
        spans at most ``MAX_TABLE_WINDOW_CHARS`` characters. Windows that
        overlap an accepted one are skipped, and at most
        ``MAX_TABLES_PER_DOCUMENT`` candidates are returned.

        Args:
            text: Document text to scan.

        Returns:
            Accepted candidates in document order.
        """
        candidates: list[TableCandidate] = []
        occupied_ranges: list[tuple[int, int]] = []

        for match in TABLE_MARKER_PATTERN.finditer(text):
            if len(candidates) >= MAX_TABLES_PER_DOCUMENT:
                break

            start = text.rfind("\n", 0, match.start())
            start = 0 if start < 0 else start + 1
            end = min(len(text), start + MAX_TABLE_WINDOW_CHARS)
            if any(start < existing_end and end > existing_start for existing_start, existing_end in occupied_ranges):
                continue

            excerpt = text[start:end].strip()
            head = excerpt[:360]
            # The unit label may use an ASCII or a full-width (U+FF1A) colon.
            has_unit_label = "单位:" in head or "单位\uFF1A" in head
            if not has_unit_label and "标准" not in head:
                continue
            if excerpt.count("\n") < 6 or sum(char.isdigit() for char in excerpt) < 4:
                continue

            # The marker regex allows any whitespace between "表" and the
            # number, so compare whitespace-free forms; otherwise a caption
            # such as "表 1 住宿标准" would lose its descriptive title.
            marker = re.sub(r"\s+", "", match.group(0))
            first_line = next((line.strip() for line in excerpt.splitlines() if line.strip()), marker)
            title = first_line if re.sub(r"\s+", "", first_line).startswith(marker) else marker
            candidates.append(TableCandidate(title=title, excerpt=excerpt))
            occupied_ranges.append((start, end))

        return candidates

    def _normalize_table_candidate(self, candidate: TableCandidate) -> str:
        """Ask the chat service to re-render a candidate as a Markdown table.

        Args:
            candidate: Table window to normalize.

        Returns:
            The Markdown table, or an empty string when the reply is empty,
            declines, or contains fewer than six ``|`` characters.
        """
        messages = [
            {
                "role": "system",
                "content": (
                    "你是制度文档结构化助手。"
                    "只依据用户提供的原文,提炼其中的表格为清晰 Markdown。"
                    "必须严格按照表头从左到右对齐每个数值,不能猜测、不能改列顺序、不能擅自补全。"
                    "只输出一张 Markdown 表格本身,不要输出标题、说明、注释、脚注或正文解释。"
                    "如果原文不足以确认表格关系,只回复“无法确认”。"
                    "不要输出思考过程,不要复述原文,不要添加制度之外的新事实。"
                ),
            },
            {
                "role": "user",
                "content": (
                    f"请仅整理下面这段制度表格,标题为《{candidate.title}》。\n\n"
                    f"{candidate.excerpt}"
                ),
            },
        ]
        answer = self.runtime_chat_service.complete(
            messages,
            max_tokens=900,
            temperature=0.0,
        )
        # _sanitize_answer keeps only lines containing "|", so a plain refusal
        # normally arrives here as an empty string already.
        cleaned = self._sanitize_answer(answer)
        if not cleaned or cleaned == "无法确认":
            return ""
        if cleaned.count("|") < 6:
            logger.info("Skip non-tabular normalization candidate title=%s", candidate.title)
            return ""
        return cleaned

    @staticmethod
    def _build_section_appendix(text: str) -> str:
        """Build the section outline and excerpt appendix.

        Args:
            text: Document text.

        Returns:
            The appendix, or an empty string when fewer than two sections
            were found or no excerpt is available.
        """
        candidates = KnowledgeNormalizationService._extract_section_candidates(text)
        if len(candidates) < 2:
            return ""

        outline = "\n".join(
            f"- {item.title}"
            for item in candidates[:MAX_SECTION_OUTLINE_ITEMS]
        )
        snippets = "\n\n".join(
            [
                f"## {item.title}\n\n{item.excerpt}"
                for item in candidates[:MAX_SECTION_SNIPPETS]
                if item.excerpt
            ]
        )
        if not snippets:
            return ""

        return (
            "# 章节导航\n\n"
            "以下内容由入库阶段从制度原文中提取,供检索时优先理解制度层级、条目和标准所在章节。\n\n"
            f"{outline}\n\n"
            "# 重点章节摘录\n\n"
            f"{snippets}"
        )

    @staticmethod
    def _build_answer_clue_appendix(text: str) -> str:
        """Build the appendix of short question-answering clues.

        Clues come from the detected sections, or from the whole text when
        no section was detected. They are whitespace-normalized, de-duplicated
        in order and capped at ``MAX_TOTAL_QA_CLUES``.

        Args:
            text: Document text.

        Returns:
            The appendix, or an empty string when fewer than two clues remain.
        """
        candidates = KnowledgeNormalizationService._extract_section_candidates(text)
        clue_lines: list[str] = []

        if candidates:
            for candidate in candidates:
                clue_lines.extend(
                    KnowledgeNormalizationService._extract_section_clues(candidate)
                )
        else:
            clue_lines.extend(KnowledgeNormalizationService._extract_freeform_clues(text))

        deduped: list[str] = []
        seen: set[str] = set()
        for item in clue_lines:
            normalized = re.sub(r"\s+", " ", str(item or "")).strip()
            if not normalized or normalized in seen:
                continue
            seen.add(normalized)
            deduped.append(normalized)
            if len(deduped) >= MAX_TOTAL_QA_CLUES:
                break

        if len(deduped) < 2:
            return ""

        return (
            "# 问答线索补充\n\n"
            "以下内容由入库阶段根据章节标题、条款、列表、键值对与相邻正文提炼,"
            "供问答检索时优先命中更短、更直接的制度依据。\n\n"
            + "\n".join(f"- {item}" for item in deduped)
        )

    @staticmethod
    def _is_section_heading(line: str) -> bool:
        """Return whether ``line`` is short enough and shaped like a heading."""
        if len(line) > KnowledgeNormalizationService._MAX_SECTION_HEADING_CHARS:
            return False
        return SECTION_HEADING_PATTERN.match(line) is not None

    @staticmethod
    def _extract_section_candidates(text: str) -> list[SectionCandidate]:
        """Split the text into sections at heading lines.

        Text before the first heading is ignored, and a heading whose body
        yields no excerpt is dropped.

        Args:
            text: Document text.

        Returns:
            Sections in document order.
        """
        lines = [line.rstrip() for line in str(text or "").splitlines()]
        sections: list[SectionCandidate] = []
        current_title = ""
        current_body: list[str] = []

        def flush() -> None:
            """Store the section being collected, then reset the collector."""
            nonlocal current_title, current_body
            if not current_title:
                current_body = []
                return
            excerpt = KnowledgeNormalizationService._build_section_excerpt(current_body)
            if excerpt:
                sections.append(
                    SectionCandidate(
                        title=current_title,
                        excerpt=excerpt,
                        body_lines=tuple(current_body),
                    )
                )
            current_title = ""
            current_body = []

        for raw_line in lines:
            line = raw_line.strip()
            if not line:
                # Keep blank separators only once the body has content.
                if current_body:
                    current_body.append("")
                continue

            if KnowledgeNormalizationService._is_section_heading(line):
                flush()
                current_title = line
                continue

            if current_title:
                current_body.append(line)

        flush()
        return sections

    @staticmethod
    def _build_section_excerpt(lines: list[str]) -> str:
        """Join the first three non-blank lines into a bounded excerpt.

        Args:
            lines: Body lines of one section.

        Returns:
            The excerpt, cut to ``MAX_SECTION_SNIPPET_CHARS`` characters with
            a trailing ``...`` when it was too long, or an empty string when
            every line is blank.
        """
        cleaned_lines = [line.strip() for line in lines if line.strip()]
        if not cleaned_lines:
            return ""
        excerpt = ";".join(cleaned_lines[:3]).strip()
        if len(excerpt) <= MAX_SECTION_SNIPPET_CHARS:
            return excerpt
        return f"{excerpt[: MAX_SECTION_SNIPPET_CHARS - 3].rstrip()}..."

    @staticmethod
    def _extract_section_clues(candidate: SectionCandidate) -> list[str]:
        """Collect clues from one section.

        Args:
            candidate: Section to read.

        Returns:
            Up to ``MAX_SECTION_QA_CLUES`` fact-like clues, or up to two
            other rendered units when no fact-like unit was found.
        """
        clues: list[str] = []
        fallback: list[str] = []

        for raw_line in candidate.body_lines:
            normalized_line = KnowledgeNormalizationService._normalize_fact_line(raw_line)
            if not normalized_line or KnowledgeNormalizationService._is_table_like_line(normalized_line):
                continue

            fact_units = KnowledgeNormalizationService._split_fact_units(normalized_line)
            for unit in fact_units:
                rendered = KnowledgeNormalizationService._render_clue(candidate.title, unit)
                if not rendered:
                    continue
                if KnowledgeNormalizationService._looks_like_fact_line(unit):
                    clues.append(rendered)
                elif len(fallback) < 2:
                    fallback.append(rendered)

            if len(clues) >= MAX_SECTION_QA_CLUES:
                return clues[:MAX_SECTION_QA_CLUES]

        return clues[:MAX_SECTION_QA_CLUES] or fallback[:2]

    @staticmethod
    def _extract_freeform_clues(text: str) -> list[str]:
        """Collect clues from text in which no section was detected.

        Heading lines, table-like lines and lines that do not look like
        facts are skipped. Headings are recognised with the same length limit
        as section detection, so a long numbered paragraph still counts as
        body text.

        Args:
            text: Document text.

        Returns:
            At most ``MAX_TOTAL_QA_CLUES`` clues prefixed with "正文".
        """
        clues: list[str] = []
        for raw_line in str(text or "").splitlines():
            normalized_line = KnowledgeNormalizationService._normalize_fact_line(raw_line)
            if (
                not normalized_line
                or KnowledgeNormalizationService._is_section_heading(normalized_line)
                or KnowledgeNormalizationService._is_table_like_line(normalized_line)
                or not KnowledgeNormalizationService._looks_like_fact_line(normalized_line)
            ):
                continue

            for unit in KnowledgeNormalizationService._split_fact_units(normalized_line):
                rendered = KnowledgeNormalizationService._render_clue("正文", unit)
                if rendered:
                    clues.append(rendered)
                    if len(clues) >= MAX_TOTAL_QA_CLUES:
                        return clues
        return clues

    @staticmethod
    def _split_fact_units(line: str) -> list[str]:
        """Split a line into clause-sized units.

        A line is split at semicolons (ASCII or full-width U+FF1B) and at the
        ideographic full stop. A line with none of them that fits within
        ``MAX_QA_CLUE_CHARS`` is returned as a single unit.

        Args:
            line: Line to split.

        Returns:
            The non-empty units, or an empty list for blank input.
        """
        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
        if not normalized:
            return []
        if len(normalized) <= MAX_QA_CLUE_CHARS and all(
            mark not in normalized for mark in (";", "\uFF1B", "。")
        ):
            return [normalized]

        units: list[str] = []
        for part in re.split(r"[;\uFF1B。]\s*", normalized):
            cleaned = KnowledgeNormalizationService._normalize_fact_line(part)
            if not cleaned:
                continue
            units.append(cleaned)
        return units or [KnowledgeNormalizationService._truncate_clue(normalized)]

    @staticmethod
    def _normalize_fact_line(line: str) -> str:
        """Collapse whitespace runs and trim spaces and hyphens at both ends."""
        normalized = str(line or "").strip()
        normalized = re.sub(r"\s+", " ", normalized)
        return normalized.strip(" -")

    @staticmethod
    def _is_table_like_line(line: str) -> bool:
        """Return whether a line looks like a table row rather than prose.

        Args:
            line: Line to inspect.

        Returns:
            ``True`` for lines with at least two ``|`` or two tab characters,
            for unpunctuated lines with at least three numbers and four
            whitespace-separated tokens, and for unit-label lines with at
            least three digits.
        """
        normalized = str(line or "").strip()
        if not normalized:
            return False
        if normalized.count("|") >= 2:
            return True
        if normalized.count("\t") >= 2:
            return True
        number_tokens = re.findall(r"\d+(?:[.][0-9]+)?", normalized)
        # Sentence punctuation, in ASCII or full-width form, marks prose.
        if len(number_tokens) >= 3 and len(normalized.split()) >= 4 and not any(
            punct in normalized for punct in ("。", ";", "\uFF1B", ":", "\uFF1A")
        ):
            return True
        has_unit_label = "单位:" in normalized or "单位\uFF1A" in normalized
        return has_unit_label and sum(char.isdigit() for char in normalized) >= 3

    @staticmethod
    def _looks_like_fact_line(line: str) -> bool:
        """Return whether a line is likely to state a policy fact.

        Args:
            line: Line to inspect.

        Returns:
            ``False`` for lines shorter than six characters, lines holding a
            table marker and lines starting with a unit label. Otherwise
            ``True`` when the line is an article, list item, numbered item or
            key-value pair, contains a fact keyword, or contains a digit.
        """
        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
        if len(normalized) < 6:
            return False
        # The unit label may use an ASCII or a full-width (U+FF1A) colon.
        if TABLE_MARKER_PATTERN.search(normalized) or normalized.startswith(("单位:", "单位\uFF1A")):
            return False
        if (
            ARTICLE_PATTERN.match(normalized)
            or LIST_ITEM_PATTERN.match(normalized)
            or NUMBERED_ITEM_PATTERN.match(normalized)
            or KEY_VALUE_PATTERN.match(normalized)
        ):
            return True
        if any(keyword in normalized for keyword in FACT_KEYWORDS):
            return True
        return any(char.isdigit() for char in normalized)

    @staticmethod
    def _render_clue(section_title: str, line: str) -> str:
        """Format one clue as ``title:line``.

        Args:
            section_title: Title placed before the clue; may be blank.
            line: Clue text, truncated to ``MAX_QA_CLUE_CHARS``.

        Returns:
            The formatted clue, the bare line when the title is blank, or an
            empty string when the line is blank.
        """
        normalized_line = KnowledgeNormalizationService._truncate_clue(line)
        if not normalized_line:
            return ""
        normalized_title = str(section_title or "").strip()
        if not normalized_title:
            return normalized_line
        return f"{normalized_title}:{normalized_line}"

    @staticmethod
    def _truncate_clue(line: str) -> str:
        """Normalize a line and cut it to ``MAX_QA_CLUE_CHARS`` characters.

        A line that is too long ends with ``...`` after truncation.
        """
        normalized = KnowledgeNormalizationService._normalize_fact_line(line)
        if len(normalized) <= MAX_QA_CLUE_CHARS:
            return normalized
        return f"{normalized[: MAX_QA_CLUE_CHARS - 3].rstrip()}..."

    @staticmethod
    def _sanitize_answer(answer: str | None) -> str:
        """Extract the first Markdown table from a model reply.

        ``<think>...</think>`` spans are removed first. The result is the
        first run of consecutive lines that contain ``|``.

        Args:
            answer: Raw reply; ``None`` is treated as empty.

        Returns:
            The table lines joined by newlines, or an empty string when no
            line contains ``|``.
        """
        cleaned = re.sub(r"<think>.*?</think>", "", str(answer or ""), flags=re.DOTALL | re.IGNORECASE)
        lines = [line.rstrip() for line in cleaned.strip().splitlines()]
        table_lines: list[str] = []
        for line in lines:
            normalized = line.strip()
            if "|" not in normalized:
                # The first non-table line after the table ends the run.
                if table_lines:
                    break
                continue
            table_lines.append(normalized)
        return "\n".join(table_lines).strip()
|