2026-05-22 10:42:31 +08:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
from app.services.user_agent_knowledge_constants import (
|
|
|
|
|
|
KNOWLEDGE_ARTICLE_PATTERN,
|
|
|
|
|
|
KNOWLEDGE_LIST_ITEM_PATTERN,
|
|
|
|
|
|
KNOWLEDGE_NUMBERED_ITEM_PATTERN,
|
|
|
|
|
|
KNOWLEDGE_QUERY_STOPWORDS,
|
|
|
|
|
|
KNOWLEDGE_SECTION_HEADING_PATTERN,
|
|
|
|
|
|
MAX_KNOWLEDGE_MODEL_HITS,
|
|
|
|
|
|
MAX_KNOWLEDGE_QUERY_TERMS,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class UserAgentKnowledgeHelpersMixin:
|
2026-05-24 21:44:17 +08:00
|
|
|
|
# Title fragments that _score_knowledge_model_hit excludes (via ``not in``)
# from its extra bonus for long query terms found in a hit's title.
GENERIC_KNOWLEDGE_TITLE_TERMS = {"远光软件", "股份有限", "有限公司"}

# Phrases that _extract_knowledge_query_terms adds to the term list whenever a
# question block contains them; the same tuple filters its tail n-grams.
# Tuple order is the order in which the phrases are tried.
KNOWLEDGE_QUERY_ANCHOR_TERMS = (
    "财务基础知识手册", "基础知识手册",
    "会计科目", "常用会计科目",
    "财务报表",
    "主要税种", "税种",
    "标准", "清单", "明细", "流程",
)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
@staticmethod
def _select_knowledge_model_hits(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> list[dict[str, Any]]:
    """Return at most ``MAX_KNOWLEDGE_MODEL_HITS`` hit dicts from the payload.

    Non-dict entries of ``tool_payload["hits"]`` are dropped and the remaining
    list is trimmed to a small candidate pool. When the question yields query
    terms the pool is re-ranked by ``_score_knowledge_model_hit`` (higher
    score first, earlier original position winning ties); otherwise the
    original order is kept.

    Args:
        tool_payload: Mapping whose ``"hits"`` entry holds the raw hits.
        question: Question text used to derive the ranking terms.

    Returns:
        The selected hit dicts, best first.
    """
    mixin = UserAgentKnowledgeHelpersMixin
    pool_size = max(MAX_KNOWLEDGE_MODEL_HITS + 3, 8)
    candidates = [
        hit
        for hit in list(tool_payload.get("hits") or [])
        if isinstance(hit, dict)
    ][:pool_size]
    if not candidates:
        return []

    terms = mixin._extract_knowledge_query_terms(question or "")
    if not terms:
        return candidates[:MAX_KNOWLEDGE_MODEL_HITS]

    def sort_key(pair: tuple[int, dict[str, Any]]) -> tuple[int, int]:
        position, hit = pair
        hit_score = mixin._score_knowledge_model_hit(
            hit,
            query_terms=terms,
            rank_index=position,
        )
        # Negated position: under reverse=True an earlier hit wins a tie.
        return (hit_score, -position)

    ordered = sorted(enumerate(candidates), key=sort_key, reverse=True)
    return [hit for _, hit in ordered[:MAX_KNOWLEDGE_MODEL_HITS]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _score_knowledge_model_hit(
|
|
|
|
|
|
item: dict[str, Any],
|
|
|
|
|
|
*,
|
|
|
|
|
|
query_terms: list[str],
|
|
|
|
|
|
rank_index: int,
|
|
|
|
|
|
) -> int:
|
|
|
|
|
|
title = str(item.get("title") or item.get("document_name") or "").lower()
|
|
|
|
|
|
excerpt = str(item.get("excerpt") or "").lower()
|
|
|
|
|
|
content = str(item.get("content") or "").lower()
|
|
|
|
|
|
haystack = "\n".join([title, excerpt, content[:1400]])
|
|
|
|
|
|
|
|
|
|
|
|
matched_terms = [term for term in query_terms if term in haystack]
|
|
|
|
|
|
score = max(1, 48 - rank_index * 4)
|
|
|
|
|
|
score += len(matched_terms) * 10
|
2026-05-24 21:44:17 +08:00
|
|
|
|
score += sum(max(0, len(term) - 4) * 8 for term in matched_terms)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
score += sum(1 for term in matched_terms if term in title) * 8
|
2026-05-24 21:44:17 +08:00
|
|
|
|
score += sum(max(0, len(term) - 4) * 6 for term in matched_terms if term in title)
|
|
|
|
|
|
score += sum(
|
|
|
|
|
|
(len(term) - 3) * 10
|
|
|
|
|
|
for term in matched_terms
|
|
|
|
|
|
if len(term) >= 4
|
|
|
|
|
|
and term in title
|
|
|
|
|
|
and term not in UserAgentKnowledgeHelpersMixin.GENERIC_KNOWLEDGE_TITLE_TERMS
|
|
|
|
|
|
)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
leading_marker = UserAgentKnowledgeHelpersMixin._leading_knowledge_appendix_marker(content)
|
|
|
|
|
|
if leading_marker == "# 章节导航":
|
|
|
|
|
|
score -= 22
|
|
|
|
|
|
elif leading_marker == "# 问答线索补充":
|
|
|
|
|
|
score += 6 if matched_terms else -8
|
|
|
|
|
|
elif leading_marker == "# 重点章节摘录":
|
|
|
|
|
|
score += 4 if matched_terms else -4
|
|
|
|
|
|
elif leading_marker == "# 结构化表格补充":
|
|
|
|
|
|
score += 8 if matched_terms else -3
|
|
|
|
|
|
|
|
|
|
|
|
if matched_terms and "|" in content:
|
|
|
|
|
|
score += 8
|
|
|
|
|
|
if matched_terms and any(marker in content for marker in (":", ":")):
|
|
|
|
|
|
score += 10
|
|
|
|
|
|
if matched_terms and "\n" in content:
|
|
|
|
|
|
score += 4
|
|
|
|
|
|
if matched_terms and any(marker in content for marker in ("附表", "第", "条")):
|
|
|
|
|
|
score += 4
|
|
|
|
|
|
if matched_terms and any(marker in content for marker in ("第", "条", ":", "-", "•")):
|
|
|
|
|
|
score += 4
|
|
|
|
|
|
if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
|
|
|
|
|
|
score -= 12
|
|
|
|
|
|
return score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _leading_knowledge_appendix_marker(content: str) -> str:
|
|
|
|
|
|
normalized = str(content or "").lstrip()
|
|
|
|
|
|
for marker in ("# 章节导航", "# 重点章节摘录", "# 问答线索补充", "# 结构化表格补充"):
|
|
|
|
|
|
index = normalized.find(marker)
|
|
|
|
|
|
if 0 <= index <= 220:
|
|
|
|
|
|
return marker
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _prioritize_knowledge_evidence_items(
|
|
|
|
|
|
self,
|
|
|
|
|
|
question: str,
|
|
|
|
|
|
evidence_items: list[dict[str, Any]],
|
|
|
|
|
|
) -> list[dict[str, Any]]:
|
|
|
|
|
|
if not evidence_items or not self._question_requires_explicit_condition(question):
|
|
|
|
|
|
return evidence_items
|
|
|
|
|
|
|
|
|
|
|
|
for preferred_kind in ("table", "kv", "clause", "list"):
|
|
|
|
|
|
for index, item in enumerate(evidence_items):
|
|
|
|
|
|
if str(item.get("kind") or "") != preferred_kind:
|
|
|
|
|
|
continue
|
|
|
|
|
|
return [item, *evidence_items[:index], *evidence_items[index + 1 :]]
|
|
|
|
|
|
|
|
|
|
|
|
for index, item in enumerate(evidence_items):
|
|
|
|
|
|
if re.search(r"\d", str(item.get("content") or "")):
|
|
|
|
|
|
return [item, *evidence_items[:index], *evidence_items[index + 1 :]]
|
|
|
|
|
|
|
|
|
|
|
|
return evidence_items
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _is_knowledge_lead_in_segment(item: dict[str, str]) -> bool:
|
|
|
|
|
|
kind = str(item.get("kind") or "").strip()
|
|
|
|
|
|
content = str(item.get("content") or "").strip()
|
|
|
|
|
|
return kind in {"kv", "list", "clause"} and content.endswith((":", ":"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _extract_knowledge_marker_family(content: str) -> str:
|
|
|
|
|
|
normalized = str(content or "").strip()
|
|
|
|
|
|
if not normalized:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
if KNOWLEDGE_ARTICLE_PATTERN.match(normalized):
|
|
|
|
|
|
return "article"
|
|
|
|
|
|
if re.match(r"^\d+[.)、]\s*", normalized):
|
|
|
|
|
|
return "arabic"
|
|
|
|
|
|
if re.match(r"^[((][一二三四五六七八九十百零0-9]+[))]\s*", normalized):
|
|
|
|
|
|
return "paren"
|
|
|
|
|
|
if re.match(r"^[①②③④⑤⑥⑦⑧⑨⑩]\s*", normalized):
|
|
|
|
|
|
return "circled"
|
|
|
|
|
|
if KNOWLEDGE_LIST_ITEM_PATTERN.match(normalized):
|
|
|
|
|
|
return "bullet"
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-24 21:44:17 +08:00
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _knowledge_list_marker_sort_key(content: str) -> int:
|
|
|
|
|
|
normalized = str(content or "").strip()
|
|
|
|
|
|
match = re.match(r"^[((]([一二三四五六七八九十百零0-9]+)[))]", normalized)
|
|
|
|
|
|
if not match:
|
|
|
|
|
|
return 999
|
|
|
|
|
|
marker = match.group(1)
|
|
|
|
|
|
if marker.isdigit():
|
|
|
|
|
|
return int(marker)
|
|
|
|
|
|
values = {
|
|
|
|
|
|
"零": 0,
|
|
|
|
|
|
"一": 1,
|
|
|
|
|
|
"二": 2,
|
|
|
|
|
|
"三": 3,
|
|
|
|
|
|
"四": 4,
|
|
|
|
|
|
"五": 5,
|
|
|
|
|
|
"六": 6,
|
|
|
|
|
|
"七": 7,
|
|
|
|
|
|
"八": 8,
|
|
|
|
|
|
"九": 9,
|
|
|
|
|
|
"十": 10,
|
|
|
|
|
|
}
|
|
|
|
|
|
if marker in values:
|
|
|
|
|
|
return values[marker]
|
|
|
|
|
|
if marker.startswith("十") and len(marker) == 2:
|
|
|
|
|
|
return 10 + values.get(marker[1], 0)
|
|
|
|
|
|
if marker.endswith("十") and len(marker) == 2:
|
|
|
|
|
|
return values.get(marker[0], 0) * 10
|
|
|
|
|
|
if "十" in marker:
|
|
|
|
|
|
left, right = marker.split("十", 1)
|
|
|
|
|
|
return values.get(left, 1) * 10 + values.get(right, 0)
|
|
|
|
|
|
return 999
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _format_knowledge_heading_label(heading: str) -> str:
|
|
|
|
|
|
parts = [item.strip() for item in str(heading or "").split(">") if item.strip()]
|
|
|
|
|
|
return " / ".join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-24 21:44:17 +08:00
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _has_inline_numbered_knowledge_items(content: str) -> bool:
|
|
|
|
|
|
return len(
|
|
|
|
|
|
re.findall(
|
|
|
|
|
|
r"[((][一二三四五六七八九十百零0-9]+[))]",
|
|
|
|
|
|
str(content or ""),
|
|
|
|
|
|
)
|
|
|
|
|
|
) >= 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _split_inline_numbered_knowledge_items(content: str) -> list[str]:
|
|
|
|
|
|
normalized = str(content or "").strip()
|
|
|
|
|
|
if not UserAgentKnowledgeHelpersMixin._has_inline_numbered_knowledge_items(normalized):
|
|
|
|
|
|
return [normalized] if normalized else []
|
|
|
|
|
|
|
|
|
|
|
|
marker_pattern = r"[((][一二三四五六七八九十百零0-9]+[))]"
|
|
|
|
|
|
first_marker = re.search(marker_pattern, normalized)
|
|
|
|
|
|
if first_marker is None:
|
|
|
|
|
|
return [normalized] if normalized else []
|
|
|
|
|
|
|
|
|
|
|
|
prefix = normalized[: first_marker.start()].strip(" ::")
|
|
|
|
|
|
tail = normalized[first_marker.start() :].strip()
|
|
|
|
|
|
item_pattern = (
|
|
|
|
|
|
r"([((][一二三四五六七八九十百零0-9]+[))]\s*.*?"
|
|
|
|
|
|
r"(?=\s*[((][一二三四五六七八九十百零0-9]+[))]|\s*$))"
|
|
|
|
|
|
)
|
|
|
|
|
|
items = [item.strip() for item in re.findall(item_pattern, tail) if item.strip()]
|
|
|
|
|
|
if prefix:
|
|
|
|
|
|
return [prefix, *items]
|
|
|
|
|
|
return items or [normalized]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _focus_knowledge_segment_content(content: str, query_terms: list[str]) -> str:
|
|
|
|
|
|
normalized = re.sub(r"\s+", " ", str(content or "").strip())
|
|
|
|
|
|
if not normalized:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
anchor_terms = sorted(
|
|
|
|
|
|
{
|
|
|
|
|
|
str(term or "").strip()
|
|
|
|
|
|
for term in query_terms
|
|
|
|
|
|
if len(str(term or "").strip()) >= 3
|
|
|
|
|
|
},
|
|
|
|
|
|
key=len,
|
|
|
|
|
|
reverse=True,
|
|
|
|
|
|
)
|
|
|
|
|
|
anchor_index = -1
|
|
|
|
|
|
for term in anchor_terms:
|
|
|
|
|
|
anchor_index = normalized.lower().find(term.lower())
|
|
|
|
|
|
if anchor_index >= 0:
|
|
|
|
|
|
break
|
|
|
|
|
|
if anchor_index < 0:
|
|
|
|
|
|
return normalized
|
|
|
|
|
|
|
|
|
|
|
|
prefix_window = normalized[max(0, anchor_index - 40) : anchor_index]
|
|
|
|
|
|
marker_match = None
|
|
|
|
|
|
for match in re.finditer(
|
|
|
|
|
|
r"(?:第[一二三四五六七八九十百零0-9]+[部分章节条]|[一二三四五六七八九十]+、|[((][一二三四五六七八九十百零0-9]+[))])",
|
|
|
|
|
|
prefix_window,
|
|
|
|
|
|
):
|
|
|
|
|
|
marker_match = match
|
|
|
|
|
|
start = anchor_index
|
|
|
|
|
|
if marker_match is not None:
|
|
|
|
|
|
start = max(0, anchor_index - len(prefix_window) + marker_match.start())
|
|
|
|
|
|
|
|
|
|
|
|
return normalized[start : start + 700].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _split_markdown_table_cells(line: str) -> list[str]:
|
|
|
|
|
|
stripped = str(line or "").strip()
|
|
|
|
|
|
if stripped.startswith("|"):
|
|
|
|
|
|
stripped = stripped[1:]
|
|
|
|
|
|
if stripped.endswith("|"):
|
|
|
|
|
|
stripped = stripped[:-1]
|
|
|
|
|
|
return [
|
|
|
|
|
|
re.sub(r"\s+", " ", cell.replace("**", "").strip())
|
|
|
|
|
|
for cell in stripped.split("|")
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
|
def _summarize_knowledge_table_preview(cls, preview: str) -> str:
|
|
|
|
|
|
rows: list[list[str]] = []
|
|
|
|
|
|
for line in str(preview or "").splitlines():
|
|
|
|
|
|
if line.count("|") < 2:
|
|
|
|
|
|
continue
|
|
|
|
|
|
cells = cls._split_markdown_table_cells(line)
|
|
|
|
|
|
if not cells or all(re.fullmatch(r":?-{2,}:?", cell.replace(" ", "")) for cell in cells):
|
|
|
|
|
|
continue
|
|
|
|
|
|
rows.append(cells)
|
|
|
|
|
|
|
|
|
|
|
|
if len(rows) < 2:
|
|
|
|
|
|
return "可直接参考的标准表如下。"
|
|
|
|
|
|
|
|
|
|
|
|
header = rows[0]
|
|
|
|
|
|
data_rows = [row for row in rows[1:] if len(row) == len(header)]
|
|
|
|
|
|
if len(data_rows) == 1 and len(header) >= 2:
|
|
|
|
|
|
row = data_rows[0]
|
|
|
|
|
|
subject = row[0] or "该项目"
|
|
|
|
|
|
pairs = [
|
|
|
|
|
|
f"{label}:{value}"
|
|
|
|
|
|
for label, value in zip(header[1:], row[1:])
|
|
|
|
|
|
if label and value and value not in {"-", "—"}
|
|
|
|
|
|
]
|
|
|
|
|
|
if pairs:
|
|
|
|
|
|
return f"{subject}的标准为:{';'.join(pairs)}。"
|
|
|
|
|
|
|
|
|
|
|
|
return "相关标准项如下,请按表头和行内容对应使用。"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _summarize_knowledge_lines_conclusion(
|
|
|
|
|
|
self,
|
|
|
|
|
|
lines: list[str],
|
|
|
|
|
|
*,
|
|
|
|
|
|
heading: str = "",
|
|
|
|
|
|
) -> str:
|
|
|
|
|
|
clean_lines = [
|
|
|
|
|
|
self._clean_knowledge_segment_text(line)
|
|
|
|
|
|
for line in lines
|
|
|
|
|
|
if self._clean_knowledge_segment_text(line)
|
|
|
|
|
|
]
|
|
|
|
|
|
if not clean_lines:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
clean_heading = str(heading or "").strip()
|
|
|
|
|
|
if not clean_heading and clean_lines and ":" not in clean_lines[0] and ":" not in clean_lines[0]:
|
|
|
|
|
|
clean_heading = clean_lines[0]
|
|
|
|
|
|
clean_heading = re.sub(
|
|
|
|
|
|
r"^[一二三四五六七八九十百零0-9]+、\s*",
|
|
|
|
|
|
"",
|
|
|
|
|
|
clean_heading,
|
|
|
|
|
|
)
|
|
|
|
|
|
item_labels: list[str] = []
|
|
|
|
|
|
for line in clean_lines:
|
|
|
|
|
|
if ":" not in line and ":" not in line:
|
|
|
|
|
|
continue
|
|
|
|
|
|
label = re.split(r"[::]", line, maxsplit=1)[0].strip()
|
|
|
|
|
|
if 1 <= len(label) <= 24:
|
|
|
|
|
|
item_labels.append(label)
|
|
|
|
|
|
|
|
|
|
|
|
if clean_heading and len(item_labels) >= 2:
|
|
|
|
|
|
return f"{clean_heading}包括:{'、'.join(item_labels[:6])}。"
|
|
|
|
|
|
if item_labels:
|
|
|
|
|
|
return f"{item_labels[0]}:{clean_lines[0].split(':', 1)[-1].strip()}"
|
|
|
|
|
|
return clean_lines[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _knowledge_lines_have_multiple_labeled_items(lines: list[str]) -> bool:
|
|
|
|
|
|
labeled_count = 0
|
|
|
|
|
|
for line in lines:
|
|
|
|
|
|
normalized = str(line or "").strip()
|
|
|
|
|
|
if ":" not in normalized and ":" not in normalized:
|
|
|
|
|
|
continue
|
|
|
|
|
|
label = re.split(r"[::]", normalized, maxsplit=1)[0].strip()
|
|
|
|
|
|
if 1 <= len(label) <= 24:
|
|
|
|
|
|
labeled_count += 1
|
|
|
|
|
|
return labeled_count >= 2
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
def _score_knowledge_evidence_candidate(
|
|
|
|
|
|
self,
|
|
|
|
|
|
item: dict[str, str],
|
|
|
|
|
|
query_terms: list[str],
|
|
|
|
|
|
) -> int:
|
|
|
|
|
|
heading = str(item.get("heading") or "").lower()
|
|
|
|
|
|
content = str(item.get("content") or "").lower()
|
|
|
|
|
|
kind = str(item.get("kind") or "").strip()
|
|
|
|
|
|
haystack = "\n".join([heading, content])
|
|
|
|
|
|
|
|
|
|
|
|
matched_terms = [term for term in query_terms if term in haystack]
|
|
|
|
|
|
score = len(matched_terms) * 10
|
2026-05-24 21:44:17 +08:00
|
|
|
|
score += sum(max(0, len(term) - 4) * 8 for term in matched_terms)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
score += sum(1 for term in matched_terms if term in heading) * 6
|
2026-05-24 21:44:17 +08:00
|
|
|
|
score += sum(max(0, len(term) - 4) * 6 for term in matched_terms if term in heading)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
if kind == "table":
|
|
|
|
|
|
score += 10
|
2026-05-24 21:44:17 +08:00
|
|
|
|
if content.count("\n") < 2:
|
|
|
|
|
|
score -= 24
|
2026-05-22 10:42:31 +08:00
|
|
|
|
elif kind in {"kv", "clause", "list"}:
|
|
|
|
|
|
score += 8
|
|
|
|
|
|
elif kind == "paragraph":
|
|
|
|
|
|
score += 4
|
|
|
|
|
|
|
|
|
|
|
|
if "问答线索补充" in heading or "重点章节摘录" in heading:
|
|
|
|
|
|
score += 8
|
|
|
|
|
|
if "结构化表格补充" in heading:
|
|
|
|
|
|
score += 10
|
|
|
|
|
|
if "章节导航" in heading or "目录" in heading:
|
|
|
|
|
|
score -= 16
|
|
|
|
|
|
if re.search(r"[.。…]{6,}", content):
|
|
|
|
|
|
score -= 12
|
|
|
|
|
|
if any(hint in content for hint in ("应", "需", "不得", "可以", "标准", "条件", "材料", "审批", "流程", "包括")):
|
|
|
|
|
|
score += 3
|
|
|
|
|
|
|
|
|
|
|
|
content_length = len(content)
|
|
|
|
|
|
if content_length > 220:
|
|
|
|
|
|
score -= min(8, (content_length - 220) // 40)
|
|
|
|
|
|
return score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _extract_knowledge_query_terms(question: str) -> list[str]:
|
|
|
|
|
|
normalized_question = str(question or "").strip().lower()
|
|
|
|
|
|
if not normalized_question:
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
terms: list[str] = []
|
|
|
|
|
|
seen: set[str] = set()
|
|
|
|
|
|
|
|
|
|
|
|
def remember(term: str) -> None:
|
|
|
|
|
|
normalized = str(term or "").strip().lower()
|
|
|
|
|
|
if (
|
|
|
|
|
|
not normalized
|
|
|
|
|
|
or normalized in seen
|
|
|
|
|
|
or normalized in KNOWLEDGE_QUERY_STOPWORDS
|
|
|
|
|
|
):
|
|
|
|
|
|
return
|
|
|
|
|
|
seen.add(normalized)
|
|
|
|
|
|
terms.append(normalized)
|
|
|
|
|
|
|
|
|
|
|
|
for item in re.findall(r"[a-z0-9][a-z0-9_\-]{1,}", normalized_question):
|
|
|
|
|
|
remember(item)
|
|
|
|
|
|
|
|
|
|
|
|
for block in re.findall(r"[\u4e00-\u9fff]{2,20}", normalized_question):
|
2026-05-24 21:44:17 +08:00
|
|
|
|
remember(block)
|
|
|
|
|
|
if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
|
|
|
|
|
|
return terms
|
|
|
|
|
|
for marker in ("标准", "金额", "限额", "额度"):
|
|
|
|
|
|
marker_index = block.find(marker)
|
|
|
|
|
|
if marker_index <= 0:
|
|
|
|
|
|
continue
|
|
|
|
|
|
subject = block[:marker_index]
|
|
|
|
|
|
for width in (6, 4, 3, 2):
|
|
|
|
|
|
remember(subject[-width:])
|
|
|
|
|
|
for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS:
|
|
|
|
|
|
if anchor in block:
|
|
|
|
|
|
remember(anchor)
|
|
|
|
|
|
tail = block[-14:]
|
|
|
|
|
|
for size in (8, 7, 6, 5, 4):
|
|
|
|
|
|
for start in range(0, len(tail) - size + 1):
|
|
|
|
|
|
piece = tail[start : start + size]
|
|
|
|
|
|
if any(
|
|
|
|
|
|
anchor in piece
|
|
|
|
|
|
for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS
|
|
|
|
|
|
):
|
|
|
|
|
|
remember(piece)
|
|
|
|
|
|
if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
|
|
|
|
|
|
return terms
|
2026-05-22 10:42:31 +08:00
|
|
|
|
if len(block) <= 4:
|
|
|
|
|
|
remember(block)
|
|
|
|
|
|
continue
|
|
|
|
|
|
for size in (4, 3, 2):
|
|
|
|
|
|
for start in range(0, len(block) - size + 1):
|
|
|
|
|
|
remember(block[start : start + size])
|
|
|
|
|
|
if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
|
|
|
|
|
|
return terms
|
|
|
|
|
|
|
|
|
|
|
|
return terms[:MAX_KNOWLEDGE_QUERY_TERMS]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _clean_knowledge_segment_text(content: str) -> str:
|
|
|
|
|
|
normalized = str(content or "").strip()
|
|
|
|
|
|
normalized = re.sub(r"^[-*•]\s*", "", normalized)
|
|
|
|
|
|
normalized = re.sub(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*", "", normalized)
|
|
|
|
|
|
normalized = re.sub(r"^[((][一二三四五六七八九十百零0-9]+[))]\s*", "", normalized)
|
|
|
|
|
|
normalized = re.sub(r"\s+", " ", normalized)
|
|
|
|
|
|
if len(normalized) <= 180:
|
|
|
|
|
|
return normalized
|
|
|
|
|
|
return f"{normalized[:177].rstrip()}..."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _normalize_knowledge_line(content: str, *, preserve_marker: bool) -> str:
|
|
|
|
|
|
normalized = str(content or "").strip()
|
|
|
|
|
|
normalized = re.sub(r"^[-*•]\s*", "", normalized)
|
|
|
|
|
|
if not preserve_marker:
|
|
|
|
|
|
normalized = re.sub(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*", "", normalized)
|
|
|
|
|
|
normalized = re.sub(r"^[((][一二三四五六七八九十百零0-9]+[))]\s*", "", normalized)
|
|
|
|
|
|
normalized = re.sub(r"\s+", " ", normalized)
|
|
|
|
|
|
return normalized
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _split_clean_knowledge_lines(
|
|
|
|
|
|
self,
|
|
|
|
|
|
content: str,
|
|
|
|
|
|
*,
|
|
|
|
|
|
preserve_marker: bool,
|
|
|
|
|
|
) -> list[str]:
|
|
|
|
|
|
return [
|
|
|
|
|
|
line
|
|
|
|
|
|
for line in (
|
|
|
|
|
|
self._normalize_knowledge_line(item, preserve_marker=preserve_marker)
|
|
|
|
|
|
for item in str(content or "").splitlines()
|
|
|
|
|
|
)
|
|
|
|
|
|
if line
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
2026-05-24 21:44:17 +08:00
|
|
|
|
def _extract_relevant_table_preview(
|
|
|
|
|
|
content: str,
|
|
|
|
|
|
query_terms: list[str],
|
|
|
|
|
|
*,
|
|
|
|
|
|
preferred_terms: list[str] | None = None,
|
|
|
|
|
|
max_rows: int = 3,
|
|
|
|
|
|
fallback_rows: int = 2,
|
|
|
|
|
|
) -> str:
|
2026-05-22 10:42:31 +08:00
|
|
|
|
lines = [line.strip() for line in str(content or "").splitlines() if line.strip()]
|
|
|
|
|
|
if len(lines) <= 3:
|
|
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
|
header = lines[0]
|
|
|
|
|
|
divider = lines[1] if len(lines) > 1 else ""
|
|
|
|
|
|
body = lines[2:] if divider.count("|") >= 2 else lines[1:]
|
|
|
|
|
|
|
2026-05-24 21:44:17 +08:00
|
|
|
|
preferred = [
|
|
|
|
|
|
str(term or "").strip().lower()
|
|
|
|
|
|
for term in list(preferred_terms or [])
|
|
|
|
|
|
if str(term or "").strip()
|
|
|
|
|
|
]
|
|
|
|
|
|
base_terms = preferred + [
|
|
|
|
|
|
str(term or "").strip().lower()
|
|
|
|
|
|
for term in query_terms
|
|
|
|
|
|
if str(term or "").strip().lower() not in preferred
|
|
|
|
|
|
]
|
|
|
|
|
|
derived_terms: list[str] = []
|
|
|
|
|
|
for term in base_terms:
|
|
|
|
|
|
for marker in ("标准", "金额", "限额", "额度", "是多少"):
|
|
|
|
|
|
marker_index = term.find(marker)
|
|
|
|
|
|
if marker_index <= 0:
|
|
|
|
|
|
continue
|
|
|
|
|
|
subject = term[:marker_index].strip()
|
|
|
|
|
|
if len(subject) < 2:
|
|
|
|
|
|
continue
|
|
|
|
|
|
for width in (6, 4, 3, 2):
|
|
|
|
|
|
derived_terms.append(subject[-width:])
|
|
|
|
|
|
|
|
|
|
|
|
search_terms: list[str] = []
|
|
|
|
|
|
for term in [*preferred, *derived_terms, *base_terms]:
|
|
|
|
|
|
if term and term not in search_terms:
|
|
|
|
|
|
search_terms.append(term)
|
|
|
|
|
|
|
2026-05-22 10:42:31 +08:00
|
|
|
|
matched_rows = [
|
|
|
|
|
|
row
|
|
|
|
|
|
for row in body
|
2026-05-24 21:44:17 +08:00
|
|
|
|
if any(term in row.lower() for term in search_terms)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
]
|
2026-05-24 21:44:17 +08:00
|
|
|
|
selected_rows = matched_rows[:max_rows] or body[:fallback_rows]
|
2026-05-22 10:42:31 +08:00
|
|
|
|
preview_lines = [header]
|
|
|
|
|
|
if divider:
|
|
|
|
|
|
preview_lines.append(divider)
|
|
|
|
|
|
preview_lines.extend(selected_rows)
|
|
|
|
|
|
return "\n".join(preview_lines).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-24 21:44:17 +08:00
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _question_requests_broad_knowledge_table(question: str) -> bool:
|
|
|
|
|
|
normalized = str(question or "").strip()
|
|
|
|
|
|
if not normalized:
|
|
|
|
|
|
return False
|
|
|
|
|
|
broad_hints = ("有哪些", "是什么", "介绍", "说明", "列表", "清单", "全部", "完整")
|
|
|
|
|
|
table_subject_hints = ("科目", "目录", "清单", "列表", "表", "明细")
|
|
|
|
|
|
return any(hint in normalized for hint in broad_hints) and any(
|
|
|
|
|
|
hint in normalized for hint in table_subject_hints
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _question_requires_explicit_condition(question: str) -> bool:
|
|
|
|
|
|
normalized = str(question or "").strip()
|
|
|
|
|
|
return any(keyword in normalized for keyword in ("多少", "金额", "上限", "限额", "标准", "条件", "需要"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _answer_evidence_has_numeric_or_condition(evidence_items: list[dict[str, Any]]) -> bool:
|
|
|
|
|
|
for item in evidence_items:
|
|
|
|
|
|
content = str(item.get("content") or "")
|
|
|
|
|
|
if re.search(r"\d", content):
|
|
|
|
|
|
return True
|
|
|
|
|
|
if any(
|
|
|
|
|
|
keyword in content
|
|
|
|
|
|
for keyword in ("应", "需", "不得", "可以", "条件", "材料", "审批", "流程", "标准", "适用")
|
|
|
|
|
|
):
|
|
|
|
|
|
return True
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|