Files
X-Financial/server/src/app/services/user_agent_knowledge_helpers.py
caoxiaozhu 50b1c3f9a9 feat: 增强规则资产管理与审计页面运行时调试
后端新增规则资产版本管理和规则文件 CRUD 接口,优化风险
规则生成模板执行和员工数据模型字段,知识库 RAG 增强本
地回退和文档提取能力,清理旧风险规则文件统一由生成引擎
管理,前端审计页面增加运行时调试面板和规则资产编辑交互,
补充单元测试覆盖。
2026-05-24 21:44:17 +08:00

617 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import re
from typing import Any
from app.services.user_agent_knowledge_constants import (
KNOWLEDGE_ARTICLE_PATTERN,
KNOWLEDGE_LIST_ITEM_PATTERN,
KNOWLEDGE_NUMBERED_ITEM_PATTERN,
KNOWLEDGE_QUERY_STOPWORDS,
KNOWLEDGE_SECTION_HEADING_PATTERN,
MAX_KNOWLEDGE_MODEL_HITS,
MAX_KNOWLEDGE_QUERY_TERMS,
)
class UserAgentKnowledgeHelpersMixin:
    """Helpers for ranking knowledge-base hits and shaping evidence text.

    The helpers work on plain strings and dictionaries; most of them are
    static methods.
    """

    # Title terms treated as generic: matching one of them in a hit title does
    # not earn the extra distinctive-title bonus in _score_knowledge_model_hit.
    GENERIC_KNOWLEDGE_TITLE_TERMS = {"远光软件", "股份有限", "有限公司"}
    # Phrases that _extract_knowledge_query_terms keeps as whole query terms
    # whenever they occur inside a question.
    KNOWLEDGE_QUERY_ANCHOR_TERMS = (
        "财务基础知识手册",
        "基础知识手册",
        "会计科目",
        "常用会计科目",
        "财务报表",
        "主要税种",
        "税种",
        "标准",
        "清单",
        "明细",
        "流程",
    )
@staticmethod
def _select_knowledge_model_hits(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> list[dict[str, Any]]:
    """Pick the hits from a tool payload that are worth passing to the model.

    Args:
        tool_payload: Tool result; only its ``hits`` entry is read and
            non-dict entries are ignored.
        question: User question used to re-rank the hits. When it yields no
            query terms the retrieval order is kept.

    Returns:
        At most ``MAX_KNOWLEDGE_MODEL_HITS`` hit dictionaries, best first.
    """
    helpers = UserAgentKnowledgeHelpersMixin
    candidates = [
        entry
        for entry in list(tool_payload.get("hits") or [])
        if isinstance(entry, dict)
    ][: max(MAX_KNOWLEDGE_MODEL_HITS + 3, 8)]
    if not candidates:
        return []

    terms = helpers._extract_knowledge_query_terms(question or "")
    if not terms:
        return candidates[:MAX_KNOWLEDGE_MODEL_HITS]

    scores = [
        helpers._score_knowledge_model_hit(
            entry,
            query_terms=terms,
            rank_index=position,
        )
        for position, entry in enumerate(candidates)
    ]
    # Highest score first; equal scores keep their retrieval order.
    order = sorted(
        range(len(candidates)),
        key=lambda position: (-scores[position], position),
    )
    return [candidates[position] for position in order[:MAX_KNOWLEDGE_MODEL_HITS]]
@staticmethod
def _score_knowledge_model_hit(
    item: dict[str, Any],
    *,
    query_terms: list[str],
    rank_index: int,
) -> int:
    """Score one retrieval hit against the query terms; higher is better.

    Args:
        item: Hit dictionary; reads ``title`` (falling back to
            ``document_name``), ``excerpt`` and ``content``.
        query_terms: Terms matched by substring against the lower-cased hit
            text, so they are expected to be lower-case already.
        rank_index: Zero-based position in the retrieval order; earlier hits
            start from a higher base score.

    Returns:
        An integer relevance score; penalties can make it negative.
    """
    helpers = UserAgentKnowledgeHelpersMixin
    title = str(item.get("title") or item.get("document_name") or "").lower()
    excerpt = str(item.get("excerpt") or "").lower()
    content = str(item.get("content") or "").lower()
    # Only the first 1400 characters of the content take part in matching.
    haystack = "\n".join([title, excerpt, content[:1400]])
    matched_terms = [term for term in query_terms if term in haystack]
    title_terms = [term for term in matched_terms if term in title]

    score = max(1, 48 - rank_index * 4)
    score += len(matched_terms) * 10
    score += sum(max(0, len(term) - 4) * 8 for term in matched_terms)
    score += len(title_terms) * 8
    score += sum(max(0, len(term) - 4) * 6 for term in title_terms)
    # Long, non-generic terms found in the title are the strongest signal.
    score += sum(
        (len(term) - 3) * 10
        for term in title_terms
        if len(term) >= 4 and term not in helpers.GENERIC_KNOWLEDGE_TITLE_TERMS
    )

    leading_marker = helpers._leading_knowledge_appendix_marker(content)
    if leading_marker == "# 章节导航":
        score -= 22
    elif leading_marker == "# 问答线索补充":
        score += 6 if matched_terms else -8
    elif leading_marker == "# 重点章节摘录":
        score += 4 if matched_terms else -4
    elif leading_marker == "# 结构化表格补充":
        score += 8 if matched_terms else -3

    # NOTE(review): several marker literals in this block were empty strings,
    # which made every ``marker in content`` test true. The full-width colon
    # is restored next to its ASCII twin; the remaining blank entries could
    # not be recovered and were dropped. Re-add the intended markers if known.
    if matched_terms:
        if "|" in content:
            score += 8
        if any(marker in content for marker in (":", ":")):
            score += 10
        if "\n" in content:
            score += 4
        if "附表" in content:
            score += 4
        if "-" in content:
            score += 4
    # Content that states nothing is specified is a weak answer source.
    if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
        score -= 12
    return score
@staticmethod
def _leading_knowledge_appendix_marker(content: str) -> str:
normalized = str(content or "").lstrip()
for marker in ("# 章节导航", "# 重点章节摘录", "# 问答线索补充", "# 结构化表格补充"):
index = normalized.find(marker)
if 0 <= index <= 220:
return marker
return ""
def _prioritize_knowledge_evidence_items(
self,
question: str,
evidence_items: list[dict[str, Any]],
) -> list[dict[str, Any]]:
if not evidence_items or not self._question_requires_explicit_condition(question):
return evidence_items
for preferred_kind in ("table", "kv", "clause", "list"):
for index, item in enumerate(evidence_items):
if str(item.get("kind") or "") != preferred_kind:
continue
return [item, *evidence_items[:index], *evidence_items[index + 1 :]]
for index, item in enumerate(evidence_items):
if re.search(r"\d", str(item.get("content") or "")):
return [item, *evidence_items[:index], *evidence_items[index + 1 :]]
return evidence_items
@staticmethod
def _is_knowledge_lead_in_segment(item: dict[str, str]) -> bool:
kind = str(item.get("kind") or "").strip()
content = str(item.get("content") or "").strip()
return kind in {"kv", "list", "clause"} and content.endswith(("", ":"))
@staticmethod
def _extract_knowledge_marker_family(content: str) -> str:
normalized = str(content or "").strip()
if not normalized:
return ""
if KNOWLEDGE_ARTICLE_PATTERN.match(normalized):
return "article"
if re.match(r"^\d+[.)、]\s*", normalized):
return "arabic"
if re.match(r"^[(][一二三四五六七八九十百零0-9]+[)]\s*", normalized):
return "paren"
if re.match(r"^[①②③④⑤⑥⑦⑧⑨⑩]\s*", normalized):
return "circled"
if KNOWLEDGE_LIST_ITEM_PATTERN.match(normalized):
return "bullet"
return ""
@staticmethod
def _knowledge_list_marker_sort_key(content: str) -> int:
normalized = str(content or "").strip()
match = re.match(r"^[(]([一二三四五六七八九十百零0-9]+)[)]", normalized)
if not match:
return 999
marker = match.group(1)
if marker.isdigit():
return int(marker)
values = {
"": 0,
"": 1,
"": 2,
"": 3,
"": 4,
"": 5,
"": 6,
"": 7,
"": 8,
"": 9,
"": 10,
}
if marker in values:
return values[marker]
if marker.startswith("") and len(marker) == 2:
return 10 + values.get(marker[1], 0)
if marker.endswith("") and len(marker) == 2:
return values.get(marker[0], 0) * 10
if "" in marker:
left, right = marker.split("", 1)
return values.get(left, 1) * 10 + values.get(right, 0)
return 999
@staticmethod
def _format_knowledge_heading_label(heading: str) -> str:
parts = [item.strip() for item in str(heading or "").split(">") if item.strip()]
return " / ".join(parts)
@staticmethod
def _has_inline_numbered_knowledge_items(content: str) -> bool:
return len(
re.findall(
r"[(][一二三四五六七八九十百零0-9]+[)]",
str(content or ""),
)
) >= 2
@staticmethod
def _split_inline_numbered_knowledge_items(content: str) -> list[str]:
normalized = str(content or "").strip()
if not UserAgentKnowledgeHelpersMixin._has_inline_numbered_knowledge_items(normalized):
return [normalized] if normalized else []
marker_pattern = r"[(][一二三四五六七八九十百零0-9]+[)]"
first_marker = re.search(marker_pattern, normalized)
if first_marker is None:
return [normalized] if normalized else []
prefix = normalized[: first_marker.start()].strip(" :")
tail = normalized[first_marker.start() :].strip()
item_pattern = (
r"([(][一二三四五六七八九十百零0-9]+[)]\s*.*?"
r"(?=\s*[(][一二三四五六七八九十百零0-9]+[)]|\s*$))"
)
items = [item.strip() for item in re.findall(item_pattern, tail) if item.strip()]
if prefix:
return [prefix, *items]
return items or [normalized]
@staticmethod
def _focus_knowledge_segment_content(content: str, query_terms: list[str]) -> str:
normalized = re.sub(r"\s+", " ", str(content or "").strip())
if not normalized:
return ""
anchor_terms = sorted(
{
str(term or "").strip()
for term in query_terms
if len(str(term or "").strip()) >= 3
},
key=len,
reverse=True,
)
anchor_index = -1
for term in anchor_terms:
anchor_index = normalized.lower().find(term.lower())
if anchor_index >= 0:
break
if anchor_index < 0:
return normalized
prefix_window = normalized[max(0, anchor_index - 40) : anchor_index]
marker_match = None
for match in re.finditer(
r"(?:第[一二三四五六七八九十百零0-9]+[部分章节条]|[一二三四五六七八九十]+、|[(][一二三四五六七八九十百零0-9]+[)])",
prefix_window,
):
marker_match = match
start = anchor_index
if marker_match is not None:
start = max(0, anchor_index - len(prefix_window) + marker_match.start())
return normalized[start : start + 700].strip()
@staticmethod
def _split_markdown_table_cells(line: str) -> list[str]:
stripped = str(line or "").strip()
if stripped.startswith("|"):
stripped = stripped[1:]
if stripped.endswith("|"):
stripped = stripped[:-1]
return [
re.sub(r"\s+", " ", cell.replace("**", "").strip())
for cell in stripped.split("|")
]
@classmethod
def _summarize_knowledge_table_preview(cls, preview: str) -> str:
    """Produce a one-sentence summary for a Markdown table preview.

    Lines with fewer than two pipes and divider rows are skipped. When the
    table has a header and exactly one data row of the same width, the row
    is spelled out as header/value pairs; otherwise a generic sentence is
    returned.
    """
    generic_summary = "相关标准项如下,请按表头和行内容对应使用。"

    def is_divider_row(cells: list[str]) -> bool:
        return all(
            re.fullmatch(r":?-{2,}:?", cell.replace(" ", "")) for cell in cells
        )

    table_rows: list[list[str]] = []
    for raw_line in str(preview or "").splitlines():
        if raw_line.count("|") >= 2:
            cells = cls._split_markdown_table_cells(raw_line)
            if cells and not is_divider_row(cells):
                table_rows.append(cells)
    if len(table_rows) < 2:
        return "可直接参考的标准表如下。"

    header, *candidates = table_rows
    same_width_rows = [row for row in candidates if len(row) == len(header)]
    if len(same_width_rows) != 1 or len(header) < 2:
        return generic_summary

    only_row = same_width_rows[0]
    subject = only_row[0] or "该项目"
    # NOTE(review): label/value pairs are concatenated with no separator and
    # the skip set holds an empty string; this looks like punctuation lost in
    # extraction. Confirm the intended separators against the real file.
    pairs = [
        f"{label}{value}"
        for label, value in zip(header[1:], only_row[1:])
        if label and value and value not in {"-", ""}
    ]
    if not pairs:
        return generic_summary
    return f"{subject}的标准为:{''.join(pairs)}"
def _summarize_knowledge_lines_conclusion(
self,
lines: list[str],
*,
heading: str = "",
) -> str:
clean_lines = [
self._clean_knowledge_segment_text(line)
for line in lines
if self._clean_knowledge_segment_text(line)
]
if not clean_lines:
return ""
clean_heading = str(heading or "").strip()
if not clean_heading and clean_lines and "" not in clean_lines[0] and ":" not in clean_lines[0]:
clean_heading = clean_lines[0]
clean_heading = re.sub(
r"^[一二三四五六七八九十百零0-9]+、\s*",
"",
clean_heading,
)
item_labels: list[str] = []
for line in clean_lines:
if "" not in line and ":" not in line:
continue
label = re.split(r"[:]", line, maxsplit=1)[0].strip()
if 1 <= len(label) <= 24:
item_labels.append(label)
if clean_heading and len(item_labels) >= 2:
return f"{clean_heading}包括:{''.join(item_labels[:6])}"
if item_labels:
return f"{item_labels[0]}{clean_lines[0].split('', 1)[-1].strip()}"
return clean_lines[0]
@staticmethod
def _knowledge_lines_have_multiple_labeled_items(lines: list[str]) -> bool:
labeled_count = 0
for line in lines:
normalized = str(line or "").strip()
if "" not in normalized and ":" not in normalized:
continue
label = re.split(r"[:]", normalized, maxsplit=1)[0].strip()
if 1 <= len(label) <= 24:
labeled_count += 1
return labeled_count >= 2
def _score_knowledge_evidence_candidate(
self,
item: dict[str, str],
query_terms: list[str],
) -> int:
heading = str(item.get("heading") or "").lower()
content = str(item.get("content") or "").lower()
kind = str(item.get("kind") or "").strip()
haystack = "\n".join([heading, content])
matched_terms = [term for term in query_terms if term in haystack]
score = len(matched_terms) * 10
score += sum(max(0, len(term) - 4) * 8 for term in matched_terms)
score += sum(1 for term in matched_terms if term in heading) * 6
score += sum(max(0, len(term) - 4) * 6 for term in matched_terms if term in heading)
if kind == "table":
score += 10
if content.count("\n") < 2:
score -= 24
elif kind in {"kv", "clause", "list"}:
score += 8
elif kind == "paragraph":
score += 4
if "问答线索补充" in heading or "重点章节摘录" in heading:
score += 8
if "结构化表格补充" in heading:
score += 10
if "章节导航" in heading or "目录" in heading:
score -= 16
if re.search(r"[.。…]{6,}", content):
score -= 12
if any(hint in content for hint in ("", "", "不得", "可以", "标准", "条件", "材料", "审批", "流程", "包括")):
score += 3
content_length = len(content)
if content_length > 220:
score -= min(8, (content_length - 220) // 40)
return score
@staticmethod
def _extract_knowledge_query_terms(question: str) -> list[str]:
    """Derive an ordered, de-duplicated list of search terms from a question.

    The question is lower-cased; ASCII tokens are collected first, then each
    run of CJK characters contributes the run itself, subjects preceding
    amount-style markers, known anchor phrases, anchor-bearing windows of the
    run's tail, and finally sliding n-grams. Stopwords are skipped and the
    result holds at most ``MAX_KNOWLEDGE_QUERY_TERMS`` entries.

    NOTE(review): the block nesting below was reconstructed from a flattened
    copy of this function; confirm it against the original file.
    """
    normalized_question = str(question or "").strip().lower()
    if not normalized_question:
        return []
    terms: list[str] = []
    seen: set[str] = set()

    def remember(term: str) -> None:
        # Record a term once, ignoring blanks and configured stopwords.
        normalized = str(term or "").strip().lower()
        if (
            not normalized
            or normalized in seen
            or normalized in KNOWLEDGE_QUERY_STOPWORDS
        ):
            return
        seen.add(normalized)
        terms.append(normalized)

    # ASCII words and identifiers of at least two characters.
    for item in re.findall(r"[a-z0-9][a-z0-9_\-]{1,}", normalized_question):
        remember(item)
    # Runs of 2-20 CJK characters.
    for block in re.findall(r"[\u4e00-\u9fff]{2,20}", normalized_question):
        remember(block)
        if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
            return terms
        # The text just before an amount-style marker names the subject;
        # keep its trailing 6/4/3/2 characters.
        for marker in ("标准", "金额", "限额", "额度"):
            marker_index = block.find(marker)
            if marker_index <= 0:
                continue
            subject = block[:marker_index]
            for width in (6, 4, 3, 2):
                remember(subject[-width:])
        for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS:
            if anchor in block:
                remember(anchor)
        # Windows over the last 14 characters that contain an anchor phrase.
        tail = block[-14:]
        for size in (8, 7, 6, 5, 4):
            for start in range(0, len(tail) - size + 1):
                piece = tail[start : start + size]
                if any(
                    anchor in piece
                    for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS
                ):
                    remember(piece)
                if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
                    return terms
        # Short runs are kept whole; remember() ignores the repeat.
        if len(block) <= 4:
            remember(block)
            continue
        # Longer runs also contribute every 4-, 3- and 2-character n-gram.
        for size in (4, 3, 2):
            for start in range(0, len(block) - size + 1):
                remember(block[start : start + size])
                if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS:
                    return terms
    return terms[:MAX_KNOWLEDGE_QUERY_TERMS]
@staticmethod
def _clean_knowledge_segment_text(content: str) -> str:
normalized = str(content or "").strip()
normalized = re.sub(r"^[-*•]\s*", "", normalized)
normalized = re.sub(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*", "", normalized)
normalized = re.sub(r"^[(][一二三四五六七八九十百零0-9]+[)]\s*", "", normalized)
normalized = re.sub(r"\s+", " ", normalized)
if len(normalized) <= 180:
return normalized
return f"{normalized[:177].rstrip()}..."
@staticmethod
def _normalize_knowledge_line(content: str, *, preserve_marker: bool) -> str:
normalized = str(content or "").strip()
normalized = re.sub(r"^[-*•]\s*", "", normalized)
if not preserve_marker:
normalized = re.sub(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*", "", normalized)
normalized = re.sub(r"^[(][一二三四五六七八九十百零0-9]+[)]\s*", "", normalized)
normalized = re.sub(r"\s+", " ", normalized)
return normalized
def _split_clean_knowledge_lines(
self,
content: str,
*,
preserve_marker: bool,
) -> list[str]:
return [
line
for line in (
self._normalize_knowledge_line(item, preserve_marker=preserve_marker)
for item in str(content or "").splitlines()
)
if line
]
@staticmethod
def _extract_relevant_table_preview(
content: str,
query_terms: list[str],
*,
preferred_terms: list[str] | None = None,
max_rows: int = 3,
fallback_rows: int = 2,
) -> str:
lines = [line.strip() for line in str(content or "").splitlines() if line.strip()]
if len(lines) <= 3:
return "\n".join(lines)
header = lines[0]
divider = lines[1] if len(lines) > 1 else ""
body = lines[2:] if divider.count("|") >= 2 else lines[1:]
preferred = [
str(term or "").strip().lower()
for term in list(preferred_terms or [])
if str(term or "").strip()
]
base_terms = preferred + [
str(term or "").strip().lower()
for term in query_terms
if str(term or "").strip().lower() not in preferred
]
derived_terms: list[str] = []
for term in base_terms:
for marker in ("标准", "金额", "限额", "额度", "是多少"):
marker_index = term.find(marker)
if marker_index <= 0:
continue
subject = term[:marker_index].strip()
if len(subject) < 2:
continue
for width in (6, 4, 3, 2):
derived_terms.append(subject[-width:])
search_terms: list[str] = []
for term in [*preferred, *derived_terms, *base_terms]:
if term and term not in search_terms:
search_terms.append(term)
matched_rows = [
row
for row in body
if any(term in row.lower() for term in search_terms)
]
selected_rows = matched_rows[:max_rows] or body[:fallback_rows]
preview_lines = [header]
if divider:
preview_lines.append(divider)
preview_lines.extend(selected_rows)
return "\n".join(preview_lines).strip()
@staticmethod
def _question_requests_broad_knowledge_table(question: str) -> bool:
normalized = str(question or "").strip()
if not normalized:
return False
broad_hints = ("有哪些", "是什么", "介绍", "说明", "列表", "清单", "全部", "完整")
table_subject_hints = ("科目", "目录", "清单", "列表", "", "明细")
return any(hint in normalized for hint in broad_hints) and any(
hint in normalized for hint in table_subject_hints
)
@staticmethod
def _question_requires_explicit_condition(question: str) -> bool:
normalized = str(question or "").strip()
return any(keyword in normalized for keyword in ("多少", "金额", "上限", "限额", "标准", "条件", "需要"))
@staticmethod
def _answer_evidence_has_numeric_or_condition(evidence_items: list[dict[str, Any]]) -> bool:
for item in evidence_items:
content = str(item.get("content") or "")
if re.search(r"\d", content):
return True
if any(
keyword in content
for keyword in ("", "", "不得", "可以", "条件", "材料", "审批", "流程", "标准", "适用")
):
return True
return False