Files
X-Financial/server/src/app/services/user_agent_knowledge.py
caoxiaozhu d4d5d40569 feat: 新增预算费控模型与报销审批流引擎
后端新增预算费控服务和报销单审批流模块,引入申请人费用画像
算法,优化知识库 RAG 运行时和同步逻辑,完善报销单工作流常
量和明细同步,更新差旅报销规则电子表格,前端新增预算分析
组件和数字员工模型,完善审批对话框和洞察面板交互,优化侧
边栏和顶栏样式,补充单元测试。
2026-05-27 17:31:27 +08:00

736 lines
28 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import re
from typing import Any
from app.schemas.user_agent import UserAgentCitation, UserAgentRequest
from app.services.user_agent_knowledge_helpers import UserAgentKnowledgeHelpersMixin
from app.services.user_agent_knowledge_constants import (
KNOWLEDGE_ARTICLE_PATTERN,
KNOWLEDGE_DIRECT_ANSWER_HINTS,
KNOWLEDGE_LIST_ITEM_PATTERN,
KNOWLEDGE_NUMBERED_ITEM_PATTERN,
KNOWLEDGE_QUERY_STOPWORDS,
KNOWLEDGE_SECTION_HEADING_PATTERN,
MAX_KNOWLEDGE_DIRECT_EVIDENCE,
MAX_KNOWLEDGE_MODEL_HITS,
MAX_KNOWLEDGE_QUERY_TERMS,
)
class UserAgentKnowledgeMixin(UserAgentKnowledgeHelpersMixin):
@staticmethod
def _build_model_tool_payload(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> dict[str, Any]:
    """Return a shallow copy of ``tool_payload`` with a compact ``hits`` list.

    The hits are whatever ``_select_knowledge_model_hits`` picks for
    ``question``; non-dict entries are dropped.  Each kept hit is reduced to
    a fixed set of keys, with ``content`` capped at 1200 characters,
    ``tags`` at 5 entries and ``evidence`` at 3 entries.
    """

    def _text(hit: dict[str, Any], field: str) -> str:
        # Missing / falsy values collapse to an empty string.
        return str(hit.get(field) or "").strip()

    result = dict(tool_payload or {})
    selected = UserAgentKnowledgeMixin._select_knowledge_model_hits(
        tool_payload,
        question=question,
    )
    result["hits"] = [
        {
            "title": _text(hit, "title"),
            "document_name": _text(hit, "document_name"),
            "excerpt": _text(hit, "excerpt"),
            "content": _text(hit, "content")[:1200],
            "tags": list(hit.get("tags") or [])[:5],
            "evidence": list(hit.get("evidence") or [])[:3],
            "code": _text(hit, "code"),
        }
        for hit in selected
        if isinstance(hit, dict)
    ]
    return result
@staticmethod
def _build_knowledge_evidence_blocks(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> str:
    """Render up to three selected hits as numbered, fenced evidence blocks.

    Hits come from ``_select_knowledge_model_hits``.  Non-dict hits and hits
    without content are skipped, but the evidence number still reflects the
    hit's position among the first three selected hits.  Blocks are joined
    with a blank line; an empty string is returned when nothing qualifies.
    """
    top_hits = UserAgentKnowledgeMixin._select_knowledge_model_hits(
        tool_payload,
        question=question,
    )[:3]
    rendered: list[str] = []
    for position, hit in enumerate(top_hits, start=1):
        if not isinstance(hit, dict):
            continue
        label = str(hit.get("title") or hit.get("document_name") or f"证据 {position}").strip()
        code = str(hit.get("code") or "").strip()
        body = str(hit.get("content") or "").strip()
        if not body:
            continue
        header = f"[证据 {position}] {label}"
        if code:
            header += f" ({code})"
        # Content is capped at 1200 characters inside a ```text fence.
        rendered.append("\n".join((header, "```text", body[:1200], "```")))
    return "\n\n".join(rendered)
def _build_fast_knowledge_answer(
self,
payload: UserAgentRequest,
*,
citations: list[UserAgentCitation],
) -> str | None:
if payload.ontology.scenario != "knowledge":
return None
if str(payload.tool_payload.get("result_type") or "").strip() != "knowledge_search":
return None
evidence_items = self._build_knowledge_answer_evidence(payload)
if not evidence_items:
return None
question = self._resolve_knowledge_question(payload)
if not self._should_use_direct_knowledge_answer(question, evidence_items):
return None
return self._render_knowledge_direct_answer(
payload,
citations=citations,
evidence_items=evidence_items,
)
def _render_knowledge_direct_answer(
self,
payload: UserAgentRequest,
*,
citations: list[UserAgentCitation],
evidence_items: list[dict[str, Any]],
) -> str | None:
if not evidence_items:
return None
title = str(
(citations[0].title if citations else "")
or evidence_items[0].get("title")
or "相关制度"
).strip()
user_name = str(payload.context_json.get("name") or "").strip()
question = self._resolve_knowledge_question(payload)
query_terms = self._extract_knowledge_query_terms(question)
ordered_evidence_items = self._prioritize_knowledge_evidence_items(question, evidence_items)
primary_item = ordered_evidence_items[0]
primary_heading = self._format_knowledge_heading_label(
str(primary_item.get("heading") or "").strip()
)
primary_lines = self._collect_direct_knowledge_answer_lines(
ordered_evidence_items,
query_terms=query_terms,
)
lines: list[str] = []
if user_name:
lines.append(f"{user_name},您好。")
source_prefix = f"根据《{title}"
if primary_heading:
source_prefix = f"{source_prefix}{primary_heading}"
conclusion_lines: list[str] = []
evidence_lines: list[str] = []
if str(primary_item.get("kind") or "") == "table":
table_content = str(primary_item.get("content") or "")
if self._question_requests_broad_knowledge_table(question):
table_preview = table_content.strip()
else:
table_preview = self._extract_relevant_table_preview(
table_content,
query_terms,
preferred_terms=self._build_knowledge_table_preferred_terms(payload),
)
table_summary = self._summarize_knowledge_table_preview(table_preview)
conclusion_lines.append(f"{source_prefix}{table_summary}")
evidence_lines.append(table_preview)
else:
if not primary_lines:
summary = self._summarize_knowledge_evidence_content(primary_item, query_terms)
conclusion_lines.append(
f"{source_prefix},当前能直接确认的是:"
f"{summary}"
)
elif len(primary_lines) == 1:
conclusion_lines.append(f"{source_prefix},当前能直接确认的是:{primary_lines[0].strip()}")
evidence_lines.extend(primary_lines)
else:
subject = self._build_knowledge_answer_subject(question, primary_heading)
summary = self._summarize_knowledge_lines_conclusion(
primary_lines,
heading=subject,
)
if summary:
conclusion_lines.append(f"{source_prefix}{summary}")
else:
conclusion_lines.append(f"{source_prefix},当前能直接确认的是:")
evidence_lines.extend(primary_lines)
notes: list[str] = []
location_note = self._build_missing_location_grounding_note(question, evidence_items)
if location_note:
notes.append(location_note)
if self._question_requires_explicit_condition(question) and not self._answer_evidence_has_numeric_or_condition(evidence_items):
notes.append("当前命中的证据更偏规则说明或流程约束,还没有直接给出可立即套用的数值或完整条件。")
self._append_markdown_section(lines, "结论", conclusion_lines)
self._append_markdown_section(lines, "依据", evidence_lines)
if notes:
self._append_markdown_section(lines, "说明", [f"- {note}" for note in notes])
return "\n".join(line for line in lines if line is not None).strip()
@staticmethod
def _append_markdown_section(lines: list[str], title: str, body_lines: list[str]) -> None:
cleaned = [str(line or "").rstrip() for line in body_lines if str(line or "").strip()]
if not cleaned:
return
if lines and lines[-1] != "":
lines.append("")
lines.append(f"## {title}")
lines.append("")
lines.extend(cleaned)
@staticmethod
def _build_knowledge_answer_subject(question: str, heading: str = "") -> str:
clean_heading = str(heading or "").strip()
if clean_heading and not any(
marker in clean_heading
for marker in ("问答线索补充", "结构化表格补充", "重点章节摘录", "章节导航")
):
return clean_heading
normalized = re.sub(r"\s+", "", str(question or "").strip())
normalized = re.sub(r"[?。.!]+$", "", normalized)
normalized = re.sub(r"(是什么|有哪些|是多少|如何|怎么|吗|呢)$", "", normalized)
return normalized.strip(":,。.")
@staticmethod
def _build_knowledge_table_preferred_terms(payload: UserAgentRequest) -> list[str]:
terms: list[str] = []
context = payload.context_json or {}
for key in ("grade", "position", "job_grade", "rank", "level"):
value = str(context.get(key) or "").strip()
if value and value not in terms:
terms.append(value)
grade_match = re.fullmatch(r"[Pp](\d{1,2})", str(context.get("grade") or "").strip())
if grade_match:
grade = int(grade_match.group(1))
for start in range(max(0, grade - 4), grade + 1):
for end in range(grade, min(12, grade + 4) + 1):
if start >= end:
continue
for separator in ("", "~", "-", ""):
term = f"P{start}{separator}P{end}"
if term not in terms:
terms.append(term)
return terms
@staticmethod
def _resolve_knowledge_question(payload: UserAgentRequest) -> str:
return str(payload.context_json.get("user_input_text") or payload.message or "").strip()
@staticmethod
def _looks_like_structured_knowledge_query(question: str) -> bool:
normalized = str(question or "").strip()
if not normalized:
return False
return any(keyword in normalized for keyword in KNOWLEDGE_DIRECT_ANSWER_HINTS)
def _should_use_direct_knowledge_answer(
self,
question: str,
evidence_items: list[dict[str, Any]],
) -> bool:
if not evidence_items:
return False
if self._looks_like_structured_knowledge_query(question):
return True
return str(evidence_items[0].get("kind") or "") in {"table", "kv", "list", "clause"}
def _build_knowledge_answer_evidence(
    self,
    payload: UserAgentRequest,
) -> list[dict[str, Any]]:
    """Extract, rank and de-duplicate evidence items for the question.

    Candidates are gathered from every selected dict hit, ordered by score
    (highest first) and then by content length (shortest first).  The best
    candidate is always kept; later ones must score at least
    ``max(6.0, top_score - 14)``.  Duplicates are detected on
    (title, heading, first 180 characters of cleaned content), and at most
    ``MAX_KNOWLEDGE_DIRECT_EVIDENCE`` items are returned.
    """
    question = self._resolve_knowledge_question(payload)
    query_terms = self._extract_knowledge_query_terms(question)

    pool: list[dict[str, Any]] = []
    for hit in self._select_knowledge_model_hits(payload.tool_payload, question=question):
        if isinstance(hit, dict):
            pool.extend(self._extract_knowledge_evidence_candidates(hit, query_terms))

    def _rank(entry: dict[str, Any]) -> tuple[float, int]:
        # Ascending on (-score, length) == descending score, shorter first.
        return (
            -float(entry.get("score") or 0),
            len(str(entry.get("content") or "")),
        )

    ranked = sorted(pool, key=_rank)
    best_score = float(ranked[0].get("score") or 0) if ranked else 0.0
    cutoff = max(6.0, best_score - 14)

    selected: list[dict[str, Any]] = []
    fingerprints: set[tuple[str, str, str]] = set()
    for entry in ranked:
        # The cutoff only applies once something has been selected.
        if selected and float(entry.get("score") or 0) < cutoff:
            continue
        fingerprint = (
            str(entry.get("title") or "").strip(),
            str(entry.get("heading") or "").strip(),
            self._clean_knowledge_segment_text(str(entry.get("content") or ""))[:180],
        )
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        selected.append(entry)
        if len(selected) >= MAX_KNOWLEDGE_DIRECT_EVIDENCE:
            break
    return selected
def _extract_knowledge_evidence_candidates(
self,
hit: dict[str, Any],
query_terms: list[str],
) -> list[dict[str, Any]]:
title = str(hit.get("title") or hit.get("document_name") or "相关制度").strip()
content = str(hit.get("content") or "").strip()
if not content:
return []
raw_candidates = self._merge_knowledge_lead_in_segments(
self._split_knowledge_hit_into_segments(content)
)
candidates: list[dict[str, Any]] = []
for item in raw_candidates:
score = self._score_knowledge_evidence_candidate(item, query_terms)
if query_terms and score <= 0:
continue
normalized = dict(item)
normalized["title"] = title
normalized["score"] = score
candidates.append(normalized)
if candidates:
return candidates
fallback_text = str(hit.get("excerpt") or "").strip() or self._extract_excerpt(content)
if not fallback_text:
return []
return [
{
"title": title,
"heading": "",
"kind": "paragraph",
"content": fallback_text,
"score": 1,
}
]
def _merge_knowledge_lead_in_segments(
    self,
    segments: list[dict[str, str]],
) -> list[dict[str, str]]:
    """Fold each lead-in segment together with the segments it introduces.

    Segments that ``_is_knowledge_lead_in_segment`` does not recognise are
    copied through unchanged.  A lead-in segment absorbs up to four
    immediately following segments that share its heading; their contents
    are joined with newlines, and the merged segment becomes kind ``list``
    if any absorbed segment was a list.  The input dicts are not mutated.

    Args:
        segments: Segment dicts with ``heading``, ``kind`` and ``content``.

    Returns:
        A new list of segment dicts with lead-ins merged.
    """
    if not segments:
        return []
    merged: list[dict[str, str]] = []
    index = 0
    while index < len(segments):
        # Work on a copy so rewriting content/kind never touches the input.
        current = dict(segments[index])
        if not self._is_knowledge_lead_in_segment(current):
            merged.append(current)
            index += 1
            continue
        base_heading = str(current.get("heading") or "").strip()
        current_marker = self._extract_knowledge_marker_family(str(current.get("content") or ""))
        follow_segments: list[dict[str, str]] = []
        next_index = index + 1
        while next_index < len(segments):
            candidate = segments[next_index]
            # Never merge across a heading boundary.
            if str(candidate.get("heading") or "").strip() != base_heading:
                break
            candidate_kind = str(candidate.get("kind") or "").strip()
            candidate_content = str(candidate.get("content") or "").strip()
            candidate_marker = self._extract_knowledge_marker_family(candidate_content)
            # Empty segments and tables end the run.
            if not candidate_content or candidate_kind == "table":
                break
            # A follower in the same marker family as the lead-in is a
            # sibling of the lead-in, not something it introduces.
            if current_marker and candidate_marker == current_marker:
                break
            # A second lead-in starts its own group once we already have followers.
            if self._is_knowledge_lead_in_segment(candidate) and follow_segments:
                break
            if candidate_kind not in {"list", "paragraph", "kv", "clause"}:
                break
            follow_segments.append(candidate)
            next_index += 1
            # Cap the merge at four followers.
            if len(follow_segments) >= 4:
                break
            # A long paragraph (>= 200 chars) is absorbed but ends the run.
            if candidate_kind == "paragraph" and len(candidate_content) >= 200:
                break
        if follow_segments:
            current["content"] = "\n".join(
                [str(current.get("content") or "").strip()]
                + [str(item.get("content") or "").strip() for item in follow_segments]
            )
            if any(str(item.get("kind") or "").strip() == "list" for item in follow_segments):
                current["kind"] = "list"
            merged.append(current)
            # Skip past everything that was absorbed.
            index = next_index
            continue
        merged.append(current)
        index += 1
    return merged
def _split_knowledge_hit_into_segments(self, content: str) -> list[dict[str, str]]:
segments: list[dict[str, str]] = []
markdown_headings: list[str] = []
section_heading = ""
paragraph_lines: list[str] = []
table_lines: list[str] = []
def current_heading() -> str:
heading_parts = [item for item in markdown_headings if item]
if section_heading:
heading_parts.append(section_heading)
return " > ".join(heading_parts)
def flush_paragraph() -> None:
nonlocal paragraph_lines
if not paragraph_lines:
return
merged = " ".join(line.strip() for line in paragraph_lines if line.strip()).strip()
paragraph_lines = []
if merged:
segments.append(
{
"heading": current_heading(),
"kind": "paragraph",
"content": merged,
}
)
def flush_table() -> None:
nonlocal table_lines
if not table_lines:
return
merged = "\n".join(line.rstrip() for line in table_lines if line.strip()).strip()
table_lines = []
if merged:
segments.append(
{
"heading": current_heading(),
"kind": "table",
"content": merged,
}
)
for raw_line in str(content or "").replace("\r\n", "\n").replace("\r", "\n").splitlines():
line = raw_line.rstrip()
stripped = line.strip()
if not stripped:
flush_paragraph()
flush_table()
continue
markdown_heading_match = re.match(r"^(#{1,6})\s+(.+)$", stripped)
if markdown_heading_match:
flush_paragraph()
flush_table()
level = len(markdown_heading_match.group(1))
heading_text = markdown_heading_match.group(2).strip()
markdown_headings = markdown_headings[: max(0, level - 1)]
markdown_headings.append(heading_text)
section_heading = ""
continue
if KNOWLEDGE_SECTION_HEADING_PATTERN.match(stripped) and len(stripped) <= 90:
flush_paragraph()
flush_table()
section_heading = stripped.lstrip("#").strip()
continue
if stripped.count("|") >= 2 and "|" in stripped:
flush_paragraph()
table_lines.append(stripped)
continue
flush_table()
if KNOWLEDGE_LIST_ITEM_PATTERN.match(stripped):
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "list",
"content": stripped,
}
)
continue
if KNOWLEDGE_NUMBERED_ITEM_PATTERN.match(stripped):
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "list",
"content": stripped,
}
)
continue
if KNOWLEDGE_ARTICLE_PATTERN.match(stripped):
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "clause",
"content": stripped,
}
)
continue
if ("" in stripped or ":" in stripped) and len(stripped) <= 180:
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "kv",
"content": stripped,
}
)
continue
paragraph_lines.append(stripped)
flush_paragraph()
flush_table()
return segments
def _render_knowledge_evidence_text(self, item: dict[str, Any]) -> str:
    """Render an evidence item's content as display text.

    The content is split into cleaned lines with list markers preserved.
    No lines gives ``""``; a single line is returned after segment-text
    cleaning; several lines are each prefixed with one space and joined
    with newlines.
    """
    cleaned_lines = self._split_clean_knowledge_lines(
        str(item.get("content") or ""),
        preserve_marker=True,
    )
    if len(cleaned_lines) > 1:
        return "\n".join(f" {entry}" for entry in cleaned_lines)
    if cleaned_lines:
        return self._clean_knowledge_segment_text(cleaned_lines[0])
    return ""
def _collect_direct_knowledge_answer_lines(
self,
ordered_evidence_items: list[dict[str, Any]],
*,
query_terms: list[str] | None = None,
) -> list[str]:
if not ordered_evidence_items:
return []
primary_item = ordered_evidence_items[0]
primary_title = str(primary_item.get("title") or "").strip()
primary_heading = str(primary_item.get("heading") or "").strip()
primary_kind = str(primary_item.get("kind") or "").strip()
related_items = [primary_item]
if primary_kind != "table":
for item in ordered_evidence_items[1:]:
if len(related_items) >= 3:
break
if str(item.get("kind") or "").strip() != primary_kind:
continue
if str(item.get("title") or "").strip() != primary_title:
continue
if str(item.get("heading") or "").strip() != primary_heading:
continue
related_items.append(item)
lines: list[str] = []
seen: set[str] = set()
for item in related_items:
item_kind = str(item.get("kind") or "").strip()
item_content = str(item.get("content") or "")
if item_kind == "paragraph" or self._has_inline_numbered_knowledge_items(item_content):
rendered = self._focus_knowledge_segment_content(
item_content,
query_terms or [],
)
rendered_lines = self._split_inline_numbered_knowledge_items(rendered)
else:
rendered = self._render_knowledge_evidence_text(item)
rendered_lines = rendered.splitlines()
for line in rendered_lines:
normalized = str(line or "").strip()
if not normalized or normalized in seen:
continue
seen.add(normalized)
lines.append(line)
return lines
def _summarize_knowledge_evidence_content(
    self,
    item: dict[str, Any],
    query_terms: list[str],
) -> str:
    """Produce a one-line summary of an evidence item.

    Tables get one of two fixed sentences, depending on whether the relevant
    preview (limited to its first four non-blank rows) has at least three
    rows.  Other items return the cleaned first line followed by up to
    three more lines, or the cleaned content when there are fewer than two
    lines.
    """
    kind = str(item.get("kind") or "").strip()
    content = str(item.get("content") or "").strip()

    if kind != "table":
        pieces = self._split_clean_knowledge_lines(content, preserve_marker=True)
        if len(pieces) < 2:
            return self._clean_knowledge_segment_text(content)
        lead, rest = pieces[0], pieces[1:4]
        return self._clean_knowledge_segment_text(f"{lead} {' '.join(rest)}")

    preview = self._extract_relevant_table_preview(content, query_terms)
    rows = [row for row in preview.splitlines() if row.strip()][:4]
    if len(rows) >= 3:
        return "当前命中的直接依据是一张与问题强相关的标准表,已摘出最相关的表头和行。"
    return "当前命中的直接依据是一张与问题强相关的标准表。"
def _build_missing_location_grounding_note(
    self,
    question: str,
    evidence_items: list[dict[str, Any]],
) -> str:
    """Warn when the question names a location the evidence never mentions.

    Returns ``""`` when no location is extracted from ``question`` or when
    the location occurs in any evidence heading or content; otherwise
    returns a note saying the location cannot be mapped onto the table.
    """
    location = self._extract_query_location(question)
    if not location:
        return ""

    searchable_parts = [
        str(entry.get("heading") or "") + "\n" + str(entry.get("content") or "")
        for entry in evidence_items
    ]
    if location in "\n".join(searchable_parts):
        return ""

    return (
        f"当前命中的制度依据没有直接写出“{location}”对应的地区档位或映射关系,"
        "因此不能直接把它套用到表格中的某一列。"
    )
def _build_knowledge_search_answer(
    self,
    payload: UserAgentRequest,
    citations: list[UserAgentCitation],
) -> str:
    """Build the Markdown answer for a knowledge-search result.

    Three outcomes are possible:

    * no dict hits in ``payload.tool_payload["hits"]``: a "no usable hit"
      conclusion plus a note asking for more specific keywords;
    * hits but no renderable evidence or excerpt: a "not specific enough"
      conclusion plus a note asking for more context;
    * otherwise: a conclusion, an evidence section built from up to three
      evidence items (or, failing that, excerpts of up to two hits) and a
      note that only retrieved evidence was used.

    A greeting line is prepended when the context carries a ``name``.

    Args:
        payload: The request carrying the tool payload and user context.
        citations: Citations for the answer; the first one's title, when
            present, names the source document.

    Returns:
        The rendered Markdown answer.
    """
    hits = [item for item in list(payload.tool_payload.get("hits") or []) if isinstance(item, dict)]
    evidence_items = self._build_knowledge_answer_evidence(payload)
    primary_citation = citations[0] if citations else None
    title = str(
        (primary_citation.title if primary_citation else "")
        or (hits[0].get("title") if hits else "")
        or "相关制度"
    ).strip()
    user_name = str(payload.context_json.get("name") or "").strip()
    answer_lines: list[str] = []
    if user_name:
        answer_lines.append(f"{user_name},您好。")
    if not hits:
        self._append_markdown_section(
            answer_lines,
            "结论",
            [f"当前没有拿到可用于回答这个问题的《{title}》知识库命中。"],
        )
        self._append_markdown_section(
            answer_lines,
            "说明",
            ["- 我不会用相似主题或外部常识硬凑答案;请补充更具体的关键词后再试一次。"],
        )
        return "\n".join(answer_lines).strip()
    evidence_lines: list[str] = []
    # Query terms depend only on the payload, so they are computed at most
    # once (on the first table item) instead of once per table item.
    table_query_terms: list[str] | None = None
    for item in evidence_items[:3]:
        heading = str(item.get("heading") or "").strip()
        if "表格行级检索线索" in heading:
            # Hide the synthetic row-level marker from the displayed heading.
            heading = heading.replace("表格行级检索线索", "").strip(" >")
        item_title = item.get("title") or title
        if str(item.get("kind") or "") == "table":
            if table_query_terms is None:
                table_query_terms = self._extract_knowledge_query_terms(
                    self._resolve_knowledge_question(payload)
                )
            preview = self._extract_relevant_table_preview(
                str(item.get("content") or ""),
                table_query_terms,
            )
            evidence_lines.append(f"- **《{item_title}》** {heading}\n{preview}")
            continue
        rendered = self._render_knowledge_evidence_text(item)
        if rendered:
            if "\n" in rendered:
                evidence_lines.append(f"- **《{item_title}》** {heading}\n{rendered}")
            else:
                evidence_lines.append(f"- **《{item_title}》** {heading}\n {rendered}")
    if not evidence_lines:
        # No structured evidence: fall back to excerpts of the first two hits.
        for item in hits[:2]:
            item_title = str(item.get("title") or item.get("document_name") or "相关制度").strip()
            excerpt = (
                str(item.get("excerpt") or "").strip()
                or self._extract_excerpt(str(item.get("content") or ""))
            )
            if not excerpt:
                continue
            evidence_lines.append(f"- **《{item_title}》**{excerpt}")
    if not evidence_lines:
        self._append_markdown_section(
            answer_lines,
            "结论",
            [f"当前《{title}》里可用于回答这个问题的关键条款还不够明确。"],
        )
        self._append_markdown_section(
            answer_lines,
            "说明",
            ["- 请补充费用类型、适用地区、职级或具体业务场景,我再继续帮你缩小范围。"],
        )
        return "\n".join(answer_lines).strip()
    self._append_markdown_section(
        answer_lines,
        "结论",
        ["我先根据当前制度依据给出可以确认的部分。"],
    )
    self._append_markdown_section(answer_lines, "依据", evidence_lines)
    self._append_markdown_section(
        answer_lines,
        "说明",
        ["- 以上只使用当前命中的知识库证据;没有在证据中出现的适用条件或金额,我不会替你默认补齐。"],
    )
    return "\n".join(answer_lines).strip()