Files
X-Financial/server/src/app/services/user_agent_knowledge.py

736 lines
28 KiB
Python
Raw Normal View History

from __future__ import annotations
import re
from typing import Any
from app.schemas.user_agent import UserAgentCitation, UserAgentRequest
from app.services.user_agent_knowledge_helpers import UserAgentKnowledgeHelpersMixin
from app.services.user_agent_knowledge_constants import (
KNOWLEDGE_ARTICLE_PATTERN,
KNOWLEDGE_DIRECT_ANSWER_HINTS,
KNOWLEDGE_LIST_ITEM_PATTERN,
KNOWLEDGE_NUMBERED_ITEM_PATTERN,
KNOWLEDGE_QUERY_STOPWORDS,
KNOWLEDGE_SECTION_HEADING_PATTERN,
MAX_KNOWLEDGE_DIRECT_EVIDENCE,
MAX_KNOWLEDGE_MODEL_HITS,
MAX_KNOWLEDGE_QUERY_TERMS,
)
class UserAgentKnowledgeMixin(UserAgentKnowledgeHelpersMixin):
@staticmethod
def _build_model_tool_payload(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> dict[str, Any]:
    """Return a copy of ``tool_payload`` whose ``hits`` are trimmed for the model.

    The hits are taken from ``_select_knowledge_model_hits``; every dict hit
    is reduced to a fixed set of fields and non-dict hits are dropped.

    Args:
        tool_payload: Raw tool payload. A falsy value yields a payload that
            only contains the ``hits`` key.
        question: Optional user question forwarded to the hit selector.

    Returns:
        A shallow copy of the payload with ``hits`` replaced by compact dicts.
    """

    def text_of(hit: dict[str, Any], field: str) -> str:
        # Missing / falsy values collapse to an empty string.
        return str(hit.get(field) or "").strip()

    compact_payload = dict(tool_payload or {})
    selected_hits = UserAgentKnowledgeMixin._select_knowledge_model_hits(
        tool_payload,
        question=question,
    )
    compact_payload["hits"] = [
        {
            "title": text_of(hit, "title"),
            "document_name": text_of(hit, "document_name"),
            "excerpt": text_of(hit, "excerpt"),
            # Content is capped at 1200 characters, tags at 5, evidence at 3.
            "content": text_of(hit, "content")[:1200],
            "tags": list(hit.get("tags") or [])[:5],
            "evidence": list(hit.get("evidence") or [])[:3],
            "code": text_of(hit, "code"),
        }
        for hit in selected_hits
        if isinstance(hit, dict)
    ]
    return compact_payload
@staticmethod
def _build_knowledge_evidence_blocks(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> str:
    """Render the first three selected hits as fenced text evidence blocks.

    Args:
        tool_payload: Raw tool payload handed to ``_select_knowledge_model_hits``.
        question: Optional user question forwarded to the hit selector.

    Returns:
        The rendered blocks joined by a blank line, or ``""`` when no hit
        has usable content.
    """
    top_hits = UserAgentKnowledgeMixin._select_knowledge_model_hits(
        tool_payload,
        question=question,
    )[:3]
    rendered: list[str] = []
    # The number shown is the hit's position among the top three, so skipped
    # hits leave a gap in the numbering.
    for position, hit in enumerate(top_hits, start=1):
        if not isinstance(hit, dict):
            continue
        body = str(hit.get("content") or "").strip()
        if not body:
            continue
        label = str(
            hit.get("title") or hit.get("document_name") or f"证据 {position}"
        ).strip()
        code = str(hit.get("code") or "").strip()
        header = f"[证据 {position}] {label}"
        if code:
            header += f" ({code})"
        rendered.append(f"{header}\n```text\n{body[:1200]}\n```")
    return "\n\n".join(rendered)
def _build_fast_knowledge_answer(
self,
payload: UserAgentRequest,
*,
citations: list[UserAgentCitation],
) -> str | None:
if payload.ontology.scenario != "knowledge":
return None
if str(payload.tool_payload.get("result_type") or "").strip() != "knowledge_search":
return None
evidence_items = self._build_knowledge_answer_evidence(payload)
if not evidence_items:
return None
question = self._resolve_knowledge_question(payload)
if not self._should_use_direct_knowledge_answer(question, evidence_items):
return None
return self._render_knowledge_direct_answer(
payload,
citations=citations,
evidence_items=evidence_items,
)
def _render_knowledge_direct_answer(
self,
payload: UserAgentRequest,
*,
citations: list[UserAgentCitation],
evidence_items: list[dict[str, Any]],
) -> str | None:
if not evidence_items:
return None
title = str(
(citations[0].title if citations else "")
or evidence_items[0].get("title")
or "相关制度"
).strip()
user_name = str(payload.context_json.get("name") or "").strip()
question = self._resolve_knowledge_question(payload)
query_terms = self._extract_knowledge_query_terms(question)
ordered_evidence_items = self._prioritize_knowledge_evidence_items(question, evidence_items)
primary_item = ordered_evidence_items[0]
primary_heading = self._format_knowledge_heading_label(
str(primary_item.get("heading") or "").strip()
)
primary_lines = self._collect_direct_knowledge_answer_lines(
ordered_evidence_items,
query_terms=query_terms,
)
lines: list[str] = []
if user_name:
lines.append(f"{user_name},您好。")
source_prefix = f"根据《{title}"
if primary_heading:
source_prefix = f"{source_prefix}{primary_heading}"
conclusion_lines: list[str] = []
evidence_lines: list[str] = []
if str(primary_item.get("kind") or "") == "table":
table_content = str(primary_item.get("content") or "")
if self._question_requests_broad_knowledge_table(question):
table_preview = table_content.strip()
else:
table_preview = self._extract_relevant_table_preview(
table_content,
query_terms,
preferred_terms=self._build_knowledge_table_preferred_terms(payload),
)
table_summary = self._summarize_knowledge_table_preview(table_preview)
conclusion_lines.append(f"{source_prefix}{table_summary}")
evidence_lines.append(table_preview)
else:
if not primary_lines:
summary = self._summarize_knowledge_evidence_content(primary_item, query_terms)
conclusion_lines.append(
f"{source_prefix},当前能直接确认的是:"
f"{summary}"
)
elif len(primary_lines) == 1:
conclusion_lines.append(f"{source_prefix},当前能直接确认的是:{primary_lines[0].strip()}")
evidence_lines.extend(primary_lines)
else:
subject = self._build_knowledge_answer_subject(question, primary_heading)
summary = self._summarize_knowledge_lines_conclusion(
primary_lines,
heading=subject,
)
if summary:
conclusion_lines.append(f"{source_prefix}{summary}")
else:
conclusion_lines.append(f"{source_prefix},当前能直接确认的是:")
evidence_lines.extend(primary_lines)
notes: list[str] = []
location_note = self._build_missing_location_grounding_note(question, evidence_items)
if location_note:
notes.append(location_note)
if self._question_requires_explicit_condition(question) and not self._answer_evidence_has_numeric_or_condition(evidence_items):
notes.append("当前命中的证据更偏规则说明或流程约束,还没有直接给出可立即套用的数值或完整条件。")
self._append_markdown_section(lines, "结论", conclusion_lines)
self._append_markdown_section(lines, "依据", evidence_lines)
if notes:
self._append_markdown_section(lines, "说明", [f"- {note}" for note in notes])
return "\n".join(line for line in lines if line is not None).strip()
@staticmethod
def _append_markdown_section(lines: list[str], title: str, body_lines: list[str]) -> None:
cleaned = [str(line or "").rstrip() for line in body_lines if str(line or "").strip()]
if not cleaned:
return
if lines and lines[-1] != "":
lines.append("")
lines.append(f"## {title}")
lines.append("")
lines.extend(cleaned)
@staticmethod
def _build_knowledge_answer_subject(question: str, heading: str = "") -> str:
clean_heading = str(heading or "").strip()
if clean_heading and not any(
marker in clean_heading
for marker in ("问答线索补充", "结构化表格补充", "重点章节摘录", "章节导航")
):
return clean_heading
normalized = re.sub(r"\s+", "", str(question or "").strip())
normalized = re.sub(r"[?。.!]+$", "", normalized)
normalized = re.sub(r"(是什么|有哪些|是多少|如何|怎么|吗|呢)$", "", normalized)
return normalized.strip(":,。.")
@staticmethod
def _build_knowledge_table_preferred_terms(payload: UserAgentRequest) -> list[str]:
terms: list[str] = []
context = payload.context_json or {}
for key in ("grade", "position", "job_grade", "rank", "level"):
value = str(context.get(key) or "").strip()
if value and value not in terms:
terms.append(value)
grade_match = re.fullmatch(r"[Pp](\d{1,2})", str(context.get("grade") or "").strip())
if grade_match:
grade = int(grade_match.group(1))
for start in range(max(0, grade - 4), grade + 1):
for end in range(grade, min(12, grade + 4) + 1):
if start >= end:
continue
for separator in ("", "~", "-", ""):
term = f"P{start}{separator}P{end}"
if term not in terms:
terms.append(term)
return terms
@staticmethod
def _resolve_knowledge_question(payload: UserAgentRequest) -> str:
return str(payload.context_json.get("user_input_text") or payload.message or "").strip()
@staticmethod
def _looks_like_structured_knowledge_query(question: str) -> bool:
normalized = str(question or "").strip()
if not normalized:
return False
return any(keyword in normalized for keyword in KNOWLEDGE_DIRECT_ANSWER_HINTS)
def _should_use_direct_knowledge_answer(
self,
question: str,
evidence_items: list[dict[str, Any]],
) -> bool:
if not evidence_items:
return False
if self._looks_like_structured_knowledge_query(question):
return True
return str(evidence_items[0].get("kind") or "") in {"table", "kv", "list", "clause"}
def _build_knowledge_answer_evidence(
    self,
    payload: UserAgentRequest,
) -> list[dict[str, Any]]:
    """Extract, rank and de-duplicate evidence for a knowledge answer.

    Candidates from every selected dict hit are ordered by score (highest
    first, shorter content first on ties). After the first pick, candidates
    scoring below ``max(6.0, top_score - 14)`` are skipped, as are
    candidates repeating an already seen (title, heading, cleaned content
    prefix) triple. Selection stops at ``MAX_KNOWLEDGE_DIRECT_EVIDENCE``.

    Args:
        payload: The agent request supplying the question and tool payload.

    Returns:
        The selected evidence dicts in rank order.
    """
    question = self._resolve_knowledge_question(payload)
    query_terms = self._extract_knowledge_query_terms(question)

    pool: list[dict[str, Any]] = []
    for hit in self._select_knowledge_model_hits(payload.tool_payload, question=question):
        if isinstance(hit, dict):
            pool.extend(self._extract_knowledge_evidence_candidates(hit, query_terms))
    if not pool:
        return []

    def score_of(entry: dict[str, Any]) -> float:
        return float(entry.get("score") or 0)

    def rank_key(entry: dict[str, Any]) -> tuple[float, int]:
        return score_of(entry), -len(str(entry.get("content") or ""))

    ranked = sorted(pool, key=rank_key, reverse=True)
    score_floor = max(6.0, score_of(ranked[0]) - 14)

    selected: list[dict[str, Any]] = []
    fingerprints: set[tuple[str, str, str]] = set()
    for entry in ranked:
        # The best candidate is always kept, whatever its score.
        if selected and score_of(entry) < score_floor:
            continue
        fingerprint = (
            str(entry.get("title") or "").strip(),
            str(entry.get("heading") or "").strip(),
            self._clean_knowledge_segment_text(str(entry.get("content") or ""))[:180],
        )
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        selected.append(entry)
        if len(selected) >= MAX_KNOWLEDGE_DIRECT_EVIDENCE:
            break
    return selected
def _extract_knowledge_evidence_candidates(
self,
hit: dict[str, Any],
query_terms: list[str],
) -> list[dict[str, Any]]:
title = str(hit.get("title") or hit.get("document_name") or "相关制度").strip()
content = str(hit.get("content") or "").strip()
if not content:
return []
raw_candidates = self._merge_knowledge_lead_in_segments(
self._split_knowledge_hit_into_segments(content)
)
candidates: list[dict[str, Any]] = []
for item in raw_candidates:
score = self._score_knowledge_evidence_candidate(item, query_terms)
if query_terms and score <= 0:
continue
normalized = dict(item)
normalized["title"] = title
normalized["score"] = score
candidates.append(normalized)
if candidates:
return candidates
fallback_text = str(hit.get("excerpt") or "").strip() or self._extract_excerpt(content)
if not fallback_text:
return []
return [
{
"title": title,
"heading": "",
"kind": "paragraph",
"content": fallback_text,
"score": 1,
}
]
def _merge_knowledge_lead_in_segments(
self,
segments: list[dict[str, str]],
) -> list[dict[str, str]]:
if not segments:
return []
merged: list[dict[str, str]] = []
index = 0
while index < len(segments):
current = dict(segments[index])
if not self._is_knowledge_lead_in_segment(current):
merged.append(current)
index += 1
continue
base_heading = str(current.get("heading") or "").strip()
current_marker = self._extract_knowledge_marker_family(str(current.get("content") or ""))
follow_segments: list[dict[str, str]] = []
next_index = index + 1
while next_index < len(segments):
candidate = segments[next_index]
if str(candidate.get("heading") or "").strip() != base_heading:
break
candidate_kind = str(candidate.get("kind") or "").strip()
candidate_content = str(candidate.get("content") or "").strip()
candidate_marker = self._extract_knowledge_marker_family(candidate_content)
if not candidate_content or candidate_kind == "table":
break
if current_marker and candidate_marker == current_marker:
break
if self._is_knowledge_lead_in_segment(candidate) and follow_segments:
break
if candidate_kind not in {"list", "paragraph", "kv", "clause"}:
break
follow_segments.append(candidate)
next_index += 1
if len(follow_segments) >= 4:
break
if candidate_kind == "paragraph" and len(candidate_content) >= 200:
break
if follow_segments:
current["content"] = "\n".join(
[str(current.get("content") or "").strip()]
+ [str(item.get("content") or "").strip() for item in follow_segments]
)
if any(str(item.get("kind") or "").strip() == "list" for item in follow_segments):
current["kind"] = "list"
merged.append(current)
index = next_index
continue
merged.append(current)
index += 1
return merged
def _split_knowledge_hit_into_segments(self, content: str) -> list[dict[str, str]]:
segments: list[dict[str, str]] = []
markdown_headings: list[str] = []
section_heading = ""
paragraph_lines: list[str] = []
table_lines: list[str] = []
def current_heading() -> str:
heading_parts = [item for item in markdown_headings if item]
if section_heading:
heading_parts.append(section_heading)
return " > ".join(heading_parts)
def flush_paragraph() -> None:
nonlocal paragraph_lines
if not paragraph_lines:
return
merged = " ".join(line.strip() for line in paragraph_lines if line.strip()).strip()
paragraph_lines = []
if merged:
segments.append(
{
"heading": current_heading(),
"kind": "paragraph",
"content": merged,
}
)
def flush_table() -> None:
nonlocal table_lines
if not table_lines:
return
merged = "\n".join(line.rstrip() for line in table_lines if line.strip()).strip()
table_lines = []
if merged:
segments.append(
{
"heading": current_heading(),
"kind": "table",
"content": merged,
}
)
for raw_line in str(content or "").replace("\r\n", "\n").replace("\r", "\n").splitlines():
line = raw_line.rstrip()
stripped = line.strip()
if not stripped:
flush_paragraph()
flush_table()
continue
markdown_heading_match = re.match(r"^(#{1,6})\s+(.+)$", stripped)
if markdown_heading_match:
flush_paragraph()
flush_table()
level = len(markdown_heading_match.group(1))
heading_text = markdown_heading_match.group(2).strip()
markdown_headings = markdown_headings[: max(0, level - 1)]
markdown_headings.append(heading_text)
section_heading = ""
continue
if KNOWLEDGE_SECTION_HEADING_PATTERN.match(stripped) and len(stripped) <= 90:
flush_paragraph()
flush_table()
section_heading = stripped.lstrip("#").strip()
continue
if stripped.count("|") >= 2 and "|" in stripped:
flush_paragraph()
table_lines.append(stripped)
continue
flush_table()
if KNOWLEDGE_LIST_ITEM_PATTERN.match(stripped):
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "list",
"content": stripped,
}
)
continue
if KNOWLEDGE_NUMBERED_ITEM_PATTERN.match(stripped):
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "list",
"content": stripped,
}
)
continue
if KNOWLEDGE_ARTICLE_PATTERN.match(stripped):
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "clause",
"content": stripped,
}
)
continue
if ("" in stripped or ":" in stripped) and len(stripped) <= 180:
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "kv",
"content": stripped,
}
)
continue
paragraph_lines.append(stripped)
flush_paragraph()
flush_table()
return segments
def _render_knowledge_evidence_text(self, item: dict[str, Any]) -> str:
    """Render an evidence item's content as display text.

    Args:
        item: Evidence dict; only ``content`` is read.

    Returns:
        ``""`` when the content yields no lines, the cleaned text for a
        single line, or each line prefixed with one space and joined by
        newlines for several lines.
    """
    cleaned_lines = self._split_clean_knowledge_lines(
        str(item.get("content") or ""),
        preserve_marker=True,
    )
    if not cleaned_lines:
        return ""
    if len(cleaned_lines) != 1:
        return "\n".join(f" {entry}" for entry in cleaned_lines)
    return self._clean_knowledge_segment_text(cleaned_lines[0])
def _collect_direct_knowledge_answer_lines(
self,
ordered_evidence_items: list[dict[str, Any]],
*,
query_terms: list[str] | None = None,
) -> list[str]:
if not ordered_evidence_items:
return []
primary_item = ordered_evidence_items[0]
primary_title = str(primary_item.get("title") or "").strip()
primary_heading = str(primary_item.get("heading") or "").strip()
primary_kind = str(primary_item.get("kind") or "").strip()
related_items = [primary_item]
if primary_kind != "table":
for item in ordered_evidence_items[1:]:
if len(related_items) >= 3:
break
if str(item.get("kind") or "").strip() != primary_kind:
continue
if str(item.get("title") or "").strip() != primary_title:
continue
if str(item.get("heading") or "").strip() != primary_heading:
continue
related_items.append(item)
lines: list[str] = []
seen: set[str] = set()
for item in related_items:
item_kind = str(item.get("kind") or "").strip()
item_content = str(item.get("content") or "")
if item_kind == "paragraph" or self._has_inline_numbered_knowledge_items(item_content):
rendered = self._focus_knowledge_segment_content(
item_content,
query_terms or [],
)
rendered_lines = self._split_inline_numbered_knowledge_items(rendered)
else:
rendered = self._render_knowledge_evidence_text(item)
rendered_lines = rendered.splitlines()
for line in rendered_lines:
normalized = str(line or "").strip()
if not normalized or normalized in seen:
continue
seen.add(normalized)
lines.append(line)
return lines
def _summarize_knowledge_evidence_content(
    self,
    item: dict[str, Any],
    query_terms: list[str],
) -> str:
    """Produce a one-line summary of an evidence item.

    Args:
        item: Evidence dict; ``kind`` and ``content`` are read.
        query_terms: Terms used to pick the relevant table preview.

    Returns:
        For tables, one of two fixed sentences depending on whether the
        preview has at least three non-blank rows. Otherwise the cleaned
        text of the first line joined with up to three following lines, or
        of the whole content when it has fewer than two lines.
    """
    content = str(item.get("content") or "").strip()
    if str(item.get("kind") or "").strip() == "table":
        preview = self._extract_relevant_table_preview(content, query_terms)
        populated_rows = sum(1 for row in preview.splitlines() if row.strip())
        if populated_rows >= 3:
            return "当前命中的直接依据是一张与问题强相关的标准表,已摘出最相关的表头和行。"
        return "当前命中的直接依据是一张与问题强相关的标准表。"

    cleaned_lines = self._split_clean_knowledge_lines(content, preserve_marker=True)
    if len(cleaned_lines) < 2:
        return self._clean_knowledge_segment_text(content)
    first_line = cleaned_lines[0]
    following = " ".join(cleaned_lines[1:4])
    return self._clean_knowledge_segment_text(f"{first_line} {following}")
def _build_missing_location_grounding_note(
    self,
    question: str,
    evidence_items: list[dict[str, Any]],
) -> str:
    """Build a caveat when the asked-about location is absent from the evidence.

    Args:
        question: The user's question text.
        evidence_items: Evidence dicts whose headings and contents are searched.

    Returns:
        ``""`` when the question names no location or the location occurs
        in the evidence text; otherwise a note saying the evidence does not
        map that location to a table column.
    """
    location = self._extract_query_location(question)
    if not location:
        return ""

    searchable_parts: list[str] = []
    for entry in evidence_items:
        searchable_parts.append(str(entry.get("heading") or ""))
        searchable_parts.append(str(entry.get("content") or ""))
    if location in "\n".join(searchable_parts):
        return ""

    return (
        f"当前命中的制度依据没有直接写出“{location}”对应的地区档位或映射关系,"
        "因此不能直接把它套用到表格中的某一列。"
    )
def _build_knowledge_search_answer(
self,
payload: UserAgentRequest,
citations: list[UserAgentCitation],
) -> str:
hits = [item for item in list(payload.tool_payload.get("hits") or []) if isinstance(item, dict)]
evidence_items = self._build_knowledge_answer_evidence(payload)
primary_citation = citations[0] if citations else None
title = str(
(primary_citation.title if primary_citation else "")
or (hits[0].get("title") if hits else "")
or "相关制度"
).strip()
user_name = str(payload.context_json.get("name") or "").strip()
answer_lines: list[str] = []
if user_name:
answer_lines.append(f"{user_name},您好。")
if not hits:
self._append_markdown_section(
answer_lines,
"结论",
[f"当前没有拿到可用于回答这个问题的《{title}》知识库命中。"],
)
self._append_markdown_section(
answer_lines,
"说明",
["- 我不会用相似主题或外部常识硬凑答案;请补充更具体的关键词后再试一次。"],
)
return "\n".join(answer_lines).strip()
evidence_lines: list[str] = []
for item in evidence_items[:3]:
heading = str(item.get("heading") or "").strip()
if "表格行级检索线索" in heading:
heading = heading.replace("表格行级检索线索", "").strip(" >")
heading_text = f"{heading}" if heading else ""
item_title = item.get("title") or title
if str(item.get("kind") or "") == "table":
preview = self._extract_relevant_table_preview(
str(item.get("content") or ""),
self._extract_knowledge_query_terms(self._resolve_knowledge_question(payload)),
)
evidence_lines.append(f"- **《{item_title}》** {heading_text}\n{preview}")
continue
rendered = self._render_knowledge_evidence_text(item)
if rendered:
if "\n" in rendered:
evidence_lines.append(f"- **《{item_title}》** {heading_text}\n{rendered}")
else:
evidence_lines.append(f"- **《{item_title}》** {heading_text}\n {rendered}")
if not evidence_lines:
for item in hits[:2]:
item_title = str(item.get("title") or item.get("document_name") or "相关制度").strip()
excerpt = (
str(item.get("excerpt") or "").strip()
or self._extract_excerpt(str(item.get("content") or ""))
)
if not excerpt:
continue
evidence_lines.append(f"- **《{item_title}》**{excerpt}")
if not evidence_lines:
self._append_markdown_section(
answer_lines,
"结论",
[f"当前《{title}》里可用于回答这个问题的关键条款还不够明确。"],
)
self._append_markdown_section(
answer_lines,
"说明",
["- 请补充费用类型、适用地区、职级或具体业务场景,我会继续帮您缩小范围。"],
)
return "\n".join(answer_lines).strip()
self._append_markdown_section(
answer_lines,
"结论",
["我先根据当前制度依据给出可以确认的部分。"],
)
self._append_markdown_section(answer_lines, "依据", evidence_lines)
self._append_markdown_section(
answer_lines,
"说明",
["- 以上只使用当前命中的知识库证据;没有在证据中出现的适用条件或金额,我不会替您默认补齐。"],
)
return "\n".join(answer_lines).strip()