Files
X-Financial/server/src/app/services/user_agent_knowledge.py

628 lines
23 KiB
Python
Raw Normal View History

from __future__ import annotations
import re
from typing import Any
from app.schemas.user_agent import UserAgentCitation, UserAgentRequest
from app.services.user_agent_knowledge_helpers import UserAgentKnowledgeHelpersMixin
from app.services.user_agent_knowledge_constants import (
KNOWLEDGE_ARTICLE_PATTERN,
KNOWLEDGE_DIRECT_ANSWER_HINTS,
KNOWLEDGE_LIST_ITEM_PATTERN,
KNOWLEDGE_NUMBERED_ITEM_PATTERN,
KNOWLEDGE_QUERY_STOPWORDS,
KNOWLEDGE_SECTION_HEADING_PATTERN,
MAX_KNOWLEDGE_DIRECT_EVIDENCE,
MAX_KNOWLEDGE_MODEL_HITS,
MAX_KNOWLEDGE_QUERY_TERMS,
)
class UserAgentKnowledgeMixin(UserAgentKnowledgeHelpersMixin):
@staticmethod
def _build_model_tool_payload(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> dict[str, Any]:
    """Return a copy of ``tool_payload`` whose ``hits`` are trimmed for the model.

    The hits are taken from ``_select_knowledge_model_hits``. Entries that
    are not dicts are dropped, and each kept hit is reduced to a fixed set
    of fields with bounded sizes (content: 1200 characters, tags: 5 items,
    evidence: 3 items).

    Args:
        tool_payload: Raw tool result. A falsy value yields an empty copy
            for the top-level keys.
        question: Optional user question forwarded to the hit selector.

    Returns:
        A new dict holding the original top-level keys plus the rebuilt
        ``hits`` list.
    """

    def text_field(record: dict[str, Any], key: str) -> str:
        # Missing keys, None and other falsy values all collapse to "".
        return str(record.get(key) or "").strip()

    result = dict(tool_payload or {})
    selected = UserAgentKnowledgeMixin._select_knowledge_model_hits(
        tool_payload,
        question=question,
    )
    result["hits"] = [
        {
            "title": text_field(entry, "title"),
            "document_name": text_field(entry, "document_name"),
            "excerpt": text_field(entry, "excerpt"),
            "content": text_field(entry, "content")[:1200],
            "tags": list(entry.get("tags") or [])[:5],
            "evidence": list(entry.get("evidence") or [])[:3],
            "code": text_field(entry, "code"),
        }
        for entry in selected
        if isinstance(entry, dict)
    ]
    return result
@staticmethod
def _build_knowledge_evidence_blocks(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> str:
    """Render the first three selected hits as fenced evidence blocks.

    Every block consists of a ``[证据 N] title (code)`` header followed by
    the hit content (at most 1200 characters) inside a fenced ``text``
    block. ``N`` is the hit's 1-based position among the first three
    selected hits, so a skipped hit leaves a gap in the numbering.

    Args:
        tool_payload: Raw tool result passed to the hit selector.
        question: Optional user question forwarded to the hit selector.

    Returns:
        The blocks joined by blank lines, or ``""`` when no hit has content.
    """
    top_hits = UserAgentKnowledgeMixin._select_knowledge_model_hits(
        tool_payload,
        question=question,
    )[:3]
    rendered: list[str] = []
    for position, hit in enumerate(top_hits, start=1):
        if not isinstance(hit, dict):
            continue
        body = str(hit.get("content") or "").strip()
        if not body:
            # A hit without content contributes nothing to the prompt.
            continue
        label = str(
            hit.get("title") or hit.get("document_name") or f"证据 {position}"
        ).strip()
        code = str(hit.get("code") or "").strip()
        header = f"[证据 {position}] {label}"
        if code:
            header += f" ({code})"
        rendered.append(f"{header}\n```text\n{body[:1200]}\n```")
    return "\n\n".join(rendered)
def _build_fast_knowledge_answer(
self,
payload: UserAgentRequest,
*,
citations: list[UserAgentCitation],
) -> str | None:
if payload.ontology.scenario != "knowledge":
return None
if str(payload.tool_payload.get("result_type") or "").strip() != "knowledge_search":
return None
evidence_items = self._build_knowledge_answer_evidence(payload)
if not evidence_items:
return None
question = self._resolve_knowledge_question(payload)
if not self._should_use_direct_knowledge_answer(question, evidence_items):
return None
return self._render_knowledge_direct_answer(
payload,
citations=citations,
evidence_items=evidence_items,
)
def _render_knowledge_direct_answer(
self,
payload: UserAgentRequest,
*,
citations: list[UserAgentCitation],
evidence_items: list[dict[str, Any]],
) -> str | None:
if not evidence_items:
return None
title = str(
(citations[0].title if citations else "")
or evidence_items[0].get("title")
or "相关制度"
).strip()
user_name = str(payload.context_json.get("name") or "").strip()
question = self._resolve_knowledge_question(payload)
query_terms = self._extract_knowledge_query_terms(question)
ordered_evidence_items = self._prioritize_knowledge_evidence_items(question, evidence_items)
primary_item = ordered_evidence_items[0]
primary_heading = self._format_knowledge_heading_label(
str(primary_item.get("heading") or "").strip()
)
primary_lines = self._collect_direct_knowledge_answer_lines(ordered_evidence_items)
lines: list[str] = []
if user_name:
lines.append(f"{user_name},您好。")
source_prefix = f"根据《{title}"
if primary_heading:
source_prefix = f"{source_prefix}{primary_heading}"
if str(primary_item.get("kind") or "") == "table":
lines.append(f"{source_prefix},当前能直接确认的是:")
lines.append(self._extract_relevant_table_preview(str(primary_item.get("content") or ""), query_terms))
else:
if not primary_lines:
lines.append(
f"{source_prefix},当前能直接确认的是:"
f"{self._summarize_knowledge_evidence_content(primary_item, query_terms)}"
)
elif len(primary_lines) == 1:
lines.append(f"{source_prefix},当前能直接确认的是:{primary_lines[0].strip()}")
else:
lines.append(f"{source_prefix},当前能直接确认的是:")
lines.extend(primary_lines)
notes: list[str] = []
location_note = self._build_missing_location_grounding_note(question, evidence_items)
if location_note:
notes.append(location_note)
if self._question_requires_explicit_condition(question) and not self._answer_evidence_has_numeric_or_condition(evidence_items):
notes.append("当前命中的证据更偏规则说明或流程约束,还没有直接给出可立即套用的数值或完整条件。")
if notes:
lines.append("")
lines.append("说明:")
lines.extend(f"- {note}" for note in notes)
return "\n".join(line for line in lines if line is not None).strip()
@staticmethod
def _resolve_knowledge_question(payload: UserAgentRequest) -> str:
return str(payload.context_json.get("user_input_text") or payload.message or "").strip()
@staticmethod
def _looks_like_structured_knowledge_query(question: str) -> bool:
normalized = str(question or "").strip()
if not normalized:
return False
return any(keyword in normalized for keyword in KNOWLEDGE_DIRECT_ANSWER_HINTS)
def _should_use_direct_knowledge_answer(
self,
question: str,
evidence_items: list[dict[str, Any]],
) -> bool:
if not evidence_items:
return False
if self._looks_like_structured_knowledge_query(question):
return True
return str(evidence_items[0].get("kind") or "") in {"table", "kv", "list", "clause"}
def _build_knowledge_answer_evidence(
self,
payload: UserAgentRequest,
) -> list[dict[str, Any]]:
question = self._resolve_knowledge_question(payload)
query_terms = self._extract_knowledge_query_terms(question)
candidates: list[dict[str, Any]] = []
for hit in self._select_knowledge_model_hits(
payload.tool_payload,
question=question,
):
if not isinstance(hit, dict):
continue
candidates.extend(self._extract_knowledge_evidence_candidates(hit, query_terms))
deduped: list[dict[str, Any]] = []
seen: set[tuple[str, str, str]] = set()
ranked_candidates = sorted(
candidates,
key=lambda value: (
float(value.get("score") or 0),
-len(str(value.get("content") or "")),
),
reverse=True,
)
top_score = float(ranked_candidates[0].get("score") or 0) if ranked_candidates else 0.0
for item in ranked_candidates:
score = float(item.get("score") or 0)
if deduped and score < max(6.0, top_score - 14):
continue
key = (
str(item.get("title") or "").strip(),
str(item.get("heading") or "").strip(),
self._clean_knowledge_segment_text(str(item.get("content") or ""))[:180],
)
if key in seen:
continue
seen.add(key)
deduped.append(item)
if len(deduped) >= MAX_KNOWLEDGE_DIRECT_EVIDENCE:
break
return deduped
def _extract_knowledge_evidence_candidates(
self,
hit: dict[str, Any],
query_terms: list[str],
) -> list[dict[str, Any]]:
title = str(hit.get("title") or hit.get("document_name") or "相关制度").strip()
content = str(hit.get("content") or "").strip()
if not content:
return []
raw_candidates = self._merge_knowledge_lead_in_segments(
self._split_knowledge_hit_into_segments(content)
)
candidates: list[dict[str, Any]] = []
for item in raw_candidates:
score = self._score_knowledge_evidence_candidate(item, query_terms)
if query_terms and score <= 0:
continue
normalized = dict(item)
normalized["title"] = title
normalized["score"] = score
candidates.append(normalized)
if candidates:
return candidates
fallback_text = str(hit.get("excerpt") or "").strip() or self._extract_excerpt(content)
if not fallback_text:
return []
return [
{
"title": title,
"heading": "",
"kind": "paragraph",
"content": fallback_text,
"score": 1,
}
]
def _merge_knowledge_lead_in_segments(
self,
segments: list[dict[str, str]],
) -> list[dict[str, str]]:
if not segments:
return []
merged: list[dict[str, str]] = []
index = 0
while index < len(segments):
current = dict(segments[index])
if not self._is_knowledge_lead_in_segment(current):
merged.append(current)
index += 1
continue
base_heading = str(current.get("heading") or "").strip()
current_marker = self._extract_knowledge_marker_family(str(current.get("content") or ""))
follow_segments: list[dict[str, str]] = []
next_index = index + 1
while next_index < len(segments):
candidate = segments[next_index]
if str(candidate.get("heading") or "").strip() != base_heading:
break
candidate_kind = str(candidate.get("kind") or "").strip()
candidate_content = str(candidate.get("content") or "").strip()
candidate_marker = self._extract_knowledge_marker_family(candidate_content)
if not candidate_content or candidate_kind == "table":
break
if current_marker and candidate_marker == current_marker:
break
if self._is_knowledge_lead_in_segment(candidate) and follow_segments:
break
if candidate_kind not in {"list", "paragraph", "kv", "clause"}:
break
follow_segments.append(candidate)
next_index += 1
if len(follow_segments) >= 4:
break
if candidate_kind == "paragraph" and len(candidate_content) >= 200:
break
if follow_segments:
current["content"] = "\n".join(
[str(current.get("content") or "").strip()]
+ [str(item.get("content") or "").strip() for item in follow_segments]
)
if any(str(item.get("kind") or "").strip() == "list" for item in follow_segments):
current["kind"] = "list"
merged.append(current)
index = next_index
continue
merged.append(current)
index += 1
return merged
def _split_knowledge_hit_into_segments(self, content: str) -> list[dict[str, str]]:
segments: list[dict[str, str]] = []
markdown_headings: list[str] = []
section_heading = ""
paragraph_lines: list[str] = []
table_lines: list[str] = []
def current_heading() -> str:
heading_parts = [item for item in markdown_headings if item]
if section_heading:
heading_parts.append(section_heading)
return " > ".join(heading_parts)
def flush_paragraph() -> None:
nonlocal paragraph_lines
if not paragraph_lines:
return
merged = " ".join(line.strip() for line in paragraph_lines if line.strip()).strip()
paragraph_lines = []
if merged:
segments.append(
{
"heading": current_heading(),
"kind": "paragraph",
"content": merged,
}
)
def flush_table() -> None:
nonlocal table_lines
if not table_lines:
return
merged = "\n".join(line.rstrip() for line in table_lines if line.strip()).strip()
table_lines = []
if merged:
segments.append(
{
"heading": current_heading(),
"kind": "table",
"content": merged,
}
)
for raw_line in str(content or "").replace("\r\n", "\n").replace("\r", "\n").splitlines():
line = raw_line.rstrip()
stripped = line.strip()
if not stripped:
flush_paragraph()
flush_table()
continue
markdown_heading_match = re.match(r"^(#{1,6})\s+(.+)$", stripped)
if markdown_heading_match:
flush_paragraph()
flush_table()
level = len(markdown_heading_match.group(1))
heading_text = markdown_heading_match.group(2).strip()
markdown_headings = markdown_headings[: max(0, level - 1)]
markdown_headings.append(heading_text)
section_heading = ""
continue
if KNOWLEDGE_SECTION_HEADING_PATTERN.match(stripped) and len(stripped) <= 90:
flush_paragraph()
flush_table()
section_heading = stripped.lstrip("#").strip()
continue
if stripped.count("|") >= 2 and "|" in stripped:
flush_paragraph()
table_lines.append(stripped)
continue
flush_table()
if KNOWLEDGE_LIST_ITEM_PATTERN.match(stripped):
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "list",
"content": stripped,
}
)
continue
if KNOWLEDGE_NUMBERED_ITEM_PATTERN.match(stripped):
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "list",
"content": stripped,
}
)
continue
if KNOWLEDGE_ARTICLE_PATTERN.match(stripped):
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "clause",
"content": stripped,
}
)
continue
if ("" in stripped or ":" in stripped) and len(stripped) <= 180:
flush_paragraph()
segments.append(
{
"heading": current_heading(),
"kind": "kv",
"content": stripped,
}
)
continue
paragraph_lines.append(stripped)
flush_paragraph()
flush_table()
return segments
def _render_knowledge_evidence_text(self, item: dict[str, Any]) -> str:
lines = self._split_clean_knowledge_lines(
str(item.get("content") or ""),
preserve_marker=True,
)
if not lines:
return ""
if len(lines) == 1:
return self._clean_knowledge_segment_text(lines[0])
return "\n".join(f" {line}" for line in lines)
def _collect_direct_knowledge_answer_lines(
self,
ordered_evidence_items: list[dict[str, Any]],
) -> list[str]:
if not ordered_evidence_items:
return []
primary_item = ordered_evidence_items[0]
primary_title = str(primary_item.get("title") or "").strip()
primary_heading = str(primary_item.get("heading") or "").strip()
primary_kind = str(primary_item.get("kind") or "").strip()
related_items = [primary_item]
if primary_kind != "table":
for item in ordered_evidence_items[1:]:
if len(related_items) >= 3:
break
if str(item.get("kind") or "").strip() != primary_kind:
continue
if str(item.get("title") or "").strip() != primary_title:
continue
if str(item.get("heading") or "").strip() != primary_heading:
continue
related_items.append(item)
lines: list[str] = []
seen: set[str] = set()
for item in related_items:
rendered = self._render_knowledge_evidence_text(item)
for line in rendered.splitlines():
normalized = str(line or "").strip()
if not normalized or normalized in seen:
continue
seen.add(normalized)
lines.append(line)
return lines
def _summarize_knowledge_evidence_content(
self,
item: dict[str, Any],
query_terms: list[str],
) -> str:
kind = str(item.get("kind") or "").strip()
content = str(item.get("content") or "").strip()
if kind == "table":
preview = self._extract_relevant_table_preview(content, query_terms)
preview_rows = [line for line in preview.splitlines() if line.strip()][:4]
if len(preview_rows) >= 3:
return "当前命中的直接依据是一张与问题强相关的标准表,已摘出最相关的表头和行。"
return "当前命中的直接依据是一张与问题强相关的标准表。"
lines = self._split_clean_knowledge_lines(content, preserve_marker=True)
if len(lines) >= 2:
return self._clean_knowledge_segment_text(f"{lines[0]} {' '.join(lines[1:4])}")
return self._clean_knowledge_segment_text(content)
def _build_missing_location_grounding_note(
self,
question: str,
evidence_items: list[dict[str, Any]],
) -> str:
location = self._extract_query_location(question)
if not location:
return ""
haystack = "\n".join(
str(item.get("heading") or "") + "\n" + str(item.get("content") or "")
for item in evidence_items
)
if location in haystack:
return ""
return (
f"当前命中的制度依据没有直接写出“{location}”对应的地区档位或映射关系,"
"因此不能直接把它套用到表格中的某一列。"
)
def _build_knowledge_search_answer(
self,
payload: UserAgentRequest,
citations: list[UserAgentCitation],
) -> str:
hits = [item for item in list(payload.tool_payload.get("hits") or []) if isinstance(item, dict)]
evidence_items = self._build_knowledge_answer_evidence(payload)
primary_citation = citations[0] if citations else None
title = str(
(primary_citation.title if primary_citation else "")
or (hits[0].get("title") if hits else "")
or "相关制度"
).strip()
user_name = str(payload.context_json.get("name") or "").strip()
prefix = f"{user_name},您好。\n" if user_name else ""
if not hits:
return (
f"{prefix}我已经从《{title}》中检索到与你这次问题相关的制度依据,"
"但本次答案生成环节暂时没有成功返回。请稍后重试一次;如果仍然失败,"
"建议先检查主对话模型的连通性。"
)
evidence_lines: list[str] = []
for item in evidence_items[:3]:
heading = str(item.get("heading") or "").strip()
heading_text = f" > {heading}" if heading else ""
if str(item.get("kind") or "") == "table":
preview = self._extract_relevant_table_preview(
str(item.get("content") or ""),
self._extract_knowledge_query_terms(self._resolve_knowledge_question(payload)),
)
evidence_lines.append(f"- 《{item.get('title') or title}{heading_text}\n{preview}")
continue
rendered = self._render_knowledge_evidence_text(item)
if rendered:
if "\n" in rendered:
evidence_lines.append(f"- 《{item.get('title') or title}{heading_text}\n{rendered}")
else:
evidence_lines.append(f"- 《{item.get('title') or title}{heading_text}{rendered}")
if not evidence_lines:
for item in hits[:2]:
item_title = str(item.get("title") or item.get("document_name") or "相关制度").strip()
excerpt = (
str(item.get("excerpt") or "").strip()
or self._extract_excerpt(str(item.get("content") or ""))
)
if not excerpt:
continue
evidence_lines.append(f"- 《{item_title}》:{excerpt}")
if not evidence_lines:
return (
f"{prefix}我已经从《{title}》中检索到与你这次问题相关的制度依据,"
"但本次答案生成环节暂时没有成功返回。请稍后重试一次;如果仍然失败,"
"建议先检查主对话模型的连通性。"
)
return "\n".join(
[
f"{prefix}我已经命中与你这次问题最相关的制度依据,但答案整理阶段本轮没有及时返回。",
"先给你当前最直接的依据:",
*evidence_lines,
"如果你希望我继续把这些依据整理成更完整的结论、步骤或对比说明,可以继续缩小问题范围后再问一次。",
]
).strip()