2026-05-22 10:42:31 +08:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
from app.schemas.user_agent import UserAgentCitation, UserAgentRequest
|
|
|
|
|
|
from app.services.user_agent_knowledge_helpers import UserAgentKnowledgeHelpersMixin
|
|
|
|
|
|
from app.services.user_agent_knowledge_constants import (
|
|
|
|
|
|
KNOWLEDGE_ARTICLE_PATTERN,
|
|
|
|
|
|
KNOWLEDGE_DIRECT_ANSWER_HINTS,
|
|
|
|
|
|
KNOWLEDGE_LIST_ITEM_PATTERN,
|
|
|
|
|
|
KNOWLEDGE_NUMBERED_ITEM_PATTERN,
|
|
|
|
|
|
KNOWLEDGE_QUERY_STOPWORDS,
|
|
|
|
|
|
KNOWLEDGE_SECTION_HEADING_PATTERN,
|
|
|
|
|
|
MAX_KNOWLEDGE_DIRECT_EVIDENCE,
|
|
|
|
|
|
MAX_KNOWLEDGE_MODEL_HITS,
|
|
|
|
|
|
MAX_KNOWLEDGE_QUERY_TERMS,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class UserAgentKnowledgeMixin(UserAgentKnowledgeHelpersMixin):
|
|
|
|
|
|
@staticmethod
def _build_model_tool_payload(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> dict[str, Any]:
    """Return a shallow copy of ``tool_payload`` with a trimmed ``hits`` list.

    The hits are taken from ``_select_knowledge_model_hits``; entries that are
    not dicts are dropped.  Every remaining hit is reduced to a fixed set of
    keys: text fields are stringified and stripped, ``content`` is capped at
    1200 characters, ``tags`` at 5 entries and ``evidence`` at 3 entries.

    Args:
        tool_payload: Raw tool result; a falsy value is copied as ``{}``.
        question: Optional question forwarded to the hit selector.

    Returns:
        A new dict holding the original top-level keys plus the rebuilt
        ``hits`` list.
    """

    def _text(raw: Any) -> str:
        # Falsy values (None, "", 0) collapse to the empty string.
        return str(raw or "").strip()

    result = dict(tool_payload or {})
    selected = UserAgentKnowledgeMixin._select_knowledge_model_hits(
        tool_payload,
        question=question,
    )
    result["hits"] = [
        {
            "title": _text(hit.get("title")),
            "document_name": _text(hit.get("document_name")),
            "excerpt": _text(hit.get("excerpt")),
            "content": _text(hit.get("content"))[:1200],
            "tags": list(hit.get("tags") or [])[:5],
            "evidence": list(hit.get("evidence") or [])[:3],
            "code": _text(hit.get("code")),
        }
        for hit in selected
        if isinstance(hit, dict)
    ]
    return result
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
def _build_knowledge_evidence_blocks(
    tool_payload: dict[str, Any],
    *,
    question: str | None = None,
) -> str:
    """Render up to three selected hits as fenced ``text`` evidence blocks.

    Each block is a header line ``[证据 N] <title>`` (followed by ``(<code>)``
    when the hit has a code) and the hit content, capped at 1200 characters,
    inside a fenced block.  Non-dict hits and hits without content are
    skipped; ``N`` is the hit's 1-based position among the first three
    selected hits, so skipped hits leave gaps in the numbering.

    Args:
        tool_payload: Raw tool result passed to the hit selector.
        question: Optional question forwarded to the hit selector.

    Returns:
        The blocks joined by a blank line, or ``""`` when none qualify.
    """
    top_hits = UserAgentKnowledgeMixin._select_knowledge_model_hits(
        tool_payload,
        question=question,
    )[:3]

    rendered: list[str] = []
    position = 0
    for hit in top_hits:
        position += 1
        if not isinstance(hit, dict):
            continue
        label = str(hit.get("title") or hit.get("document_name") or f"证据 {position}").strip()
        code = str(hit.get("code") or "").strip()
        body = str(hit.get("content") or "").strip()
        if not body:
            continue
        header = f"[证据 {position}] {label}"
        if code:
            header += f" ({code})"
        rendered.append("\n".join((header, "```text", body[:1200], "```")))
    return "\n\n".join(rendered)
|
|
|
|
|
|
|
|
|
|
|
|
def _build_fast_knowledge_answer(
|
|
|
|
|
|
self,
|
|
|
|
|
|
payload: UserAgentRequest,
|
|
|
|
|
|
*,
|
|
|
|
|
|
citations: list[UserAgentCitation],
|
|
|
|
|
|
) -> str | None:
|
|
|
|
|
|
if payload.ontology.scenario != "knowledge":
|
|
|
|
|
|
return None
|
|
|
|
|
|
if str(payload.tool_payload.get("result_type") or "").strip() != "knowledge_search":
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
evidence_items = self._build_knowledge_answer_evidence(payload)
|
|
|
|
|
|
if not evidence_items:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
question = self._resolve_knowledge_question(payload)
|
|
|
|
|
|
if not self._should_use_direct_knowledge_answer(question, evidence_items):
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
return self._render_knowledge_direct_answer(
|
|
|
|
|
|
payload,
|
|
|
|
|
|
citations=citations,
|
|
|
|
|
|
evidence_items=evidence_items,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _render_knowledge_direct_answer(
|
|
|
|
|
|
self,
|
|
|
|
|
|
payload: UserAgentRequest,
|
|
|
|
|
|
*,
|
|
|
|
|
|
citations: list[UserAgentCitation],
|
|
|
|
|
|
evidence_items: list[dict[str, Any]],
|
|
|
|
|
|
) -> str | None:
|
|
|
|
|
|
if not evidence_items:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
title = str(
|
|
|
|
|
|
(citations[0].title if citations else "")
|
|
|
|
|
|
or evidence_items[0].get("title")
|
|
|
|
|
|
or "相关制度"
|
|
|
|
|
|
).strip()
|
|
|
|
|
|
user_name = str(payload.context_json.get("name") or "").strip()
|
|
|
|
|
|
question = self._resolve_knowledge_question(payload)
|
|
|
|
|
|
query_terms = self._extract_knowledge_query_terms(question)
|
|
|
|
|
|
ordered_evidence_items = self._prioritize_knowledge_evidence_items(question, evidence_items)
|
|
|
|
|
|
primary_item = ordered_evidence_items[0]
|
|
|
|
|
|
primary_heading = self._format_knowledge_heading_label(
|
|
|
|
|
|
str(primary_item.get("heading") or "").strip()
|
|
|
|
|
|
)
|
2026-05-27 17:31:27 +08:00
|
|
|
|
primary_lines = self._collect_direct_knowledge_answer_lines(
|
|
|
|
|
|
ordered_evidence_items,
|
|
|
|
|
|
query_terms=query_terms,
|
|
|
|
|
|
)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
lines: list[str] = []
|
|
|
|
|
|
if user_name:
|
|
|
|
|
|
lines.append(f"{user_name},您好。")
|
|
|
|
|
|
source_prefix = f"根据《{title}》"
|
|
|
|
|
|
if primary_heading:
|
|
|
|
|
|
source_prefix = f"{source_prefix}({primary_heading})"
|
|
|
|
|
|
|
2026-05-27 17:31:27 +08:00
|
|
|
|
conclusion_lines: list[str] = []
|
|
|
|
|
|
evidence_lines: list[str] = []
|
2026-05-22 10:42:31 +08:00
|
|
|
|
if str(primary_item.get("kind") or "") == "table":
|
2026-05-27 17:31:27 +08:00
|
|
|
|
table_content = str(primary_item.get("content") or "")
|
|
|
|
|
|
if self._question_requests_broad_knowledge_table(question):
|
|
|
|
|
|
table_preview = table_content.strip()
|
|
|
|
|
|
else:
|
|
|
|
|
|
table_preview = self._extract_relevant_table_preview(
|
|
|
|
|
|
table_content,
|
|
|
|
|
|
query_terms,
|
|
|
|
|
|
preferred_terms=self._build_knowledge_table_preferred_terms(payload),
|
|
|
|
|
|
)
|
|
|
|
|
|
table_summary = self._summarize_knowledge_table_preview(table_preview)
|
|
|
|
|
|
conclusion_lines.append(f"{source_prefix},{table_summary}")
|
|
|
|
|
|
evidence_lines.append(table_preview)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
else:
|
|
|
|
|
|
if not primary_lines:
|
2026-05-27 17:31:27 +08:00
|
|
|
|
summary = self._summarize_knowledge_evidence_content(primary_item, query_terms)
|
|
|
|
|
|
conclusion_lines.append(
|
2026-05-22 10:42:31 +08:00
|
|
|
|
f"{source_prefix},当前能直接确认的是:"
|
2026-05-27 17:31:27 +08:00
|
|
|
|
f"{summary}"
|
2026-05-22 10:42:31 +08:00
|
|
|
|
)
|
|
|
|
|
|
elif len(primary_lines) == 1:
|
2026-05-27 17:31:27 +08:00
|
|
|
|
conclusion_lines.append(f"{source_prefix},当前能直接确认的是:{primary_lines[0].strip()}")
|
|
|
|
|
|
evidence_lines.extend(primary_lines)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
else:
|
2026-05-27 17:31:27 +08:00
|
|
|
|
subject = self._build_knowledge_answer_subject(question, primary_heading)
|
|
|
|
|
|
summary = self._summarize_knowledge_lines_conclusion(
|
|
|
|
|
|
primary_lines,
|
|
|
|
|
|
heading=subject,
|
|
|
|
|
|
)
|
|
|
|
|
|
if summary:
|
|
|
|
|
|
conclusion_lines.append(f"{source_prefix},{summary}")
|
|
|
|
|
|
else:
|
|
|
|
|
|
conclusion_lines.append(f"{source_prefix},当前能直接确认的是:")
|
|
|
|
|
|
evidence_lines.extend(primary_lines)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
notes: list[str] = []
|
|
|
|
|
|
location_note = self._build_missing_location_grounding_note(question, evidence_items)
|
|
|
|
|
|
if location_note:
|
|
|
|
|
|
notes.append(location_note)
|
|
|
|
|
|
if self._question_requires_explicit_condition(question) and not self._answer_evidence_has_numeric_or_condition(evidence_items):
|
|
|
|
|
|
notes.append("当前命中的证据更偏规则说明或流程约束,还没有直接给出可立即套用的数值或完整条件。")
|
|
|
|
|
|
|
2026-05-27 17:31:27 +08:00
|
|
|
|
self._append_markdown_section(lines, "结论", conclusion_lines)
|
|
|
|
|
|
self._append_markdown_section(lines, "依据", evidence_lines)
|
2026-05-22 10:42:31 +08:00
|
|
|
|
if notes:
|
2026-05-27 17:31:27 +08:00
|
|
|
|
self._append_markdown_section(lines, "说明", [f"- {note}" for note in notes])
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
return "\n".join(line for line in lines if line is not None).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-27 17:31:27 +08:00
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _append_markdown_section(lines: list[str], title: str, body_lines: list[str]) -> None:
|
|
|
|
|
|
cleaned = [str(line or "").rstrip() for line in body_lines if str(line or "").strip()]
|
|
|
|
|
|
if not cleaned:
|
|
|
|
|
|
return
|
|
|
|
|
|
if lines and lines[-1] != "":
|
|
|
|
|
|
lines.append("")
|
|
|
|
|
|
lines.append(f"## {title}")
|
|
|
|
|
|
lines.append("")
|
|
|
|
|
|
lines.extend(cleaned)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _build_knowledge_answer_subject(question: str, heading: str = "") -> str:
|
|
|
|
|
|
clean_heading = str(heading or "").strip()
|
|
|
|
|
|
if clean_heading and not any(
|
|
|
|
|
|
marker in clean_heading
|
|
|
|
|
|
for marker in ("问答线索补充", "结构化表格补充", "重点章节摘录", "章节导航")
|
|
|
|
|
|
):
|
|
|
|
|
|
return clean_heading
|
|
|
|
|
|
|
|
|
|
|
|
normalized = re.sub(r"\s+", "", str(question or "").strip())
|
|
|
|
|
|
normalized = re.sub(r"[??。.!!]+$", "", normalized)
|
|
|
|
|
|
normalized = re.sub(r"(是什么|有哪些|是多少|如何|怎么|吗|呢)$", "", normalized)
|
|
|
|
|
|
return normalized.strip("::,,。.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _build_knowledge_table_preferred_terms(payload: UserAgentRequest) -> list[str]:
|
|
|
|
|
|
terms: list[str] = []
|
|
|
|
|
|
context = payload.context_json or {}
|
|
|
|
|
|
for key in ("grade", "position", "job_grade", "rank", "level"):
|
|
|
|
|
|
value = str(context.get(key) or "").strip()
|
|
|
|
|
|
if value and value not in terms:
|
|
|
|
|
|
terms.append(value)
|
|
|
|
|
|
|
|
|
|
|
|
grade_match = re.fullmatch(r"[Pp](\d{1,2})", str(context.get("grade") or "").strip())
|
|
|
|
|
|
if grade_match:
|
|
|
|
|
|
grade = int(grade_match.group(1))
|
|
|
|
|
|
for start in range(max(0, grade - 4), grade + 1):
|
|
|
|
|
|
for end in range(grade, min(12, grade + 4) + 1):
|
|
|
|
|
|
if start >= end:
|
|
|
|
|
|
continue
|
|
|
|
|
|
for separator in ("~", "~", "-", "至"):
|
|
|
|
|
|
term = f"P{start}{separator}P{end}"
|
|
|
|
|
|
if term not in terms:
|
|
|
|
|
|
terms.append(term)
|
|
|
|
|
|
return terms
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-22 10:42:31 +08:00
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _resolve_knowledge_question(payload: UserAgentRequest) -> str:
|
|
|
|
|
|
return str(payload.context_json.get("user_input_text") or payload.message or "").strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _looks_like_structured_knowledge_query(question: str) -> bool:
|
|
|
|
|
|
normalized = str(question or "").strip()
|
|
|
|
|
|
if not normalized:
|
|
|
|
|
|
return False
|
|
|
|
|
|
return any(keyword in normalized for keyword in KNOWLEDGE_DIRECT_ANSWER_HINTS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _should_use_direct_knowledge_answer(
|
|
|
|
|
|
self,
|
|
|
|
|
|
question: str,
|
|
|
|
|
|
evidence_items: list[dict[str, Any]],
|
|
|
|
|
|
) -> bool:
|
|
|
|
|
|
if not evidence_items:
|
|
|
|
|
|
return False
|
|
|
|
|
|
if self._looks_like_structured_knowledge_query(question):
|
|
|
|
|
|
return True
|
|
|
|
|
|
return str(evidence_items[0].get("kind") or "") in {"table", "kv", "list", "clause"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_knowledge_answer_evidence(
    self,
    payload: UserAgentRequest,
) -> list[dict[str, Any]]:
    """Collect, rank and de-duplicate evidence segments for the request.

    Candidates are extracted from every dict hit chosen by
    ``_select_knowledge_model_hits``, then ordered by score (highest first)
    with shorter content preferred among equal scores.  After the first
    accepted item, candidates scoring below ``max(6.0, top_score - 14)`` are
    skipped.  Two candidates are duplicates when title, heading and the first
    180 characters of cleaned content all match.  At most
    ``MAX_KNOWLEDGE_DIRECT_EVIDENCE`` items are returned.
    """
    question = self._resolve_knowledge_question(payload)
    query_terms = self._extract_knowledge_query_terms(question)

    pool: list[dict[str, Any]] = []
    selected_hits = self._select_knowledge_model_hits(
        payload.tool_payload,
        question=question,
    )
    for hit in selected_hits:
        if isinstance(hit, dict):
            pool.extend(self._extract_knowledge_evidence_candidates(hit, query_terms))

    def rank_key(candidate: dict[str, Any]) -> tuple[float, int]:
        # Negated length so that, under reverse=True, shorter content sorts first.
        return (
            float(candidate.get("score") or 0),
            -len(str(candidate.get("content") or "")),
        )

    ranked = sorted(pool, key=rank_key, reverse=True)
    top_score = float(ranked[0].get("score") or 0) if ranked else 0.0
    cutoff = max(6.0, top_score - 14)

    kept: list[dict[str, Any]] = []
    fingerprints: set[tuple[str, str, str]] = set()
    for candidate in ranked:
        # The cutoff never applies to the first accepted item.
        if kept and float(candidate.get("score") or 0) < cutoff:
            continue
        fingerprint = (
            str(candidate.get("title") or "").strip(),
            str(candidate.get("heading") or "").strip(),
            self._clean_knowledge_segment_text(str(candidate.get("content") or ""))[:180],
        )
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        kept.append(candidate)
        if len(kept) >= MAX_KNOWLEDGE_DIRECT_EVIDENCE:
            break
    return kept
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_knowledge_evidence_candidates(
|
|
|
|
|
|
self,
|
|
|
|
|
|
hit: dict[str, Any],
|
|
|
|
|
|
query_terms: list[str],
|
|
|
|
|
|
) -> list[dict[str, Any]]:
|
|
|
|
|
|
title = str(hit.get("title") or hit.get("document_name") or "相关制度").strip()
|
|
|
|
|
|
content = str(hit.get("content") or "").strip()
|
|
|
|
|
|
if not content:
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
raw_candidates = self._merge_knowledge_lead_in_segments(
|
|
|
|
|
|
self._split_knowledge_hit_into_segments(content)
|
|
|
|
|
|
)
|
|
|
|
|
|
candidates: list[dict[str, Any]] = []
|
|
|
|
|
|
for item in raw_candidates:
|
|
|
|
|
|
score = self._score_knowledge_evidence_candidate(item, query_terms)
|
|
|
|
|
|
if query_terms and score <= 0:
|
|
|
|
|
|
continue
|
|
|
|
|
|
normalized = dict(item)
|
|
|
|
|
|
normalized["title"] = title
|
|
|
|
|
|
normalized["score"] = score
|
|
|
|
|
|
candidates.append(normalized)
|
|
|
|
|
|
|
|
|
|
|
|
if candidates:
|
|
|
|
|
|
return candidates
|
|
|
|
|
|
|
|
|
|
|
|
fallback_text = str(hit.get("excerpt") or "").strip() or self._extract_excerpt(content)
|
|
|
|
|
|
if not fallback_text:
|
|
|
|
|
|
return []
|
|
|
|
|
|
return [
|
|
|
|
|
|
{
|
|
|
|
|
|
"title": title,
|
|
|
|
|
|
"heading": "",
|
|
|
|
|
|
"kind": "paragraph",
|
|
|
|
|
|
"content": fallback_text,
|
|
|
|
|
|
"score": 1,
|
|
|
|
|
|
}
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _merge_knowledge_lead_in_segments(
|
|
|
|
|
|
self,
|
|
|
|
|
|
segments: list[dict[str, str]],
|
|
|
|
|
|
) -> list[dict[str, str]]:
|
|
|
|
|
|
if not segments:
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
merged: list[dict[str, str]] = []
|
|
|
|
|
|
index = 0
|
|
|
|
|
|
while index < len(segments):
|
|
|
|
|
|
current = dict(segments[index])
|
|
|
|
|
|
if not self._is_knowledge_lead_in_segment(current):
|
|
|
|
|
|
merged.append(current)
|
|
|
|
|
|
index += 1
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
base_heading = str(current.get("heading") or "").strip()
|
|
|
|
|
|
current_marker = self._extract_knowledge_marker_family(str(current.get("content") or ""))
|
|
|
|
|
|
follow_segments: list[dict[str, str]] = []
|
|
|
|
|
|
next_index = index + 1
|
|
|
|
|
|
|
|
|
|
|
|
while next_index < len(segments):
|
|
|
|
|
|
candidate = segments[next_index]
|
|
|
|
|
|
if str(candidate.get("heading") or "").strip() != base_heading:
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
candidate_kind = str(candidate.get("kind") or "").strip()
|
|
|
|
|
|
candidate_content = str(candidate.get("content") or "").strip()
|
|
|
|
|
|
candidate_marker = self._extract_knowledge_marker_family(candidate_content)
|
|
|
|
|
|
if not candidate_content or candidate_kind == "table":
|
|
|
|
|
|
break
|
|
|
|
|
|
if current_marker and candidate_marker == current_marker:
|
|
|
|
|
|
break
|
|
|
|
|
|
if self._is_knowledge_lead_in_segment(candidate) and follow_segments:
|
|
|
|
|
|
break
|
|
|
|
|
|
if candidate_kind not in {"list", "paragraph", "kv", "clause"}:
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
follow_segments.append(candidate)
|
|
|
|
|
|
next_index += 1
|
|
|
|
|
|
if len(follow_segments) >= 4:
|
|
|
|
|
|
break
|
|
|
|
|
|
if candidate_kind == "paragraph" and len(candidate_content) >= 200:
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
if follow_segments:
|
|
|
|
|
|
current["content"] = "\n".join(
|
|
|
|
|
|
[str(current.get("content") or "").strip()]
|
|
|
|
|
|
+ [str(item.get("content") or "").strip() for item in follow_segments]
|
|
|
|
|
|
)
|
|
|
|
|
|
if any(str(item.get("kind") or "").strip() == "list" for item in follow_segments):
|
|
|
|
|
|
current["kind"] = "list"
|
|
|
|
|
|
merged.append(current)
|
|
|
|
|
|
index = next_index
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
merged.append(current)
|
|
|
|
|
|
index += 1
|
|
|
|
|
|
|
|
|
|
|
|
return merged
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _split_knowledge_hit_into_segments(self, content: str) -> list[dict[str, str]]:
|
|
|
|
|
|
segments: list[dict[str, str]] = []
|
|
|
|
|
|
markdown_headings: list[str] = []
|
|
|
|
|
|
section_heading = ""
|
|
|
|
|
|
paragraph_lines: list[str] = []
|
|
|
|
|
|
table_lines: list[str] = []
|
|
|
|
|
|
|
|
|
|
|
|
def current_heading() -> str:
|
|
|
|
|
|
heading_parts = [item for item in markdown_headings if item]
|
|
|
|
|
|
if section_heading:
|
|
|
|
|
|
heading_parts.append(section_heading)
|
|
|
|
|
|
return " > ".join(heading_parts)
|
|
|
|
|
|
|
|
|
|
|
|
def flush_paragraph() -> None:
|
|
|
|
|
|
nonlocal paragraph_lines
|
|
|
|
|
|
if not paragraph_lines:
|
|
|
|
|
|
return
|
|
|
|
|
|
merged = " ".join(line.strip() for line in paragraph_lines if line.strip()).strip()
|
|
|
|
|
|
paragraph_lines = []
|
|
|
|
|
|
if merged:
|
|
|
|
|
|
segments.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"heading": current_heading(),
|
|
|
|
|
|
"kind": "paragraph",
|
|
|
|
|
|
"content": merged,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def flush_table() -> None:
|
|
|
|
|
|
nonlocal table_lines
|
|
|
|
|
|
if not table_lines:
|
|
|
|
|
|
return
|
|
|
|
|
|
merged = "\n".join(line.rstrip() for line in table_lines if line.strip()).strip()
|
|
|
|
|
|
table_lines = []
|
|
|
|
|
|
if merged:
|
|
|
|
|
|
segments.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"heading": current_heading(),
|
|
|
|
|
|
"kind": "table",
|
|
|
|
|
|
"content": merged,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
for raw_line in str(content or "").replace("\r\n", "\n").replace("\r", "\n").splitlines():
|
|
|
|
|
|
line = raw_line.rstrip()
|
|
|
|
|
|
stripped = line.strip()
|
|
|
|
|
|
|
|
|
|
|
|
if not stripped:
|
|
|
|
|
|
flush_paragraph()
|
|
|
|
|
|
flush_table()
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
markdown_heading_match = re.match(r"^(#{1,6})\s+(.+)$", stripped)
|
|
|
|
|
|
if markdown_heading_match:
|
|
|
|
|
|
flush_paragraph()
|
|
|
|
|
|
flush_table()
|
|
|
|
|
|
level = len(markdown_heading_match.group(1))
|
|
|
|
|
|
heading_text = markdown_heading_match.group(2).strip()
|
|
|
|
|
|
markdown_headings = markdown_headings[: max(0, level - 1)]
|
|
|
|
|
|
markdown_headings.append(heading_text)
|
|
|
|
|
|
section_heading = ""
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
if KNOWLEDGE_SECTION_HEADING_PATTERN.match(stripped) and len(stripped) <= 90:
|
|
|
|
|
|
flush_paragraph()
|
|
|
|
|
|
flush_table()
|
|
|
|
|
|
section_heading = stripped.lstrip("#").strip()
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
if stripped.count("|") >= 2 and "|" in stripped:
|
|
|
|
|
|
flush_paragraph()
|
|
|
|
|
|
table_lines.append(stripped)
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
flush_table()
|
|
|
|
|
|
|
|
|
|
|
|
if KNOWLEDGE_LIST_ITEM_PATTERN.match(stripped):
|
|
|
|
|
|
flush_paragraph()
|
|
|
|
|
|
segments.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"heading": current_heading(),
|
|
|
|
|
|
"kind": "list",
|
|
|
|
|
|
"content": stripped,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
if KNOWLEDGE_NUMBERED_ITEM_PATTERN.match(stripped):
|
|
|
|
|
|
flush_paragraph()
|
|
|
|
|
|
segments.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"heading": current_heading(),
|
|
|
|
|
|
"kind": "list",
|
|
|
|
|
|
"content": stripped,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
if KNOWLEDGE_ARTICLE_PATTERN.match(stripped):
|
|
|
|
|
|
flush_paragraph()
|
|
|
|
|
|
segments.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"heading": current_heading(),
|
|
|
|
|
|
"kind": "clause",
|
|
|
|
|
|
"content": stripped,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
if (":" in stripped or ":" in stripped) and len(stripped) <= 180:
|
|
|
|
|
|
flush_paragraph()
|
|
|
|
|
|
segments.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"heading": current_heading(),
|
|
|
|
|
|
"kind": "kv",
|
|
|
|
|
|
"content": stripped,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
paragraph_lines.append(stripped)
|
|
|
|
|
|
|
|
|
|
|
|
flush_paragraph()
|
|
|
|
|
|
flush_table()
|
|
|
|
|
|
return segments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _render_knowledge_evidence_text(self, item: dict[str, Any]) -> str:
    """Render an evidence item's content as display text.

    The content is split into cleaned lines with their markers preserved.
    No lines yields ``""``; a single line is returned after segment-text
    cleaning; several lines are returned newline-joined, each prefixed with
    one space.
    """
    raw_content = str(item.get("content") or "")
    cleaned_lines = self._split_clean_knowledge_lines(
        raw_content,
        preserve_marker=True,
    )
    if not cleaned_lines:
        return ""
    if len(cleaned_lines) == 1:
        return self._clean_knowledge_segment_text(cleaned_lines[0])
    indented = [f" {entry}" for entry in cleaned_lines]
    return "\n".join(indented)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _collect_direct_knowledge_answer_lines(
|
|
|
|
|
|
self,
|
|
|
|
|
|
ordered_evidence_items: list[dict[str, Any]],
|
2026-05-27 17:31:27 +08:00
|
|
|
|
*,
|
|
|
|
|
|
query_terms: list[str] | None = None,
|
2026-05-22 10:42:31 +08:00
|
|
|
|
) -> list[str]:
|
|
|
|
|
|
if not ordered_evidence_items:
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
primary_item = ordered_evidence_items[0]
|
|
|
|
|
|
primary_title = str(primary_item.get("title") or "").strip()
|
|
|
|
|
|
primary_heading = str(primary_item.get("heading") or "").strip()
|
|
|
|
|
|
primary_kind = str(primary_item.get("kind") or "").strip()
|
|
|
|
|
|
|
|
|
|
|
|
related_items = [primary_item]
|
|
|
|
|
|
if primary_kind != "table":
|
|
|
|
|
|
for item in ordered_evidence_items[1:]:
|
|
|
|
|
|
if len(related_items) >= 3:
|
|
|
|
|
|
break
|
|
|
|
|
|
if str(item.get("kind") or "").strip() != primary_kind:
|
|
|
|
|
|
continue
|
|
|
|
|
|
if str(item.get("title") or "").strip() != primary_title:
|
|
|
|
|
|
continue
|
|
|
|
|
|
if str(item.get("heading") or "").strip() != primary_heading:
|
|
|
|
|
|
continue
|
|
|
|
|
|
related_items.append(item)
|
|
|
|
|
|
|
|
|
|
|
|
lines: list[str] = []
|
|
|
|
|
|
seen: set[str] = set()
|
|
|
|
|
|
for item in related_items:
|
2026-05-27 17:31:27 +08:00
|
|
|
|
item_kind = str(item.get("kind") or "").strip()
|
|
|
|
|
|
item_content = str(item.get("content") or "")
|
|
|
|
|
|
if item_kind == "paragraph" or self._has_inline_numbered_knowledge_items(item_content):
|
|
|
|
|
|
rendered = self._focus_knowledge_segment_content(
|
|
|
|
|
|
item_content,
|
|
|
|
|
|
query_terms or [],
|
|
|
|
|
|
)
|
|
|
|
|
|
rendered_lines = self._split_inline_numbered_knowledge_items(rendered)
|
|
|
|
|
|
else:
|
|
|
|
|
|
rendered = self._render_knowledge_evidence_text(item)
|
|
|
|
|
|
rendered_lines = rendered.splitlines()
|
|
|
|
|
|
for line in rendered_lines:
|
2026-05-22 10:42:31 +08:00
|
|
|
|
normalized = str(line or "").strip()
|
|
|
|
|
|
if not normalized or normalized in seen:
|
|
|
|
|
|
continue
|
|
|
|
|
|
seen.add(normalized)
|
|
|
|
|
|
lines.append(line)
|
|
|
|
|
|
return lines
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _summarize_knowledge_evidence_content(
    self,
    item: dict[str, Any],
    query_terms: list[str],
) -> str:
    """Produce a one-line summary for a single evidence item.

    For a table, one of two fixed sentences is returned, chosen by whether
    the relevant preview has at least three non-blank rows.  For other
    kinds, the first four cleaned lines are joined with spaces and cleaned;
    with fewer than two lines the whole content is cleaned instead.
    """
    kind = str(item.get("kind") or "").strip()
    content = str(item.get("content") or "").strip()

    if kind == "table":
        preview = self._extract_relevant_table_preview(content, query_terms)
        row_count = sum(1 for row in preview.splitlines() if row.strip())
        if row_count >= 3:
            return "当前命中的直接依据是一张与问题强相关的标准表,已摘出最相关的表头和行。"
        return "当前命中的直接依据是一张与问题强相关的标准表。"

    cleaned_lines = self._split_clean_knowledge_lines(content, preserve_marker=True)
    if len(cleaned_lines) < 2:
        return self._clean_knowledge_segment_text(content)
    head = cleaned_lines[0]
    tail = " ".join(cleaned_lines[1:4])
    return self._clean_knowledge_segment_text(f"{head} {tail}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_missing_location_grounding_note(
    self,
    question: str,
    evidence_items: list[dict[str, Any]],
) -> str:
    """Return a caveat when the asked-about location is absent from evidence.

    The location is extracted from the question.  If there is none, or if it
    appears anywhere in the headings or contents of ``evidence_items``, the
    result is ``""``.  Otherwise a fixed note naming the location is
    returned.
    """
    location = self._extract_query_location(question)
    if not location:
        return ""

    fragments: list[str] = []
    for entry in evidence_items:
        heading = str(entry.get("heading") or "")
        content = str(entry.get("content") or "")
        fragments.append(heading + "\n" + content)
    searchable = "\n".join(fragments)
    if location in searchable:
        return ""

    note_head = f"当前命中的制度依据没有直接写出“{location}”对应的地区档位或映射关系,"
    note_tail = "因此不能直接把它套用到表格中的某一列。"
    return note_head + note_tail
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_knowledge_search_answer(
    self,
    payload: UserAgentRequest,
    citations: list[UserAgentCitation],
) -> str:
    """Build the markdown answer for a knowledge search result.

    The answer is an optional greeting followed by a conclusion section, an
    evidence section when evidence exists, and a note section.  Three
    outcomes are possible:

    * no dict hits in the tool payload - a "no usable hit" conclusion;
    * hits but no renderable evidence - a "not specific enough" conclusion;
    * otherwise - up to three evidence bullets (tables as a relevant
      preview, other kinds as rendered text).  When the evidence items yield
      no bullets, excerpts from the first two hits are used instead.

    Args:
        payload: The agent request; reads ``tool_payload["hits"]`` and
            ``context_json["name"]``.
        citations: Citations; only the first one's title is read.

    Returns:
        The stripped answer text.
    """
    raw_hits = list(payload.tool_payload.get("hits") or [])
    hits = [hit for hit in raw_hits if isinstance(hit, dict)]
    evidence_items = self._build_knowledge_answer_evidence(payload)
    lead_citation = citations[0] if citations else None
    title = str(
        (lead_citation.title if lead_citation else "")
        or (hits[0].get("title") if hits else "")
        or "相关制度"
    ).strip()
    user_name = str(payload.context_json.get("name") or "").strip()

    answer_lines: list[str] = [f"{user_name},您好。"] if user_name else []

    def finish(conclusion: str, note: str, evidence: list[str] | None = None) -> str:
        # Shared tail for all three exits: conclusion, optional evidence, note.
        self._append_markdown_section(answer_lines, "结论", [conclusion])
        if evidence:
            self._append_markdown_section(answer_lines, "依据", evidence)
        self._append_markdown_section(answer_lines, "说明", [note])
        return "\n".join(answer_lines).strip()

    if not hits:
        return finish(
            f"当前没有拿到可用于回答这个问题的《{title}》知识库命中。",
            "- 我不会用相似主题或外部常识硬凑答案;请补充更具体的关键词后再试一次。",
        )

    row_hint_marker = "表格行级检索线索"
    evidence_lines: list[str] = []
    for entry in evidence_items[:3]:
        heading = str(entry.get("heading") or "").strip()
        if row_hint_marker in heading:
            # Drop the marker and any path separators left at the ends.
            heading = heading.replace(row_hint_marker, "").strip(" >")
        heading_text = f"({heading})" if heading else ""
        entry_title = entry.get("title") or title
        bullet = f"- **《{entry_title}》** {heading_text}"

        if str(entry.get("kind") or "") == "table":
            preview = self._extract_relevant_table_preview(
                str(entry.get("content") or ""),
                self._extract_knowledge_query_terms(self._resolve_knowledge_question(payload)),
            )
            evidence_lines.append(f"{bullet}\n{preview}")
            continue

        rendered = self._render_knowledge_evidence_text(entry)
        if not rendered:
            continue
        # Single-line text gets a one-space indent on its continuation line.
        joiner = "\n" if "\n" in rendered else "\n "
        evidence_lines.append(f"{bullet}{joiner}{rendered}")

    if not evidence_lines:
        for hit in hits[:2]:
            hit_title = str(hit.get("title") or hit.get("document_name") or "相关制度").strip()
            excerpt = (
                str(hit.get("excerpt") or "").strip()
                or self._extract_excerpt(str(hit.get("content") or ""))
            )
            if excerpt:
                evidence_lines.append(f"- **《{hit_title}》**:{excerpt}")

    if not evidence_lines:
        return finish(
            f"当前《{title}》里可用于回答这个问题的关键条款还不够明确。",
            "- 请补充费用类型、适用地区、职级或具体业务场景,我再继续帮你缩小范围。",
        )

    return finish(
        "我先根据当前制度依据给出可以确认的部分。",
        "- 以上只使用当前命中的知识库证据;没有在证据中出现的适用条件或金额,我不会替你默认补齐。",
        evidence_lines,
    )
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|