"""Knowledge-search answer assembly for the user agent.

This module turns knowledge-search tool payloads into model-facing payloads,
evidence blocks and deterministic Markdown answers.  Many helpers used here
(`_select_knowledge_model_hits`, `_extract_knowledge_query_terms`, ...) are not
defined in this module; presumably they come from
``UserAgentKnowledgeHelpersMixin`` -- verify there before changing call sites.
"""

from __future__ import annotations

import re
from typing import Any

from app.schemas.user_agent import UserAgentCitation, UserAgentRequest
from app.services.user_agent_knowledge_helpers import UserAgentKnowledgeHelpersMixin
from app.services.user_agent_knowledge_constants import (
    KNOWLEDGE_ARTICLE_PATTERN,
    KNOWLEDGE_DIRECT_ANSWER_HINTS,
    KNOWLEDGE_LIST_ITEM_PATTERN,
    KNOWLEDGE_NUMBERED_ITEM_PATTERN,
    KNOWLEDGE_QUERY_STOPWORDS,
    KNOWLEDGE_SECTION_HEADING_PATTERN,
    MAX_KNOWLEDGE_DIRECT_EVIDENCE,
    MAX_KNOWLEDGE_MODEL_HITS,
    MAX_KNOWLEDGE_QUERY_TERMS,
)

# Full-width (CJK) punctuation is written with escapes so the characters
# survive tooling that folds them into their ASCII look-alikes.
_FULLWIDTH_COLON = "\uff1a"
_FULLWIDTH_COMMA = "\uff0c"
_FULLWIDTH_TILDE = "\uff5e"

# Trailing question/sentence punctuation, ASCII and full-width
# (\uff1f = full-width question mark, \uff01 = full-width exclamation mark).
_TRAILING_QUESTION_PUNCTUATION = re.compile(r"[?\uff1f。.!\uff01]+$")

# Characters trimmed from both ends of a subject derived from the question.
_SUBJECT_STRIP_CHARS = ":" + _FULLWIDTH_COLON + "," + _FULLWIDTH_COMMA + "。."

# Separators used when spelling grade ranges such as "P3~P6".
_GRADE_RANGE_SEPARATORS = ("~", _FULLWIDTH_TILDE, "-", "至")


class UserAgentKnowledgeMixin(UserAgentKnowledgeHelpersMixin):
    """Mixin that builds answers and model payloads from knowledge-search hits."""

    @staticmethod
    def _build_model_tool_payload(
        tool_payload: dict[str, Any],
        *,
        question: str | None = None,
    ) -> dict[str, Any]:
        """Return a shallow copy of ``tool_payload`` with trimmed ``hits``.

        Each selected hit is reduced to a fixed set of string fields; ``content``
        is capped at 1200 characters, ``tags`` at 5 entries and ``evidence`` at
        3 entries.  Non-dict hits are dropped.
        """
        normalized = dict(tool_payload or {})
        hits = []
        for item in UserAgentKnowledgeMixin._select_knowledge_model_hits(
            tool_payload,
            question=question,
        ):
            if not isinstance(item, dict):
                continue
            hits.append(
                {
                    "title": str(item.get("title") or "").strip(),
                    "document_name": str(item.get("document_name") or "").strip(),
                    "excerpt": str(item.get("excerpt") or "").strip(),
                    "content": str(item.get("content") or "").strip()[:1200],
                    "tags": list(item.get("tags") or [])[:5],
                    "evidence": list(item.get("evidence") or [])[:3],
                    "code": str(item.get("code") or "").strip(),
                }
            )
        normalized["hits"] = hits
        return normalized

    @staticmethod
    def _build_knowledge_evidence_blocks(
        tool_payload: dict[str, Any],
        *,
        question: str | None = None,
    ) -> str:
        """Render up to three selected hits as fenced text evidence blocks.

        Hits without content are skipped, so evidence numbers may have gaps:
        the number reflects the position among the selected hits.
        """
        blocks: list[str] = []
        for index, item in enumerate(
            UserAgentKnowledgeMixin._select_knowledge_model_hits(
                tool_payload,
                question=question,
            )[:3],
            start=1,
        ):
            if not isinstance(item, dict):
                continue
            title = str(item.get("title") or item.get("document_name") or f"证据 {index}").strip()
            code = str(item.get("code") or "").strip()
            content = str(item.get("content") or "").strip()
            if not content:
                continue
            blocks.append(
                "\n".join(
                    [
                        f"[证据 {index}] {title}" + (f" ({code})" if code else ""),
                        "```text",
                        content[:1200],
                        "```",
                    ]
                )
            )
        return "\n\n".join(blocks)

    def _build_fast_knowledge_answer(
        self,
        payload: UserAgentRequest,
        *,
        citations: list[UserAgentCitation],
    ) -> str | None:
        """Return a direct answer for knowledge-search results, or ``None``.

        ``None`` is returned when the scenario is not ``"knowledge"``, the tool
        result is not a ``"knowledge_search"``, no evidence could be extracted,
        or the direct-answer heuristic declines.
        """
        if payload.ontology.scenario != "knowledge":
            return None
        tool_payload = payload.tool_payload or {}
        if str(tool_payload.get("result_type") or "").strip() != "knowledge_search":
            return None
        evidence_items = self._build_knowledge_answer_evidence(payload)
        if not evidence_items:
            return None
        question = self._resolve_knowledge_question(payload)
        if not self._should_use_direct_knowledge_answer(question, evidence_items):
            return None
        return self._render_knowledge_direct_answer(
            payload,
            citations=citations,
            evidence_items=evidence_items,
        )

    def _render_knowledge_direct_answer(
        self,
        payload: UserAgentRequest,
        *,
        citations: list[UserAgentCitation],
        evidence_items: list[dict[str, Any]],
    ) -> str | None:
        """Render the Markdown answer (conclusion / evidence / notes sections).

        Returns ``None`` when ``evidence_items`` is empty.  The title comes
        from the first citation, then the first evidence item, then a generic
        fallback.
        """
        if not evidence_items:
            return None
        title = str(
            (citations[0].title if citations else "")
            or evidence_items[0].get("title")
            or "相关制度"
        ).strip()
        context = payload.context_json or {}
        user_name = str(context.get("name") or "").strip()
        question = self._resolve_knowledge_question(payload)
        query_terms = self._extract_knowledge_query_terms(question)
        ordered_evidence_items = self._prioritize_knowledge_evidence_items(question, evidence_items)
        primary_item = ordered_evidence_items[0]
        primary_heading = self._format_knowledge_heading_label(
            str(primary_item.get("heading") or "").strip()
        )
        primary_lines = self._collect_direct_knowledge_answer_lines(
            ordered_evidence_items,
            query_terms=query_terms,
        )

        lines: list[str] = []
        if user_name:
            lines.append(f"{user_name},您好。")
        source_prefix = f"根据《{title}》"
        if primary_heading:
            source_prefix = f"{source_prefix}({primary_heading})"

        conclusion_lines: list[str] = []
        evidence_lines: list[str] = []
        if str(primary_item.get("kind") or "") == "table":
            table_content = str(primary_item.get("content") or "")
            if self._question_requests_broad_knowledge_table(question):
                # Broad questions get the whole table rather than a filtered preview.
                table_preview = table_content.strip()
            else:
                table_preview = self._extract_relevant_table_preview(
                    table_content,
                    query_terms,
                    preferred_terms=self._build_knowledge_table_preferred_terms(payload),
                )
            table_summary = self._summarize_knowledge_table_preview(table_preview)
            conclusion_lines.append(f"{source_prefix},{table_summary}")
            evidence_lines.append(table_preview)
        elif not primary_lines:
            # No renderable lines: fall back to a summary; no evidence section.
            summary = self._summarize_knowledge_evidence_content(primary_item, query_terms)
            conclusion_lines.append(
                f"{source_prefix},当前能直接确认的是:"
                f"{summary}"
            )
        elif len(primary_lines) == 1:
            conclusion_lines.append(f"{source_prefix},当前能直接确认的是:{primary_lines[0].strip()}")
            evidence_lines.extend(primary_lines)
        else:
            subject = self._build_knowledge_answer_subject(question, primary_heading)
            summary = self._summarize_knowledge_lines_conclusion(
                primary_lines,
                heading=subject,
            )
            if summary:
                conclusion_lines.append(f"{source_prefix},{summary}")
            else:
                conclusion_lines.append(f"{source_prefix},当前能直接确认的是:")
            evidence_lines.extend(primary_lines)

        notes: list[str] = []
        location_note = self._build_missing_location_grounding_note(question, evidence_items)
        if location_note:
            notes.append(location_note)
        if self._question_requires_explicit_condition(
            question
        ) and not self._answer_evidence_has_numeric_or_condition(evidence_items):
            notes.append("当前命中的证据更偏规则说明或流程约束,还没有直接给出可立即套用的数值或完整条件。")

        self._append_markdown_section(lines, "结论", conclusion_lines)
        self._append_markdown_section(lines, "依据", evidence_lines)
        if notes:
            self._append_markdown_section(lines, "说明", [f"- {note}" for note in notes])
        return "\n".join(line for line in lines if line is not None).strip()

    @staticmethod
    def _append_markdown_section(lines: list[str], title: str, body_lines: list[str]) -> None:
        """Append a ``## title`` section to ``lines`` in place.

        Blank body lines are dropped and trailing whitespace is trimmed; nothing
        is appended when no body line remains.  A blank separator line is
        inserted first unless ``lines`` is empty or already ends with one.
        """
        cleaned = [str(line or "").rstrip() for line in body_lines if str(line or "").strip()]
        if not cleaned:
            return
        if lines and lines[-1] != "":
            lines.append("")
        lines.append(f"## {title}")
        lines.append("")
        lines.extend(cleaned)

    @staticmethod
    def _build_knowledge_answer_subject(question: str, heading: str = "") -> str:
        """Pick the subject used in a conclusion sentence.

        The heading wins unless it is empty or contains one of the synthetic
        section markers; otherwise the subject is derived from the question by
        removing whitespace, trailing punctuation and a trailing question word.
        """
        clean_heading = str(heading or "").strip()
        if clean_heading and not any(
            marker in clean_heading
            for marker in ("问答线索补充", "结构化表格补充", "重点章节摘录", "章节导航")
        ):
            return clean_heading
        normalized = re.sub(r"\s+", "", str(question or "").strip())
        # Accept both ASCII and full-width trailing punctuation; users typing
        # Chinese normally end questions with the full-width forms.
        normalized = _TRAILING_QUESTION_PUNCTUATION.sub("", normalized)
        normalized = re.sub(r"(是什么|有哪些|是多少|如何|怎么|吗|呢)$", "", normalized)
        return normalized.strip(_SUBJECT_STRIP_CHARS)

    @staticmethod
    def _build_knowledge_table_preferred_terms(payload: UserAgentRequest) -> list[str]:
        """Collect context values that should be preferred when picking table rows.

        Includes the raw grade/position style fields from ``context_json`` and,
        when ``grade`` looks like ``P<number>``, every ``P<start><sep>P<end>``
        range (within four grades either side, capped at P12) that contains it.
        """
        terms: list[str] = []
        context = payload.context_json or {}
        for key in ("grade", "position", "job_grade", "rank", "level"):
            value = str(context.get(key) or "").strip()
            if value and value not in terms:
                terms.append(value)
        grade_match = re.fullmatch(r"[Pp](\d{1,2})", str(context.get("grade") or "").strip())
        if grade_match:
            grade = int(grade_match.group(1))
            for start in range(max(0, grade - 4), grade + 1):
                for end in range(grade, min(12, grade + 4) + 1):
                    if start >= end:
                        continue
                    for separator in _GRADE_RANGE_SEPARATORS:
                        term = f"P{start}{separator}P{end}"
                        if term not in terms:
                            terms.append(term)
        return terms

    @staticmethod
    def _resolve_knowledge_question(payload: UserAgentRequest) -> str:
        """Return the user's question: ``user_input_text`` from context, else the message."""
        context = payload.context_json or {}
        return str(context.get("user_input_text") or payload.message or "").strip()

    @staticmethod
    def _looks_like_structured_knowledge_query(question: str) -> bool:
        """Return True when the question contains any direct-answer hint keyword."""
        normalized = str(question or "").strip()
        if not normalized:
            return False
        return any(keyword in normalized for keyword in KNOWLEDGE_DIRECT_ANSWER_HINTS)

    def _should_use_direct_knowledge_answer(
        self,
        question: str,
        evidence_items: list[dict[str, Any]],
    ) -> bool:
        """Decide whether the deterministic direct answer should be used.

        True when the question matches a hint keyword, or when the top evidence
        item is a table, key/value line, list or clause.
        """
        if not evidence_items:
            return False
        if self._looks_like_structured_knowledge_query(question):
            return True
        return str(evidence_items[0].get("kind") or "") in {"table", "kv", "list", "clause"}

    def _build_knowledge_answer_evidence(
        self,
        payload: UserAgentRequest,
    ) -> list[dict[str, Any]]:
        """Extract, rank and de-duplicate evidence segments for the question.

        Candidates are ordered by score (descending), shorter content first on
        ties.  After the first kept item, candidates scoring below
        ``max(6.0, top_score - 14)`` are dropped.  At most
        ``MAX_KNOWLEDGE_DIRECT_EVIDENCE`` items are returned.
        """
        question = self._resolve_knowledge_question(payload)
        query_terms = self._extract_knowledge_query_terms(question)
        candidates: list[dict[str, Any]] = []
        for hit in self._select_knowledge_model_hits(
            payload.tool_payload,
            question=question,
        ):
            if not isinstance(hit, dict):
                continue
            candidates.extend(self._extract_knowledge_evidence_candidates(hit, query_terms))

        deduped: list[dict[str, Any]] = []
        seen: set[tuple[str, str, str]] = set()
        ranked_candidates = sorted(
            candidates,
            key=lambda value: (
                float(value.get("score") or 0),
                -len(str(value.get("content") or "")),
            ),
            reverse=True,
        )
        top_score = float(ranked_candidates[0].get("score") or 0) if ranked_candidates else 0.0
        for item in ranked_candidates:
            score = float(item.get("score") or 0)
            # The best candidate is always kept; the cut-off only applies after it.
            if deduped and score < max(6.0, top_score - 14):
                continue
            key = (
                str(item.get("title") or "").strip(),
                str(item.get("heading") or "").strip(),
                self._clean_knowledge_segment_text(str(item.get("content") or ""))[:180],
            )
            if key in seen:
                continue
            seen.add(key)
            deduped.append(item)
            if len(deduped) >= MAX_KNOWLEDGE_DIRECT_EVIDENCE:
                break
        return deduped

    def _extract_knowledge_evidence_candidates(
        self,
        hit: dict[str, Any],
        query_terms: list[str],
    ) -> list[dict[str, Any]]:
        """Split one hit into scored evidence segments.

        Segments scoring ``<= 0`` are discarded when query terms exist.  If no
        segment survives, a single paragraph built from the hit excerpt (or an
        excerpt of its content) is returned with score 1.
        """
        title = str(hit.get("title") or hit.get("document_name") or "相关制度").strip()
        content = str(hit.get("content") or "").strip()
        if not content:
            return []
        raw_candidates = self._merge_knowledge_lead_in_segments(
            self._split_knowledge_hit_into_segments(content)
        )
        candidates: list[dict[str, Any]] = []
        for item in raw_candidates:
            score = self._score_knowledge_evidence_candidate(item, query_terms)
            if query_terms and score <= 0:
                continue
            normalized = dict(item)
            normalized["title"] = title
            normalized["score"] = score
            candidates.append(normalized)
        if candidates:
            return candidates
        fallback_text = str(hit.get("excerpt") or "").strip() or self._extract_excerpt(content)
        if not fallback_text:
            return []
        return [
            {
                "title": title,
                "heading": "",
                "kind": "paragraph",
                "content": fallback_text,
                "score": 1,
            }
        ]

    def _merge_knowledge_lead_in_segments(
        self,
        segments: list[dict[str, str]],
    ) -> list[dict[str, str]]:
        """Fold the segments that follow a lead-in segment into that lead-in.

        A lead-in absorbs up to four following segments under the same heading.
        Absorption stops at an empty segment, a table, a segment with the same
        marker family as the lead-in, another lead-in (once something has been
        absorbed), an unsupported kind, or right after a paragraph of 200+
        characters.  The merged segment becomes a ``list`` if any absorbed
        segment is a list.  Input dicts are not mutated.
        """
        if not segments:
            return []
        merged: list[dict[str, str]] = []
        index = 0
        while index < len(segments):
            current = dict(segments[index])
            if not self._is_knowledge_lead_in_segment(current):
                merged.append(current)
                index += 1
                continue
            base_heading = str(current.get("heading") or "").strip()
            current_marker = self._extract_knowledge_marker_family(str(current.get("content") or ""))
            follow_segments: list[dict[str, str]] = []
            next_index = index + 1
            while next_index < len(segments):
                candidate = segments[next_index]
                if str(candidate.get("heading") or "").strip() != base_heading:
                    break
                candidate_kind = str(candidate.get("kind") or "").strip()
                candidate_content = str(candidate.get("content") or "").strip()
                candidate_marker = self._extract_knowledge_marker_family(candidate_content)
                if not candidate_content or candidate_kind == "table":
                    break
                if current_marker and candidate_marker == current_marker:
                    break
                if self._is_knowledge_lead_in_segment(candidate) and follow_segments:
                    break
                if candidate_kind not in {"list", "paragraph", "kv", "clause"}:
                    break
                follow_segments.append(candidate)
                next_index += 1
                if len(follow_segments) >= 4:
                    break
                if candidate_kind == "paragraph" and len(candidate_content) >= 200:
                    break
            if follow_segments:
                current["content"] = "\n".join(
                    [str(current.get("content") or "").strip()]
                    + [str(item.get("content") or "").strip() for item in follow_segments]
                )
                if any(str(item.get("kind") or "").strip() == "list" for item in follow_segments):
                    current["kind"] = "list"
                merged.append(current)
                # Skip everything that was absorbed into the lead-in.
                index = next_index
                continue
            merged.append(current)
            index += 1
        return merged

    def _split_knowledge_hit_into_segments(self, content: str) -> list[dict[str, str]]:
        """Split hit content into ``{"heading", "kind", "content"}`` segments.

        Kinds produced: ``paragraph`` (consecutive plain lines joined with a
        space), ``table`` (consecutive lines with at least two ``|``), ``list``,
        ``clause`` and ``kv`` (a line of at most 180 characters containing an
        ASCII or full-width colon).  Headings come from Markdown ``#`` lines and
        from lines matching ``KNOWLEDGE_SECTION_HEADING_PATTERN`` (<= 90 chars),
        joined with `` > ``.
        """
        segments: list[dict[str, str]] = []
        markdown_headings: list[str] = []
        section_heading = ""
        paragraph_lines: list[str] = []
        table_lines: list[str] = []

        def current_heading() -> str:
            heading_parts = [item for item in markdown_headings if item]
            if section_heading:
                heading_parts.append(section_heading)
            return " > ".join(heading_parts)

        def emit(kind: str, text: str) -> None:
            segments.append(
                {
                    "heading": current_heading(),
                    "kind": kind,
                    "content": text,
                }
            )

        def flush_paragraph() -> None:
            nonlocal paragraph_lines
            if not paragraph_lines:
                return
            merged = " ".join(line.strip() for line in paragraph_lines if line.strip()).strip()
            paragraph_lines = []
            if merged:
                emit("paragraph", merged)

        def flush_table() -> None:
            nonlocal table_lines
            if not table_lines:
                return
            merged = "\n".join(line.rstrip() for line in table_lines if line.strip()).strip()
            table_lines = []
            if merged:
                emit("table", merged)

        normalized_content = str(content or "").replace("\r\n", "\n").replace("\r", "\n")
        for raw_line in normalized_content.splitlines():
            stripped = raw_line.strip()
            if not stripped:
                flush_paragraph()
                flush_table()
                continue
            markdown_heading_match = re.match(r"^(#{1,6})\s+(.+)$", stripped)
            if markdown_heading_match:
                flush_paragraph()
                flush_table()
                level = len(markdown_heading_match.group(1))
                heading_text = markdown_heading_match.group(2).strip()
                # Drop headings at this level or deeper before pushing the new one.
                markdown_headings = markdown_headings[: max(0, level - 1)]
                markdown_headings.append(heading_text)
                section_heading = ""
                continue
            if KNOWLEDGE_SECTION_HEADING_PATTERN.match(stripped) and len(stripped) <= 90:
                flush_paragraph()
                flush_table()
                section_heading = stripped.lstrip("#").strip()
                continue
            if stripped.count("|") >= 2:
                flush_paragraph()
                table_lines.append(stripped)
                continue
            flush_table()
            if KNOWLEDGE_LIST_ITEM_PATTERN.match(stripped) or KNOWLEDGE_NUMBERED_ITEM_PATTERN.match(
                stripped
            ):
                flush_paragraph()
                emit("list", stripped)
                continue
            if KNOWLEDGE_ARTICLE_PATTERN.match(stripped):
                flush_paragraph()
                emit("clause", stripped)
                continue
            if (":" in stripped or _FULLWIDTH_COLON in stripped) and len(stripped) <= 180:
                flush_paragraph()
                emit("kv", stripped)
                continue
            paragraph_lines.append(stripped)
        flush_paragraph()
        flush_table()
        return segments

    def _render_knowledge_evidence_text(self, item: dict[str, Any]) -> str:
        """Render an evidence item as one cleaned line or several indented lines."""
        lines = self._split_clean_knowledge_lines(
            str(item.get("content") or ""),
            preserve_marker=True,
        )
        if not lines:
            return ""
        if len(lines) == 1:
            return self._clean_knowledge_segment_text(lines[0])
        return "\n".join(f" {line}" for line in lines)

    def _collect_direct_knowledge_answer_lines(
        self,
        ordered_evidence_items: list[dict[str, Any]],
        *,
        query_terms: list[str] | None = None,
    ) -> list[str]:
        """Collect de-duplicated answer lines for the primary evidence item.

        For non-table primaries, up to two more items sharing the primary's
        kind, title and heading are included.  Lines are compared after
        stripping but appended as rendered, so indentation is kept.
        """
        if not ordered_evidence_items:
            return []
        primary_item = ordered_evidence_items[0]
        primary_title = str(primary_item.get("title") or "").strip()
        primary_heading = str(primary_item.get("heading") or "").strip()
        primary_kind = str(primary_item.get("kind") or "").strip()
        related_items = [primary_item]
        if primary_kind != "table":
            for item in ordered_evidence_items[1:]:
                if len(related_items) >= 3:
                    break
                if str(item.get("kind") or "").strip() != primary_kind:
                    continue
                if str(item.get("title") or "").strip() != primary_title:
                    continue
                if str(item.get("heading") or "").strip() != primary_heading:
                    continue
                related_items.append(item)

        lines: list[str] = []
        seen: set[str] = set()
        for item in related_items:
            item_kind = str(item.get("kind") or "").strip()
            item_content = str(item.get("content") or "")
            if item_kind == "paragraph" or self._has_inline_numbered_knowledge_items(item_content):
                rendered = self._focus_knowledge_segment_content(
                    item_content,
                    query_terms or [],
                )
                rendered_lines = self._split_inline_numbered_knowledge_items(rendered)
            else:
                rendered = self._render_knowledge_evidence_text(item)
                rendered_lines = rendered.splitlines()
            for line in rendered_lines:
                normalized = str(line or "").strip()
                if not normalized or normalized in seen:
                    continue
                seen.add(normalized)
                lines.append(line)
        return lines

    def _summarize_knowledge_evidence_content(
        self,
        item: dict[str, Any],
        query_terms: list[str],
    ) -> str:
        """Return a short summary of an evidence item.

        Tables yield one of two fixed sentences depending on whether the
        relevant preview has at least three non-blank rows; other kinds yield
        the cleaned first line plus up to three following lines.
        """
        kind = str(item.get("kind") or "").strip()
        content = str(item.get("content") or "").strip()
        if kind == "table":
            preview = self._extract_relevant_table_preview(content, query_terms)
            preview_rows = [line for line in preview.splitlines() if line.strip()][:4]
            if len(preview_rows) >= 3:
                return "当前命中的直接依据是一张与问题强相关的标准表,已摘出最相关的表头和行。"
            return "当前命中的直接依据是一张与问题强相关的标准表。"
        lines = self._split_clean_knowledge_lines(content, preserve_marker=True)
        if len(lines) >= 2:
            return self._clean_knowledge_segment_text(f"{lines[0]} {' '.join(lines[1:4])}")
        return self._clean_knowledge_segment_text(content)

    def _build_missing_location_grounding_note(
        self,
        question: str,
        evidence_items: list[dict[str, Any]],
    ) -> str:
        """Return a caveat when the question's location is absent from the evidence.

        Returns ``""`` when no location is extracted from the question or when
        the location string occurs in any evidence heading or content.
        """
        location = self._extract_query_location(question)
        if not location:
            return ""
        haystack = "\n".join(
            str(item.get("heading") or "") + "\n" + str(item.get("content") or "")
            for item in evidence_items
        )
        if location in haystack:
            return ""
        return (
            f"当前命中的制度依据没有直接写出“{location}”对应的地区档位或映射关系,"
            "因此不能直接把它套用到表格中的某一列。"
        )

    def _build_knowledge_search_answer(
        self,
        payload: UserAgentRequest,
        citations: list[UserAgentCitation],
    ) -> str:
        """Build the fallback Markdown answer for a knowledge search.

        Three outcomes: no hits at all, hits without usable evidence or excerpt,
        and the normal case listing up to three evidence items (or up to two hit
        excerpts when no evidence item rendered).
        """
        tool_payload = payload.tool_payload or {}
        context = payload.context_json or {}
        hits = [item for item in (tool_payload.get("hits") or []) if isinstance(item, dict)]
        primary_citation = citations[0] if citations else None
        title = str(
            (primary_citation.title if primary_citation else "")
            or (hits[0].get("title") if hits else "")
            or "相关制度"
        ).strip()
        user_name = str(context.get("name") or "").strip()
        answer_lines: list[str] = []
        if user_name:
            answer_lines.append(f"{user_name},您好。")

        if not hits:
            self._append_markdown_section(
                answer_lines,
                "结论",
                [f"当前没有拿到可用于回答这个问题的《{title}》知识库命中。"],
            )
            self._append_markdown_section(
                answer_lines,
                "说明",
                ["- 我不会用相似主题或外部常识硬凑答案;请补充更具体的关键词后再试一次。"],
            )
            return "\n".join(answer_lines).strip()

        # Evidence is only needed once we know there are hits to describe.
        evidence_items = self._build_knowledge_answer_evidence(payload)
        # Query terms do not depend on the item, so compute them once.
        query_terms = self._extract_knowledge_query_terms(self._resolve_knowledge_question(payload))

        evidence_lines: list[str] = []
        for item in evidence_items[:3]:
            heading = str(item.get("heading") or "").strip()
            if "表格行级检索线索" in heading:
                heading = heading.replace("表格行级检索线索", "").strip(" >")
            heading_text = f"({heading})" if heading else ""
            item_title = item.get("title") or title
            if str(item.get("kind") or "") == "table":
                preview = self._extract_relevant_table_preview(
                    str(item.get("content") or ""),
                    query_terms,
                )
                evidence_lines.append(f"- **《{item_title}》** {heading_text}\n{preview}")
                continue
            rendered = self._render_knowledge_evidence_text(item)
            if rendered:
                if "\n" in rendered:
                    evidence_lines.append(f"- **《{item_title}》** {heading_text}\n{rendered}")
                else:
                    evidence_lines.append(f"- **《{item_title}》** {heading_text}\n {rendered}")

        if not evidence_lines:
            for item in hits[:2]:
                item_title = str(item.get("title") or item.get("document_name") or "相关制度").strip()
                excerpt = (
                    str(item.get("excerpt") or "").strip()
                    or self._extract_excerpt(str(item.get("content") or ""))
                )
                if not excerpt:
                    continue
                evidence_lines.append(f"- **《{item_title}》**:{excerpt}")

        if not evidence_lines:
            self._append_markdown_section(
                answer_lines,
                "结论",
                [f"当前《{title}》里可用于回答这个问题的关键条款还不够明确。"],
            )
            self._append_markdown_section(
                answer_lines,
                "说明",
                ["- 请补充费用类型、适用地区、职级或具体业务场景,我会继续帮您缩小范围。"],
            )
            return "\n".join(answer_lines).strip()

        self._append_markdown_section(
            answer_lines,
            "结论",
            ["我先根据当前制度依据给出可以确认的部分。"],
        )
        self._append_markdown_section(answer_lines, "依据", evidence_lines)
        self._append_markdown_section(
            answer_lines,
            "说明",
            ["- 以上只使用当前命中的知识库证据;没有在证据中出现的适用条件或金额,我不会替您默认补齐。"],
        )
        return "\n".join(answer_lines).strip()