from __future__ import annotations import re from typing import Any from app.services.user_agent_knowledge_constants import ( KNOWLEDGE_ARTICLE_PATTERN, KNOWLEDGE_LIST_ITEM_PATTERN, KNOWLEDGE_NUMBERED_ITEM_PATTERN, KNOWLEDGE_QUERY_STOPWORDS, KNOWLEDGE_SECTION_HEADING_PATTERN, MAX_KNOWLEDGE_MODEL_HITS, MAX_KNOWLEDGE_QUERY_TERMS, ) class UserAgentKnowledgeHelpersMixin: GENERIC_KNOWLEDGE_TITLE_TERMS = {"远光软件", "股份有限", "有限公司"} KNOWLEDGE_QUERY_ANCHOR_TERMS = ( "财务基础知识手册", "基础知识手册", "会计科目", "常用会计科目", "财务报表", "主要税种", "税种", "标准", "清单", "明细", "流程", ) @staticmethod def _select_knowledge_model_hits( tool_payload: dict[str, Any], *, question: str | None = None, ) -> list[dict[str, Any]]: raw_hits = [ item for item in list(tool_payload.get("hits") or []) if isinstance(item, dict) ][: max(MAX_KNOWLEDGE_MODEL_HITS + 3, 8)] if not raw_hits: return [] query_terms = UserAgentKnowledgeHelpersMixin._extract_knowledge_query_terms(question or "") if not query_terms: return raw_hits[:MAX_KNOWLEDGE_MODEL_HITS] ranked_hits = sorted( enumerate(raw_hits), key=lambda value: ( UserAgentKnowledgeHelpersMixin._score_knowledge_model_hit( value[1], query_terms=query_terms, rank_index=value[0], ), -value[0], ), reverse=True, ) return [item for _, item in ranked_hits[:MAX_KNOWLEDGE_MODEL_HITS]] @staticmethod def _score_knowledge_model_hit( item: dict[str, Any], *, query_terms: list[str], rank_index: int, ) -> int: title = str(item.get("title") or item.get("document_name") or "").lower() excerpt = str(item.get("excerpt") or "").lower() content = str(item.get("content") or "").lower() haystack = "\n".join([title, excerpt, content[:1400]]) matched_terms = [term for term in query_terms if term in haystack] score = max(1, 48 - rank_index * 4) score += len(matched_terms) * 10 score += sum(max(0, len(term) - 4) * 8 for term in matched_terms) score += sum(1 for term in matched_terms if term in title) * 8 score += sum(max(0, len(term) - 4) * 6 for term in matched_terms if term in title) score += sum( (len(term) - 3) * 10 for term in matched_terms if len(term) >= 4 and term in title and term not in UserAgentKnowledgeHelpersMixin.GENERIC_KNOWLEDGE_TITLE_TERMS ) leading_marker = UserAgentKnowledgeHelpersMixin._leading_knowledge_appendix_marker(content) if leading_marker == "# 章节导航": score -= 22 elif leading_marker == "# 问答线索补充": score += 6 if matched_terms else -8 elif leading_marker == "# 重点章节摘录": score += 4 if matched_terms else -4 elif leading_marker == "# 结构化表格补充": score += 8 if matched_terms else -3 if matched_terms and "|" in content: score += 8 if matched_terms and any(marker in content for marker in (":", ":")): score += 10 if matched_terms and "\n" in content: score += 4 if matched_terms and any(marker in content for marker in ("附表", "第", "条")): score += 4 if matched_terms and any(marker in content for marker in ("第", "条", ":", "-", "•")): score += 4 if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content): score -= 12 return score @staticmethod def _leading_knowledge_appendix_marker(content: str) -> str: normalized = str(content or "").lstrip() for marker in ("# 章节导航", "# 重点章节摘录", "# 问答线索补充", "# 结构化表格补充"): index = normalized.find(marker) if 0 <= index <= 220: return marker return "" def _prioritize_knowledge_evidence_items( self, question: str, evidence_items: list[dict[str, Any]], ) -> list[dict[str, Any]]: if not evidence_items or not self._question_requires_explicit_condition(question): return evidence_items for preferred_kind in ("table", "kv", "clause", "list"): for index, item in enumerate(evidence_items): if str(item.get("kind") or "") != preferred_kind: continue return [item, *evidence_items[:index], *evidence_items[index + 1 :]] for index, item in enumerate(evidence_items): if re.search(r"\d", str(item.get("content") or "")): return [item, *evidence_items[:index], *evidence_items[index + 1 :]] return evidence_items @staticmethod def _is_knowledge_lead_in_segment(item: dict[str, str]) -> bool: kind = str(item.get("kind") or "").strip() content = str(item.get("content") or "").strip() return kind in {"kv", "list", "clause"} and content.endswith((":", ":")) @staticmethod def _extract_knowledge_marker_family(content: str) -> str: normalized = str(content or "").strip() if not normalized: return "" if KNOWLEDGE_ARTICLE_PATTERN.match(normalized): return "article" if re.match(r"^\d+[.)、]\s*", normalized): return "arabic" if re.match(r"^[((][一二三四五六七八九十百零0-9]+[))]\s*", normalized): return "paren" if re.match(r"^[①②③④⑤⑥⑦⑧⑨⑩]\s*", normalized): return "circled" if KNOWLEDGE_LIST_ITEM_PATTERN.match(normalized): return "bullet" return "" @staticmethod def _knowledge_list_marker_sort_key(content: str) -> int: normalized = str(content or "").strip() match = re.match(r"^[((]([一二三四五六七八九十百零0-9]+)[))]", normalized) if not match: return 999 marker = match.group(1) if marker.isdigit(): return int(marker) values = { "零": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10, } if marker in values: return values[marker] if marker.startswith("十") and len(marker) == 2: return 10 + values.get(marker[1], 0) if marker.endswith("十") and len(marker) == 2: return values.get(marker[0], 0) * 10 if "十" in marker: left, right = marker.split("十", 1) return values.get(left, 1) * 10 + values.get(right, 0) return 999 @staticmethod def _format_knowledge_heading_label(heading: str) -> str: parts = [item.strip() for item in str(heading or "").split(">") if item.strip()] return " / ".join(parts) @staticmethod def _has_inline_numbered_knowledge_items(content: str) -> bool: return len( re.findall( r"[((][一二三四五六七八九十百零0-9]+[))]", str(content or ""), ) ) >= 2 @staticmethod def _split_inline_numbered_knowledge_items(content: str) -> list[str]: normalized = str(content or "").strip() if not UserAgentKnowledgeHelpersMixin._has_inline_numbered_knowledge_items(normalized): return [normalized] if normalized else [] marker_pattern = r"[((][一二三四五六七八九十百零0-9]+[))]" first_marker = re.search(marker_pattern, normalized) if first_marker is None: return [normalized] if normalized else [] prefix = normalized[: first_marker.start()].strip(" ::") tail = normalized[first_marker.start() :].strip() item_pattern = ( r"([((][一二三四五六七八九十百零0-9]+[))]\s*.*?" r"(?=\s*[((][一二三四五六七八九十百零0-9]+[))]|\s*$))" ) items = [item.strip() for item in re.findall(item_pattern, tail) if item.strip()] if prefix: return [prefix, *items] return items or [normalized] @staticmethod def _focus_knowledge_segment_content(content: str, query_terms: list[str]) -> str: normalized = re.sub(r"\s+", " ", str(content or "").strip()) if not normalized: return "" anchor_terms = sorted( { str(term or "").strip() for term in query_terms if len(str(term or "").strip()) >= 3 }, key=len, reverse=True, ) anchor_index = -1 for term in anchor_terms: anchor_index = normalized.lower().find(term.lower()) if anchor_index >= 0: break if anchor_index < 0: return normalized prefix_window = normalized[max(0, anchor_index - 40) : anchor_index] marker_match = None for match in re.finditer( r"(?:第[一二三四五六七八九十百零0-9]+[部分章节条]|[一二三四五六七八九十]+、|[((][一二三四五六七八九十百零0-9]+[))])", prefix_window, ): marker_match = match start = anchor_index if marker_match is not None: start = max(0, anchor_index - len(prefix_window) + marker_match.start()) return normalized[start : start + 700].strip() @staticmethod def _split_markdown_table_cells(line: str) -> list[str]: stripped = str(line or "").strip() if stripped.startswith("|"): stripped = stripped[1:] if stripped.endswith("|"): stripped = stripped[:-1] return [ re.sub(r"\s+", " ", cell.replace("**", "").strip()) for cell in stripped.split("|") ] @classmethod def _summarize_knowledge_table_preview(cls, preview: str) -> str: rows: list[list[str]] = [] for line in str(preview or "").splitlines(): if line.count("|") < 2: continue cells = cls._split_markdown_table_cells(line) if not cells or all(re.fullmatch(r":?-{2,}:?", cell.replace(" ", "")) for cell in cells): continue rows.append(cells) if len(rows) < 2: return "可直接参考的标准表如下。" header = rows[0] data_rows = [row for row in rows[1:] if len(row) == len(header)] if len(data_rows) == 1 and len(header) >= 2: row = data_rows[0] subject = row[0] or "该项目" pairs = [ f"{label}:{value}" for label, value in zip(header[1:], row[1:]) if label and value and value not in {"-", "—"} ] if pairs: return f"{subject}的标准为:{';'.join(pairs)}。" return "相关标准项如下,请按表头和行内容对应使用。" def _summarize_knowledge_lines_conclusion( self, lines: list[str], *, heading: str = "", ) -> str: clean_lines = [ self._clean_knowledge_segment_text(line) for line in lines if self._clean_knowledge_segment_text(line) ] if not clean_lines: return "" clean_heading = str(heading or "").strip() if not clean_heading and clean_lines and ":" not in clean_lines[0] and ":" not in clean_lines[0]: clean_heading = clean_lines[0] clean_heading = re.sub( r"^[一二三四五六七八九十百零0-9]+、\s*", "", clean_heading, ) item_labels: list[str] = [] for line in clean_lines: if ":" not in line and ":" not in line: continue label = re.split(r"[::]", line, maxsplit=1)[0].strip() if 1 <= len(label) <= 24: item_labels.append(label) if clean_heading and len(item_labels) >= 2: return f"{clean_heading}包括:{'、'.join(item_labels[:6])}。" if item_labels: return f"{item_labels[0]}:{clean_lines[0].split(':', 1)[-1].strip()}" return clean_lines[0] @staticmethod def _knowledge_lines_have_multiple_labeled_items(lines: list[str]) -> bool: labeled_count = 0 for line in lines: normalized = str(line or "").strip() if ":" not in normalized and ":" not in normalized: continue label = re.split(r"[::]", normalized, maxsplit=1)[0].strip() if 1 <= len(label) <= 24: labeled_count += 1 return labeled_count >= 2 def _score_knowledge_evidence_candidate( self, item: dict[str, str], query_terms: list[str], ) -> int: heading = str(item.get("heading") or "").lower() content = str(item.get("content") or "").lower() kind = str(item.get("kind") or "").strip() haystack = "\n".join([heading, content]) matched_terms = [term for term in query_terms if term in haystack] score = len(matched_terms) * 10 score += sum(max(0, len(term) - 4) * 8 for term in matched_terms) score += sum(1 for term in matched_terms if term in heading) * 6 score += sum(max(0, len(term) - 4) * 6 for term in matched_terms if term in heading) if kind == "table": score += 10 if content.count("\n") < 2: score -= 24 elif kind in {"kv", "clause", "list"}: score += 8 elif kind == "paragraph": score += 4 if "问答线索补充" in heading or "重点章节摘录" in heading: score += 8 if "结构化表格补充" in heading: score += 10 if "章节导航" in heading or "目录" in heading: score -= 16 if re.search(r"[.。…]{6,}", content): score -= 12 if any(hint in content for hint in ("应", "需", "不得", "可以", "标准", "条件", "材料", "审批", "流程", "包括")): score += 3 content_length = len(content) if content_length > 220: score -= min(8, (content_length - 220) // 40) return score @staticmethod def _extract_knowledge_query_terms(question: str) -> list[str]: normalized_question = str(question or "").strip().lower() if not normalized_question: return [] terms: list[str] = [] seen: set[str] = set() def remember(term: str) -> None: normalized = str(term or "").strip().lower() if ( not normalized or normalized in seen or normalized in KNOWLEDGE_QUERY_STOPWORDS ): return seen.add(normalized) terms.append(normalized) for item in re.findall(r"[a-z0-9][a-z0-9_\-]{1,}", normalized_question): remember(item) for block in re.findall(r"[\u4e00-\u9fff]{2,20}", normalized_question): remember(block) if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS: return terms for marker in ("标准", "金额", "限额", "额度"): marker_index = block.find(marker) if marker_index <= 0: continue subject = block[:marker_index] for width in (6, 4, 3, 2): remember(subject[-width:]) for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS: if anchor in block: remember(anchor) tail = block[-14:] for size in (8, 7, 6, 5, 4): for start in range(0, len(tail) - size + 1): piece = tail[start : start + size] if any( anchor in piece for anchor in UserAgentKnowledgeHelpersMixin.KNOWLEDGE_QUERY_ANCHOR_TERMS ): remember(piece) if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS: return terms if len(block) <= 4: remember(block) continue for size in (4, 3, 2): for start in range(0, len(block) - size + 1): remember(block[start : start + size]) if len(terms) >= MAX_KNOWLEDGE_QUERY_TERMS: return terms return terms[:MAX_KNOWLEDGE_QUERY_TERMS] @staticmethod def _clean_knowledge_segment_text(content: str) -> str: normalized = str(content or "").strip() normalized = re.sub(r"^[-*•]\s*", "", normalized) normalized = re.sub(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*", "", normalized) normalized = re.sub(r"^[((][一二三四五六七八九十百零0-9]+[))]\s*", "", normalized) normalized = re.sub(r"\s+", " ", normalized) if len(normalized) <= 180: return normalized return f"{normalized[:177].rstrip()}..." @staticmethod def _normalize_knowledge_line(content: str, *, preserve_marker: bool) -> str: normalized = str(content or "").strip() normalized = re.sub(r"^[-*•]\s*", "", normalized) if not preserve_marker: normalized = re.sub(r"^(?:\d+[.)、]|[①②③④⑤⑥⑦⑧⑨⑩])\s*", "", normalized) normalized = re.sub(r"^[((][一二三四五六七八九十百零0-9]+[))]\s*", "", normalized) normalized = re.sub(r"\s+", " ", normalized) return normalized def _split_clean_knowledge_lines( self, content: str, *, preserve_marker: bool, ) -> list[str]: return [ line for line in ( self._normalize_knowledge_line(item, preserve_marker=preserve_marker) for item in str(content or "").splitlines() ) if line ] @staticmethod def _extract_relevant_table_preview( content: str, query_terms: list[str], *, preferred_terms: list[str] | None = None, max_rows: int = 3, fallback_rows: int = 2, ) -> str: lines = [line.strip() for line in str(content or "").splitlines() if line.strip()] if len(lines) <= 3: return "\n".join(lines) header = lines[0] divider = lines[1] if len(lines) > 1 else "" body = lines[2:] if divider.count("|") >= 2 else lines[1:] preferred = [ str(term or "").strip().lower() for term in list(preferred_terms or []) if str(term or "").strip() ] base_terms = preferred + [ str(term or "").strip().lower() for term in query_terms if str(term or "").strip().lower() not in preferred ] derived_terms: list[str] = [] for term in base_terms: for marker in ("标准", "金额", "限额", "额度", "是多少"): marker_index = term.find(marker) if marker_index <= 0: continue subject = term[:marker_index].strip() if len(subject) < 2: continue for width in (6, 4, 3, 2): derived_terms.append(subject[-width:]) search_terms: list[str] = [] for term in [*preferred, *derived_terms, *base_terms]: if term and term not in search_terms: search_terms.append(term) matched_rows = [ row for row in body if any(term in row.lower() for term in search_terms) ] selected_rows = matched_rows[:max_rows] or body[:fallback_rows] preview_lines = [header] if divider: preview_lines.append(divider) preview_lines.extend(selected_rows) return "\n".join(preview_lines).strip() @staticmethod def _question_requests_broad_knowledge_table(question: str) -> bool: normalized = str(question or "").strip() if not normalized: return False broad_hints = ("有哪些", "是什么", "介绍", "说明", "列表", "清单", "全部", "完整") table_subject_hints = ("科目", "目录", "清单", "列表", "表", "明细") return any(hint in normalized for hint in broad_hints) and any( hint in normalized for hint in table_subject_hints ) @staticmethod def _question_requires_explicit_condition(question: str) -> bool: normalized = str(question or "").strip() return any(keyword in normalized for keyword in ("多少", "金额", "上限", "限额", "标准", "条件", "需要")) @staticmethod def _answer_evidence_has_numeric_or_condition(evidence_items: list[dict[str, Any]]) -> bool: for item in evidence_items: content = str(item.get("content") or "") if re.search(r"\d", content): return True if any( keyword in content for keyword in ("应", "需", "不得", "可以", "条件", "材料", "审批", "流程", "标准", "适用") ): return True return False