Files
X-Financial/server/src/app/services/knowledge_rag.py
caoxiaozhu d4d5d40569 feat: 新增预算费控模型与报销审批流引擎
后端新增预算费控服务和报销单审批流模块,引入申请人费用画像
算法,优化知识库 RAG 运行时和同步逻辑,完善报销单工作流常
量和明细同步,更新差旅报销规则电子表格,前端新增预算分析
组件和数字员工模型,完善审批对话框和洞察面板交互,优化侧
边栏和顶栏样式,补充单元测试。
2026-05-27 17:31:27 +08:00

878 lines
32 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import os
import re
import socket
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any, Callable
from sqlalchemy.orm import Session
from app.core.config import get_settings
from app.core.logging import get_logger
from app.db.session import get_session_factory
from app.services.knowledge_ingest_log import (
build_document_graph_summary,
build_ingest_document_summary,
build_ingest_status_summary,
)
from app.services.knowledge_rag_local import query_local_text_chunks
from app.services.knowledge_rag_runtime import (
KnowledgeRagError,
RuntimeModelConfig,
_LightRagRuntime,
)
from app.services.settings import SettingsService
logger = get_logger("app.services.knowledge_rag")
DEFAULT_QDRANT_URL = "http://127.0.0.1:6333"
CONTAINER_QDRANT_URL = "http://qdrant:6333"
DEFAULT_LIGHTRAG_WORKSPACE = "x_financial_knowledge"
MAX_KNOWLEDGE_HIT_CONTENT_LENGTH = 2200
MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH = 220
MAX_QUERY_TERMS = 12
QUERY_TERM_STOPWORDS = {
"什么",
"多少",
"哪些",
"怎么",
"如何",
"请问",
"一下",
"关于",
"规定",
"标准",
"可以",
"是否",
"一个",
"哪些人",
}
TABLE_OR_STANDARD_QUERY_HINTS = (
"",
"表格",
"清单",
"明细",
"目录",
"科目",
"标准",
"金额",
"限额",
"补贴",
"住宿",
"餐费",
"交通",
"报销",
"档位",
"额度",
)
QUERY_ANCHOR_TERMS = (
"财务基础知识手册",
"基础知识手册",
"会计科目",
"常用会计科目",
"财务报表",
"主要税种",
"税种",
"标准",
"清单",
"明细",
"流程",
)
GENERIC_TITLE_TERMS = {"远光软件", "股份有限", "有限公司"}
STRUCTURED_APPENDIX_LEADING_MARKERS = (
"# 章节导航",
"# 重点章节摘录",
"# 问答线索补充",
"# 结构化表格补充",
)
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
_runtime_lock = threading.RLock()
_runtime_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="knowledge-rag-runtime")
_runtime_instances: dict[str, _LightRagRuntime] = {}
_runtime_signatures: dict[str, tuple[Any, ...]] = {}
_RUNTIME_CACHE_KEY = "lightrag"
class KnowledgeRagService:
def __init__(self, db: Session | None = None, storage_root: Path | None = None) -> None:
self.db = db
self.storage_root = Path(storage_root or get_settings().resolved_storage_root_dir)
def query_knowledge(
self,
query: str,
*,
conversation_history: list[dict[str, str]] | None = None,
limit: int = 5,
) -> dict[str, Any]:
normalized_query = str(query or "").strip()
if not normalized_query:
return {
"result_type": "knowledge_search",
"query": "",
"record_count": 0,
"hits": [],
"references": [],
"message": "请先输入要检索的知识库问题。",
}
rewritten_query = normalized_query
if conversation_history:
rewritten_query = self._rewrite_query(normalized_query, conversation_history)
workspace = (
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
or DEFAULT_LIGHTRAG_WORKSPACE
)
local_result = query_local_text_chunks(
lightrag_root=(self.storage_root / "knowledge" / ".lightrag").resolve(),
workspace=workspace,
query=rewritten_query,
limit=limit,
)
runtime_hits: list[dict[str, Any]] = []
runtime_references: list[str] = []
if not local_result.confident:
try:
raw = self._run_runtime_operation(
lambda runtime: runtime.query_data(
rewritten_query,
conversation_history=conversation_history,
)
)
data = raw.get("data") if isinstance(raw, dict) else {}
chunks = list(data.get("chunks") or []) if isinstance(data, dict) else []
entities = list(data.get("entities") or []) if isinstance(data, dict) else []
runtime_references = list(data.get("references") or []) if isinstance(data, dict) else []
runtime_hits = self._build_hits_from_query_data(
query=rewritten_query,
chunks=chunks,
entities=entities,
limit=limit,
)
except Exception as exc:
logger.warning("Knowledge query failed: %s", exc)
all_hits: dict[str, dict[str, Any]] = {}
for hit in local_result.hits:
hit["score"] = int(hit.get("score") or 0)
all_hits[hit["code"]] = hit
for hit in runtime_hits:
code = hit["code"]
if code in all_hits:
all_hits[code]["score"] = max(all_hits[code]["score"], int(hit.get("score") or 0) + 20)
if not all_hits[code].get("tags") and hit.get("tags"):
all_hits[code]["tags"] = hit["tags"]
else:
hit["score"] = int(hit.get("score") or 0)
all_hits[code] = hit
merged_hits = sorted(all_hits.values(), key=lambda x: int(x.get("score") or 0), reverse=True)[:max(1, limit)]
if not merged_hits:
return {
"result_type": "knowledge_search",
"query": rewritten_query,
"record_count": 0,
"hits": [],
"references": [],
"raw_references": runtime_references,
"message": "当前知识库中没有检索到与本次问题直接匹配的内容。",
}
return {
"result_type": "knowledge_search",
"query": rewritten_query,
"record_count": len(merged_hits),
"hits": merged_hits,
"references": [
str(item.get("code") or "").strip()
for item in merged_hits
if str(item.get("code") or "").strip()
],
"raw_references": runtime_references,
"metadata": {
"retrieval_strategy": "fusion" if runtime_hits else "local_text_chunks",
"local_total_chunks": local_result.total_chunks,
"local_best_score": local_result.best_score,
},
"message": f"已从知识库中联合检索到 {len(merged_hits)} 条相关内容。",
}
def _rewrite_query(self, query: str, conversation_history: list[dict[str, str]]) -> str:
if not self.db:
return query
from app.services.runtime_chat import RuntimeChatService
try:
chat_service = RuntimeChatService(self.db)
messages: list[dict[str, Any]] = [{"role": "system", "content": "你是一个查询重写助手。你的任务是根据用户的多轮对话历史,将用户的最后一次提问重写为一句独立、完整的查询语句,以便于在知识库中进行向量检索。只输出重写后的句子,不要任何解释。"}]
for msg in conversation_history[-6:]:
messages.append({"role": msg.get("role", "user"), "content": msg.get("content", "")})
messages.append({"role": "user", "content": f"当前提问:{query}\n\n请重写当前提问。"})
rewritten = chat_service.complete(
messages,
max_tokens=60,
temperature=0.1,
timeout_seconds=10,
)
if rewritten and len(rewritten) > 2 and len(rewritten) < 80:
logger.info("Query rewritten: '%s' -> '%s'", query, rewritten)
return rewritten
except Exception as exc:
logger.warning("Query rewrite failed: %s", exc)
return query
def index_documents(
self,
*,
document_ids: list[str],
force: bool = False,
) -> dict[str, Any]:
normalized_ids = [str(item).strip() for item in document_ids if str(item).strip()]
if not normalized_ids:
raise ValueError("没有可供索引的知识文档。")
from app.services.knowledge import KnowledgeService
from app.services.knowledge_normalizer import KnowledgeNormalizationService
knowledge_service = KnowledgeService(storage_root=self.storage_root, db=self.db)
normalization_service = (
KnowledgeNormalizationService(self.db) if self.db is not None else None
)
texts: list[str] = []
file_paths: list[str] = []
document_summaries: list[dict[str, Any]] = []
existing_statuses = self._run_runtime_operation(
lambda runtime: runtime.get_document_statuses(normalized_ids)
)
for document_id in normalized_ids:
entry = knowledge_service.get_document_entry(document_id)
if force and document_id in existing_statuses:
try:
self._run_runtime_operation(
lambda runtime, target_id=document_id: runtime.delete_document(target_id)
)
except Exception as exc:
logger.warning(
"Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc
)
text = knowledge_service.extract_document_text(document_id)
raw_text = text
if normalization_service is not None:
text = normalization_service.build_enriched_text(text)
texts.append(text)
file_paths.append(
str(
(
knowledge_service.library_root / entry["folder"] / entry["stored_name"]
).resolve()
)
)
document_summaries.append(
build_ingest_document_summary(
document_id=document_id,
entry=entry,
raw_text=raw_text,
indexed_text=text,
)
)
track_id = self._run_runtime_operation(
lambda runtime: runtime.insert_documents(
texts=texts,
document_ids=normalized_ids,
file_paths=file_paths,
)
)
statuses = self._run_runtime_operation(
lambda runtime: runtime.get_document_statuses(normalized_ids)
)
succeeded_document_ids: list[str] = []
failed_documents: list[dict[str, str]] = []
summary_by_id = {
str(item.get("document_id") or "").strip(): item
for item in document_summaries
if str(item.get("document_id") or "").strip()
}
for document_id in normalized_ids:
status_obj = statuses.get(document_id)
status_text = self._status_value(status_obj)
status_payload = self._serialize_status(status_obj)
workspace = (
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
or DEFAULT_LIGHTRAG_WORKSPACE
)
graph_summary = build_document_graph_summary(
self.storage_root,
workspace=workspace,
document_id=document_id,
)
if document_id in summary_by_id:
summary_by_id[document_id].update(
build_ingest_status_summary(
status_payload=status_payload,
graph_summary=graph_summary,
)
)
if self.is_query_ready_status(status_obj):
succeeded_document_ids.append(document_id)
continue
failed_documents.append(
{
"document_id": document_id,
"status": status_text or "unknown",
"error": self._status_error(status_obj),
}
)
return {
"track_id": track_id,
"requested_document_ids": normalized_ids,
"succeeded_document_ids": succeeded_document_ids,
"failed_documents": failed_documents,
"document_summaries": [
summary_by_id.get(document_id, {}) for document_id in normalized_ids
],
"status_snapshot": {
document_id: self._serialize_status(status_obj)
for document_id, status_obj in statuses.items()
},
}
def get_document_status_map(
self, document_ids: list[str] | None = None
) -> dict[str, dict[str, Any]]:
target_ids = [str(item).strip() for item in document_ids or [] if str(item).strip()]
if not target_ids:
return {}
try:
statuses = self._run_runtime_operation(
lambda runtime: runtime.get_document_statuses(target_ids)
)
except Exception as exc:
logger.warning("Load LightRAG document statuses failed: %s", exc)
return {}
return {
document_id: self._serialize_status(status_obj)
for document_id, status_obj in statuses.items()
}
def delete_document(self, document_id: str) -> None:
normalized_id = str(document_id or "").strip()
if not normalized_id:
return
try:
self._run_runtime_operation(
lambda runtime: runtime.delete_document(normalized_id)
)
except Exception as exc:
logger.warning("Delete LightRAG document ignored doc_id=%s: %s", normalized_id, exc)
def _run_runtime_operation(self, operation: Callable[[_LightRagRuntime], Any]) -> Any:
signature, runtime_kwargs = self._build_runtime_signature()
return _runtime_executor.submit(
self._execute_runtime_operation,
signature,
runtime_kwargs,
operation,
).result()
def _execute_runtime_operation(
self,
signature: tuple[Any, ...],
runtime_kwargs: dict[str, Any],
operation: Callable[[_LightRagRuntime], Any],
) -> Any:
return operation(self._get_runtime(signature=signature, runtime_kwargs=runtime_kwargs))
def _get_runtime(
self,
*,
signature: tuple[Any, ...] | None = None,
runtime_kwargs: dict[str, Any] | None = None,
) -> _LightRagRuntime:
if signature is None or runtime_kwargs is None:
signature, runtime_kwargs = self._build_runtime_signature()
with _runtime_lock:
runtime = _runtime_instances.get(_RUNTIME_CACHE_KEY)
if runtime is not None and _runtime_signatures.get(_RUNTIME_CACHE_KEY) == signature:
return runtime
if runtime is not None:
try:
runtime.finalize()
except Exception as exc: # pragma: no cover - best effort cleanup
logger.warning("Finalize previous LightRAG runtime failed: %s", exc)
runtime = _LightRagRuntime(**runtime_kwargs)
_runtime_instances[_RUNTIME_CACHE_KEY] = runtime
_runtime_signatures[_RUNTIME_CACHE_KEY] = signature
return runtime
def _build_runtime_signature(self) -> tuple[tuple[Any, ...], dict[str, Any]]:
configs = self._load_runtime_configs()
settings = get_settings()
working_dir = (self.storage_root / "knowledge" / ".lightrag").resolve()
workspace = (
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
or DEFAULT_LIGHTRAG_WORKSPACE
)
qdrant_url = os.environ.get("QDRANT_URL", "").strip() or _resolve_default_qdrant_url()
qdrant_api_key = os.environ.get("QDRANT_API_KEY", "").strip()
signature = (
str(working_dir),
workspace,
qdrant_url,
qdrant_api_key,
configs["main"].provider,
configs["main"].model,
configs["main"].endpoint,
configs["main"].api_key,
configs["backup"].provider if configs["backup"] else "",
configs["backup"].model if configs["backup"] else "",
configs["backup"].endpoint if configs["backup"] else "",
configs["backup"].api_key if configs["backup"] else "",
configs["embedding"].provider,
configs["embedding"].model,
configs["embedding"].endpoint,
configs["embedding"].api_key,
configs["reranker"].provider if configs["reranker"] else "",
configs["reranker"].model if configs["reranker"] else "",
configs["reranker"].endpoint if configs["reranker"] else "",
configs["reranker"].api_key if configs["reranker"] else "",
str(settings.resolved_storage_root_dir),
)
return signature, {
"working_dir": working_dir,
"workspace": workspace,
"qdrant_url": qdrant_url,
"qdrant_api_key": qdrant_api_key,
"primary_chat": configs["main"],
"backup_chat": configs["backup"],
"embedding": configs["embedding"],
"reranker": configs["reranker"],
}
def _load_runtime_configs(self) -> dict[str, RuntimeModelConfig | None]:
owned_session = False
session = self.db
if session is None:
session = get_session_factory()()
owned_session = True
try:
settings_service = SettingsService(session)
main = self._normalize_runtime_model(settings_service.get_runtime_model_config("main"))
embedding = self._normalize_runtime_model(
settings_service.get_runtime_model_config("embedding")
)
try:
backup_raw = settings_service.get_runtime_model_config("backup")
backup = self._normalize_runtime_model(backup_raw)
except Exception:
backup = None
try:
reranker_raw = settings_service.get_runtime_model_config("reranker")
reranker = self._normalize_runtime_model(reranker_raw)
except Exception:
reranker = None
if backup is not None and (
not backup.endpoint
or not backup.model
or (backup.provider != "Ollama" and not backup.api_key)
):
backup = None
if reranker is not None and (
not reranker.endpoint
or not reranker.model
or (reranker.provider != "Ollama" and not reranker.api_key)
):
reranker = None
if not main.endpoint or not main.model:
raise KnowledgeRagError("主对话模型未配置,无法初始化 LightRAG。")
if main.provider != "Ollama" and not main.api_key:
raise KnowledgeRagError("主对话模型缺少 API Key无法初始化 LightRAG。")
if not embedding.endpoint or not embedding.model:
raise KnowledgeRagError("Embedding 模型未配置,无法初始化 LightRAG。")
if embedding.provider != "Ollama" and not embedding.api_key:
raise KnowledgeRagError("Embedding 模型缺少 API Key无法初始化 LightRAG。")
return {
"main": main,
"backup": backup,
"embedding": embedding,
"reranker": reranker,
}
finally:
if owned_session and session is not None:
session.close()
@staticmethod
def _normalize_runtime_model(payload: dict[str, str]) -> RuntimeModelConfig:
return RuntimeModelConfig(
slot=str(payload.get("slot") or "").strip(),
provider=str(payload.get("provider") or "").strip(),
model=str(payload.get("model") or "").strip(),
endpoint=str(payload.get("endpoint") or "").strip(),
api_key=str(payload.get("apiKey") or "").strip(),
capability=str(payload.get("capability") or "").strip(),
)
@staticmethod
def _build_hits_from_query_data(
*,
query: str,
chunks: list[dict[str, Any]],
entities: list[dict[str, Any]],
limit: int,
) -> list[dict[str, Any]]:
entity_tags_by_path: dict[str, list[str]] = {}
for entity in entities:
if not isinstance(entity, dict):
continue
file_path = str(entity.get("file_path") or "").strip()
entity_name = str(entity.get("entity_name") or "").strip()
if not file_path or not entity_name:
continue
entity_tags_by_path.setdefault(file_path, [])
if entity_name not in entity_tags_by_path[file_path]:
entity_tags_by_path[file_path].append(entity_name)
query_terms = _extract_query_terms(query)
prefers_tabular_evidence = any(hint in query for hint in TABLE_OR_STANDARD_QUERY_HINTS)
candidates: list[dict[str, Any]] = []
for rank, chunk in enumerate(chunks, start=1):
if not isinstance(chunk, dict):
continue
file_path = str(chunk.get("file_path") or "").strip()
chunk_id = str(chunk.get("chunk_id") or "").strip()
content = str(chunk.get("content") or "").strip()
if not file_path or not content:
continue
document_id, document_name = _parse_document_identity(file_path)
normalized_chunk_id = chunk_id or f"path-{rank}"
normalized_content = _truncate_text(
content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH
)
excerpt = _build_query_focused_excerpt(
normalized_content,
query_terms=query_terms,
max_length=MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH,
)
candidates.append(
{
"code": f"knowledge.{document_id or 'unknown'}.{normalized_chunk_id}",
"candidate_id": normalized_chunk_id,
"title": document_name or "知识库文档",
"content": normalized_content,
"excerpt": excerpt,
"document_id": document_id,
"document_name": document_name or Path(file_path).name,
"version": None,
"updated_at": None,
"score": max(1, 100 - rank),
"tags": entity_tags_by_path.get(file_path, [])[:5],
"evidence": [normalized_chunk_id],
"file_path": file_path,
"_rank": rank,
}
)
ranked = sorted(
candidates,
key=lambda item: (
_score_knowledge_hit(
item,
query_terms=query_terms,
prefers_tabular_evidence=prefers_tabular_evidence,
),
-int(item.get("_rank") or 0),
),
reverse=True,
)
hits: list[dict[str, Any]] = []
for item in ranked[: max(1, limit)]:
normalized = dict(item)
normalized.pop("_rank", None)
hits.append(normalized)
return hits
@staticmethod
def _serialize_status(status_obj: Any) -> dict[str, Any]:
if status_obj is None:
return {}
if hasattr(status_obj, "__dict__"):
payload = dict(status_obj.__dict__)
elif isinstance(status_obj, dict):
payload = dict(status_obj)
else:
payload = {}
payload["status"] = KnowledgeRagService._status_value(status_obj)
payload["error_msg"] = KnowledgeRagService._status_error(status_obj)
payload["query_ready"] = KnowledgeRagService.is_query_ready_status(status_obj)
return payload
@staticmethod
def _status_value(status_obj: Any) -> str:
raw_status = getattr(status_obj, "status", None)
if raw_status is None and isinstance(status_obj, dict):
raw_status = status_obj.get("status")
normalized = str(raw_status or "").strip().lower()
if "." in normalized:
normalized = normalized.split(".")[-1].strip()
if ":" in normalized and normalized.endswith(">"):
normalized = normalized.split(":")[0].strip("<> '\"")
return normalized
@staticmethod
def _status_error(status_obj: Any) -> str:
value = getattr(status_obj, "error_msg", None)
if value is None and isinstance(status_obj, dict):
value = status_obj.get("error_msg")
return str(value or "").strip()
@staticmethod
def is_query_ready_status(status_obj: Any) -> bool:
status_text = KnowledgeRagService._status_value(status_obj)
if status_text in {"failed", "error", "aborted"}:
return False
if status_text == "processed":
return True
if status_text in {"pending", "processing", "preprocessed"}:
return False
chunks_count = getattr(status_obj, "chunks_count", None)
if chunks_count is None and isinstance(status_obj, dict):
chunks_count = status_obj.get("chunks_count")
try:
if int(chunks_count or 0) > 0:
return True
except (TypeError, ValueError):
pass
chunks_list = getattr(status_obj, "chunks_list", None)
if chunks_list is None and isinstance(status_obj, dict):
chunks_list = status_obj.get("chunks_list")
return bool(chunks_list)
def shutdown_knowledge_rag_runtime() -> None:
_runtime_executor.submit(_shutdown_runtime_instances).result()
def _shutdown_runtime_instances() -> None:
with _runtime_lock:
for runtime in list(_runtime_instances.values()):
try:
runtime.finalize()
except Exception as exc: # pragma: no cover - best effort cleanup
logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
_runtime_instances.clear()
_runtime_signatures.clear()
def _parse_document_identity(file_path: str) -> tuple[str, str]:
path = Path(str(file_path or "").strip())
name = path.name
if "__" not in name:
return "", name
document_id, document_name = name.split("__", maxsplit=1)
return document_id.strip(), document_name.strip()
def _build_excerpt(text: str, *, max_length: int = 180) -> str:
normalized = " ".join(str(text or "").split()).strip()
if len(normalized) <= max_length:
return normalized
return f"{normalized[: max_length - 3].rstrip()}..."
def _build_query_focused_excerpt(
text: str,
*,
query_terms: list[str],
max_length: int = 180,
) -> str:
normalized = " ".join(str(text or "").split()).strip()
if not normalized:
return ""
lowered = normalized.lower()
match_positions = [
lowered.find(term) for term in query_terms if term and lowered.find(term) >= 0
]
if not match_positions:
return _build_excerpt(normalized, max_length=max_length)
start = max(0, min(match_positions) - max_length // 3)
end = min(len(normalized), start + max_length)
snippet = normalized[start:end].strip()
if start > 0:
snippet = f"...{snippet.lstrip()}"
if end < len(normalized):
snippet = f"{snippet.rstrip()}..."
return snippet
def _truncate_text(text: str, *, max_length: int) -> str:
normalized = str(text or "").strip()
if len(normalized) <= max_length:
return normalized
return f"{normalized[: max_length - 3].rstrip()}..."
def _resolve_default_qdrant_url() -> str:
if _hostname_resolves("qdrant"):
return CONTAINER_QDRANT_URL
return DEFAULT_QDRANT_URL
def _hostname_resolves(hostname: str) -> bool:
try:
socket.getaddrinfo(hostname, None)
except OSError:
return False
return True
def _extract_query_terms(query: str) -> list[str]:
normalized_query = str(query or "").strip().lower()
if not normalized_query:
return []
terms: list[str] = []
seen: set[str] = set()
def remember(term: str) -> None:
normalized_term = str(term or "").strip().lower()
if (
not normalized_term
or normalized_term in seen
or normalized_term in QUERY_TERM_STOPWORDS
or len(normalized_term) < 2
):
return
seen.add(normalized_term)
terms.append(normalized_term)
for item in re.findall(r"[a-z0-9][a-z0-9_\-]{1,}", normalized_query):
remember(item)
for block in re.findall(r"[\u4e00-\u9fff]{2,20}", normalized_query):
for marker in ("标准", "金额", "限额", "额度"):
marker_index = block.find(marker)
if marker_index <= 0:
continue
subject = block[:marker_index]
for width in (6, 4, 3, 2):
remember(subject[-width:])
for anchor in QUERY_ANCHOR_TERMS:
if anchor in block:
remember(anchor)
tail = block[-14:]
for size in (8, 7, 6, 5, 4):
for start in range(0, len(tail) - size + 1):
piece = tail[start : start + size]
if any(anchor in piece for anchor in QUERY_ANCHOR_TERMS):
remember(piece)
if len(terms) >= MAX_QUERY_TERMS:
return terms
if len(block) <= 4:
remember(block)
continue
for size in (4, 3, 2):
for start in range(0, len(block) - size + 1):
remember(block[start : start + size])
if len(terms) >= MAX_QUERY_TERMS:
return terms
return terms[:MAX_QUERY_TERMS]
def _score_knowledge_hit(
item: dict[str, Any],
*,
query_terms: list[str],
prefers_tabular_evidence: bool,
) -> int:
rank = max(1, int(item.get("_rank") or 1))
title = str(item.get("title") or item.get("document_name") or "").lower()
content = str(item.get("content") or "").lower()
excerpt = str(item.get("excerpt") or "").lower()
tags = " ".join(str(value).lower() for value in list(item.get("tags") or [])[:5])
haystack = "\n".join([title, excerpt, tags, content[:1200]])
score = max(1, 120 - rank * 4)
matched_terms = [term for term in query_terms if term in haystack]
score += len(matched_terms) * 8
score += sum(1 for term in matched_terms if term in title) * 6
score += sum(
(len(term) - 3) * 12
for term in matched_terms
if len(term) >= 4 and term in title and term not in GENERIC_TITLE_TERMS
)
leading_appendix_marker = _leading_structured_appendix_marker(content)
if leading_appendix_marker == "# 章节导航":
score -= 24
elif leading_appendix_marker == "# 重点章节摘录":
score += 4 if matched_terms else -12
elif leading_appendix_marker == "# 问答线索补充":
score += (
8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
)
elif leading_appendix_marker == "# 结构化表格补充":
if prefers_tabular_evidence and matched_terms:
score += 16
elif matched_terms:
score += 6
else:
score -= 18
if prefers_tabular_evidence and matched_terms and ("|" in content or "" in content):
score += 10
if matched_terms and any(marker in content for marker in ("", ":")):
score += 10
if matched_terms and "\n" in content:
score += 4
if matched_terms and any(marker in content for marker in ("附表", "", "")):
score += 4
if (
not prefers_tabular_evidence
and matched_terms
and any(marker in content for marker in ("", "", "", "-", ""))
):
score += 4
if title and any(term in title for term in query_terms):
score += 6
if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
score -= 12
return score
def _leading_structured_appendix_marker(content: str) -> str:
normalized = str(content or "").lstrip()
for marker in STRUCTURED_APPENDIX_LEADING_MARKERS:
index = normalized.find(marker)
if 0 <= index <= STRUCTURED_APPENDIX_LEADING_WINDOW:
return marker
return ""