X-Financial/server/src/app/services/knowledge_rag.py

from __future__ import annotations

import os
import re
import socket
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any, Callable

from sqlalchemy.orm import Session

from app.core.config import get_settings
from app.core.logging import get_logger
from app.db.session import get_session_factory
from app.services.knowledge_ingest_log import (
    build_document_graph_summary,
    build_ingest_document_summary,
    build_ingest_status_summary,
)
from app.services.knowledge_rag_local import query_local_text_chunks
from app.services.knowledge_rag_runtime import (
    KnowledgeRagError,
    RuntimeModelConfig,
    _LightRagRuntime,
)
from app.services.settings import SettingsService

# Module-level logger shared by the service class and the helper functions below.
logger = get_logger("app.services.knowledge_rag")

# Qdrant endpoint used when QDRANT_URL is unset and the "qdrant" hostname does not resolve.
DEFAULT_QDRANT_URL = "http://127.0.0.1:6333"
# Qdrant endpoint used when QDRANT_URL is unset and the "qdrant" hostname resolves.
CONTAINER_QDRANT_URL = "http://qdrant:6333"
# Workspace name used when the LIGHTRAG_WORKSPACE environment variable is unset or blank.
DEFAULT_LIGHTRAG_WORKSPACE = "x_financial_knowledge"
# Character caps applied to a hit's "content" and "excerpt" fields respectively.
MAX_KNOWLEDGE_HIT_CONTENT_LENGTH = 2200
MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH = 220
# Upper bound on the number of terms produced by _extract_query_terms.
MAX_QUERY_TERMS = 12
# Terms that _extract_query_terms never records as query terms.
QUERY_TERM_STOPWORDS = {
    "什么",
    "多少",
    "哪些",
    "怎么",
    "如何",
    "请问",
    "一下",
    "关于",
    "规定",
    "标准",
    "可以",
    "是否",
    "一个",
    "哪些人",
}
# If any of these substrings occurs in the query, hit scoring is told to
# prefer tabular evidence (see _build_hits_from_query_data).
TABLE_OR_STANDARD_QUERY_HINTS = (
    "表",
    "表格",
    "清单",
    "明细",
    "目录",
    "科目",
    "标准",
    "金额",
    "限额",
    "补贴",
    "住宿",
    "餐费",
    "交通",
    "报销",
    "档位",
    "额度",
)
# Phrases that _extract_query_terms records verbatim when they occur in a
# query block, and that mark sliding-window pieces as worth keeping.
QUERY_ANCHOR_TERMS = (
    "财务基础知识手册",
    "基础知识手册",
    "会计科目",
    "常用会计科目",
    "财务报表",
    "主要税种",
    "税种",
    "标准",
    "清单",
    "明细",
    "流程",
)
# Title terms excluded from the long-term title bonus in _score_knowledge_hit.
GENERIC_TITLE_TERMS = {"远光软件", "股份有限", "有限公司"}
# Headings looked for near the start of a chunk by
# _leading_structured_appendix_marker; each one adjusts the hit score.
STRUCTURED_APPENDIX_LEADING_MARKERS = (
    "# 章节导航",
    "# 重点章节摘录",
    "# 问答线索补充",
    "# 结构化表格补充",
)
# A marker only counts as "leading" if it starts within this many characters.
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
# Re-entrant lock guarding the runtime cache dictionaries below.
_runtime_lock = threading.RLock()
# Single-worker executor: every runtime operation is submitted here, so
# operations run one at a time on the same worker thread.
_runtime_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="knowledge-rag-runtime")
# Cached runtime and the configuration signature it was built from, both
# stored under _RUNTIME_CACHE_KEY.
_runtime_instances: dict[str, _LightRagRuntime] = {}
_runtime_signatures: dict[str, tuple[Any, ...]] = {}
_RUNTIME_CACHE_KEY = "lightrag"


class KnowledgeRagService:
    def __init__(self, db: Session | None = None, storage_root: Path | None = None) -> None:
        self.db = db
        self.storage_root = Path(storage_root or get_settings().resolved_storage_root_dir)

    def query_knowledge(
        self,
        query: str,
        *,
        conversation_history: list[dict[str, str]] | None = None,
        limit: int = 5,
    ) -> dict[str, Any]:
        normalized_query = str(query or "").strip()
        if not normalized_query:
            return {
                "result_type": "knowledge_search",
                "query": "",
                "record_count": 0,
                "hits": [],
                "references": [],
                "message": "请先输入要检索的知识库问题。",
            }

        rewritten_query = normalized_query
        if conversation_history:
            rewritten_query = self._rewrite_query(normalized_query, conversation_history)

        workspace = (
            os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
            or DEFAULT_LIGHTRAG_WORKSPACE
        )
        local_result = query_local_text_chunks(
            lightrag_root=(self.storage_root / "knowledge" / ".lightrag").resolve(),
            workspace=workspace,
            query=rewritten_query,
            limit=limit,
        )

        runtime_hits: list[dict[str, Any]] = []
        runtime_references: list[str] = []
        if not local_result.confident:
            try:
                raw = self._run_runtime_operation(
                    lambda runtime: runtime.query_data(
                        rewritten_query,
                        conversation_history=conversation_history,
                    )
                )
                data = raw.get("data") if isinstance(raw, dict) else {}
                chunks = list(data.get("chunks") or []) if isinstance(data, dict) else []
                entities = list(data.get("entities") or []) if isinstance(data, dict) else []
                runtime_references = list(data.get("references") or []) if isinstance(data, dict) else []
                runtime_hits = self._build_hits_from_query_data(
                    query=rewritten_query,
                    chunks=chunks,
                    entities=entities,
                    limit=limit,
                )
            except Exception as exc:
                logger.warning("Knowledge query failed: %s", exc)

        all_hits: dict[str, dict[str, Any]] = {}
        for hit in local_result.hits:
            hit["score"] = int(hit.get("score") or 0)
            all_hits[hit["code"]] = hit

        for hit in runtime_hits:
            code = hit["code"]
            if code in all_hits:
                all_hits[code]["score"] = max(all_hits[code]["score"], int(hit.get("score") or 0) + 20)
                if not all_hits[code].get("tags") and hit.get("tags"):
                    all_hits[code]["tags"] = hit["tags"]
            else:
                hit["score"] = int(hit.get("score") or 0)
                all_hits[code] = hit

        merged_hits = sorted(all_hits.values(), key=lambda x: int(x.get("score") or 0), reverse=True)[:max(1, limit)]

        if not merged_hits:
            return {
                "result_type": "knowledge_search",
                "query": rewritten_query,
                "record_count": 0,
                "hits": [],
                "references": [],
                "raw_references": runtime_references,
                "message": "当前知识库中没有检索到与本次问题直接匹配的内容。",
            }

        return {
            "result_type": "knowledge_search",
            "query": rewritten_query,
            "record_count": len(merged_hits),
            "hits": merged_hits,
            "references": [
                str(item.get("code") or "").strip()
                for item in merged_hits
                if str(item.get("code") or "").strip()
            ],
            "raw_references": runtime_references,
            "metadata": {
                "retrieval_strategy": "fusion" if runtime_hits else "local_text_chunks",
                "local_total_chunks": local_result.total_chunks,
                "local_best_score": local_result.best_score,
            },
            "message": f"已从知识库中联合检索到 {len(merged_hits)} 条相关内容。",
        }

    def _rewrite_query(self, query: str, conversation_history: list[dict[str, str]]) -> str:
        if not self.db:
            return query

        from app.services.runtime_chat import RuntimeChatService
        try:
            chat_service = RuntimeChatService(self.db)
            messages: list[dict[str, Any]] = [{"role": "system", "content": "你是一个查询重写助手。你的任务是根据用户的多轮对话历史，将用户的最后一次提问重写为一句独立、完整的查询语句，以便于在知识库中进行向量检索。只输出重写后的句子，不要任何解释。"}]
            for msg in conversation_history[-6:]:
                messages.append({"role": msg.get("role", "user"), "content": msg.get("content", "")})
            messages.append({"role": "user", "content": f"当前提问：{query}\n\n请重写当前提问。"})

            rewritten = chat_service.complete(
                messages,
                max_tokens=60,
                temperature=0.1,
                timeout_seconds=10,
            )

            if rewritten and len(rewritten) > 2 and len(rewritten) < 80:
                logger.info("Query rewritten: '%s' -> '%s'", query, rewritten)
                return rewritten
        except Exception as exc:
            logger.warning("Query rewrite failed: %s", exc)

        return query

    def index_documents(
        self,
        *,
        document_ids: list[str],
        force: bool = False,
    ) -> dict[str, Any]:
        normalized_ids = [str(item).strip() for item in document_ids if str(item).strip()]
        if not normalized_ids:
            raise ValueError("没有可供索引的知识文档。")

        from app.services.knowledge import KnowledgeService
        from app.services.knowledge_normalizer import KnowledgeNormalizationService

        knowledge_service = KnowledgeService(storage_root=self.storage_root, db=self.db)
        normalization_service = (
            KnowledgeNormalizationService(self.db) if self.db is not None else None
        )
        texts: list[str] = []
        file_paths: list[str] = []
        document_summaries: list[dict[str, Any]] = []

        existing_statuses = self._run_runtime_operation(
            lambda runtime: runtime.get_document_statuses(normalized_ids)
        )

        for document_id in normalized_ids:
            entry = knowledge_service.get_document_entry(document_id)
            if force and document_id in existing_statuses:
                try:
                    self._run_runtime_operation(
                        lambda runtime, target_id=document_id: runtime.delete_document(target_id)
                    )
                except Exception as exc:
                    logger.warning(
                        "Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc
                    )
            text = knowledge_service.extract_document_text(document_id)
            raw_text = text
            if normalization_service is not None:
                text = normalization_service.build_enriched_text(text)
            texts.append(text)
            file_paths.append(
                str(
                    (
                        knowledge_service.library_root / entry["folder"] / entry["stored_name"]
                    ).resolve()
                )
            )
            document_summaries.append(
                build_ingest_document_summary(
                    document_id=document_id,
                    entry=entry,
                    raw_text=raw_text,
                    indexed_text=text,
                )
            )

        track_id = self._run_runtime_operation(
            lambda runtime: runtime.insert_documents(
                texts=texts,
                document_ids=normalized_ids,
                file_paths=file_paths,
            )
        )

        statuses = self._run_runtime_operation(
            lambda runtime: runtime.get_document_statuses(normalized_ids)
        )
        succeeded_document_ids: list[str] = []
        failed_documents: list[dict[str, str]] = []
        summary_by_id = {
            str(item.get("document_id") or "").strip(): item
            for item in document_summaries
            if str(item.get("document_id") or "").strip()
        }

        for document_id in normalized_ids:
            status_obj = statuses.get(document_id)
            status_text = self._status_value(status_obj)
            status_payload = self._serialize_status(status_obj)
            workspace = (
                os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
                or DEFAULT_LIGHTRAG_WORKSPACE
            )
            graph_summary = build_document_graph_summary(
                self.storage_root,
                workspace=workspace,
                document_id=document_id,
            )
            if document_id in summary_by_id:
                summary_by_id[document_id].update(
                    build_ingest_status_summary(
                        status_payload=status_payload,
                        graph_summary=graph_summary,
                    )
                )
            if self.is_query_ready_status(status_obj):
                succeeded_document_ids.append(document_id)
                continue
            failed_documents.append(
                {
                    "document_id": document_id,
                    "status": status_text or "unknown",
                    "error": self._status_error(status_obj),
                }
            )

        return {
            "track_id": track_id,
            "requested_document_ids": normalized_ids,
            "succeeded_document_ids": succeeded_document_ids,
            "failed_documents": failed_documents,
            "document_summaries": [
                summary_by_id.get(document_id, {}) for document_id in normalized_ids
            ],
            "status_snapshot": {
                document_id: self._serialize_status(status_obj)
                for document_id, status_obj in statuses.items()
            },
        }

    def get_document_status_map(
        self, document_ids: list[str] | None = None
    ) -> dict[str, dict[str, Any]]:
        target_ids = [str(item).strip() for item in document_ids or [] if str(item).strip()]
        if not target_ids:
            return {}
        try:
            statuses = self._run_runtime_operation(
                lambda runtime: runtime.get_document_statuses(target_ids)
            )
        except Exception as exc:
            logger.warning("Load LightRAG document statuses failed: %s", exc)
            return {}
        return {
            document_id: self._serialize_status(status_obj)
            for document_id, status_obj in statuses.items()
        }

    def delete_document(self, document_id: str) -> None:
        normalized_id = str(document_id or "").strip()
        if not normalized_id:
            return
        try:
            self._run_runtime_operation(
                lambda runtime: runtime.delete_document(normalized_id)
            )
        except Exception as exc:
            logger.warning("Delete LightRAG document ignored doc_id=%s: %s", normalized_id, exc)

    def _run_runtime_operation(self, operation: Callable[[_LightRagRuntime], Any]) -> Any:
        signature, runtime_kwargs = self._build_runtime_signature()
        return _runtime_executor.submit(
            self._execute_runtime_operation,
            signature,
            runtime_kwargs,
            operation,
        ).result()

    def _execute_runtime_operation(
        self,
        signature: tuple[Any, ...],
        runtime_kwargs: dict[str, Any],
        operation: Callable[[_LightRagRuntime], Any],
    ) -> Any:
        return operation(self._get_runtime(signature=signature, runtime_kwargs=runtime_kwargs))

    def _get_runtime(
        self,
        *,
        signature: tuple[Any, ...] | None = None,
        runtime_kwargs: dict[str, Any] | None = None,
    ) -> _LightRagRuntime:
        if signature is None or runtime_kwargs is None:
            signature, runtime_kwargs = self._build_runtime_signature()
        with _runtime_lock:
            runtime = _runtime_instances.get(_RUNTIME_CACHE_KEY)
            if runtime is not None and _runtime_signatures.get(_RUNTIME_CACHE_KEY) == signature:
                return runtime

            if runtime is not None:
                try:
                    runtime.finalize()
                except Exception as exc:  # pragma: no cover - best effort cleanup
                    logger.warning("Finalize previous LightRAG runtime failed: %s", exc)

            runtime = _LightRagRuntime(**runtime_kwargs)
            _runtime_instances[_RUNTIME_CACHE_KEY] = runtime
            _runtime_signatures[_RUNTIME_CACHE_KEY] = signature
            return runtime

    def _build_runtime_signature(self) -> tuple[tuple[Any, ...], dict[str, Any]]:
        configs = self._load_runtime_configs()
        settings = get_settings()
        working_dir = (self.storage_root / "knowledge" / ".lightrag").resolve()
        workspace = (
            os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
            or DEFAULT_LIGHTRAG_WORKSPACE
        )
        qdrant_url = os.environ.get("QDRANT_URL", "").strip() or _resolve_default_qdrant_url()
        qdrant_api_key = os.environ.get("QDRANT_API_KEY", "").strip()

        signature = (
            str(working_dir),
            workspace,
            qdrant_url,
            qdrant_api_key,
            configs["main"].provider,
            configs["main"].model,
            configs["main"].endpoint,
            configs["main"].api_key,
            configs["backup"].provider if configs["backup"] else "",
            configs["backup"].model if configs["backup"] else "",
            configs["backup"].endpoint if configs["backup"] else "",
            configs["backup"].api_key if configs["backup"] else "",
            configs["embedding"].provider,
            configs["embedding"].model,
            configs["embedding"].endpoint,
            configs["embedding"].api_key,
            configs["reranker"].provider if configs["reranker"] else "",
            configs["reranker"].model if configs["reranker"] else "",
            configs["reranker"].endpoint if configs["reranker"] else "",
            configs["reranker"].api_key if configs["reranker"] else "",
            str(settings.resolved_storage_root_dir),
        )

        return signature, {
            "working_dir": working_dir,
            "workspace": workspace,
            "qdrant_url": qdrant_url,
            "qdrant_api_key": qdrant_api_key,
            "primary_chat": configs["main"],
            "backup_chat": configs["backup"],
            "embedding": configs["embedding"],
            "reranker": configs["reranker"],
        }

    def _load_runtime_configs(self) -> dict[str, RuntimeModelConfig | None]:
        owned_session = False
        session = self.db
        if session is None:
            session = get_session_factory()()
            owned_session = True

        try:
            settings_service = SettingsService(session)
            main = self._normalize_runtime_model(settings_service.get_runtime_model_config("main"))
            embedding = self._normalize_runtime_model(
                settings_service.get_runtime_model_config("embedding")
            )
            try:
                backup_raw = settings_service.get_runtime_model_config("backup")
                backup = self._normalize_runtime_model(backup_raw)
            except Exception:
                backup = None
            try:
                reranker_raw = settings_service.get_runtime_model_config("reranker")
                reranker = self._normalize_runtime_model(reranker_raw)
            except Exception:
                reranker = None
            if backup is not None and (
                not backup.endpoint
                or not backup.model
                or (backup.provider != "Ollama" and not backup.api_key)
            ):
                backup = None
            if reranker is not None and (
                not reranker.endpoint
                or not reranker.model
                or (reranker.provider != "Ollama" and not reranker.api_key)
            ):
                reranker = None
            if not main.endpoint or not main.model:
                raise KnowledgeRagError("主对话模型未配置，无法初始化 LightRAG。")
            if main.provider != "Ollama" and not main.api_key:
                raise KnowledgeRagError("主对话模型缺少 API Key，无法初始化 LightRAG。")
            if not embedding.endpoint or not embedding.model:
                raise KnowledgeRagError("Embedding 模型未配置，无法初始化 LightRAG。")
            if embedding.provider != "Ollama" and not embedding.api_key:
                raise KnowledgeRagError("Embedding 模型缺少 API Key，无法初始化 LightRAG。")
            return {
                "main": main,
                "backup": backup,
                "embedding": embedding,
                "reranker": reranker,
            }
        finally:
            if owned_session and session is not None:
                session.close()

    @staticmethod
    def _normalize_runtime_model(payload: dict[str, str]) -> RuntimeModelConfig:
        """Convert a settings payload into a ``RuntimeModelConfig``.

        Every field is coerced to ``str`` and stripped; missing or falsy
        values become ``""``. The API key is read from the ``apiKey`` key.
        """

        def text(key: str) -> str:
            return str(payload.get(key) or "").strip()

        return RuntimeModelConfig(
            slot=text("slot"),
            provider=text("provider"),
            model=text("model"),
            endpoint=text("endpoint"),
            api_key=text("apiKey"),
            capability=text("capability"),
        )

    @staticmethod
    def _build_hits_from_query_data(
        *,
        query: str,
        chunks: list[dict[str, Any]],
        entities: list[dict[str, Any]],
        limit: int,
    ) -> list[dict[str, Any]]:
        """Turn raw runtime chunks and entities into ranked hit dicts.

        Args:
            query: Query text used for term extraction and excerpt focus.
            chunks: Chunk dicts; entries that are not dicts or lack
                ``file_path``/``content`` are skipped.
            entities: Entity dicts; their names become tags of chunks that
                share the same ``file_path`` string.
            limit: Maximum number of hits (at least one slot is kept).

        Returns:
            Hit dicts ordered by the heuristic relevance score, best first.
        """
        # Collect de-duplicated entity names per file path, in first-seen order.
        tags_by_path: dict[str, list[str]] = {}
        for entity in entities:
            if not isinstance(entity, dict):
                continue
            entity_path = str(entity.get("file_path") or "").strip()
            entity_name = str(entity.get("entity_name") or "").strip()
            if entity_path and entity_name:
                known_names = tags_by_path.setdefault(entity_path, [])
                if entity_name not in known_names:
                    known_names.append(entity_name)

        query_terms = _extract_query_terms(query)
        prefers_tabular_evidence = any(hint in query for hint in TABLE_OR_STANDARD_QUERY_HINTS)

        candidates: list[dict[str, Any]] = []
        for rank, chunk in enumerate(chunks, start=1):
            if not isinstance(chunk, dict):
                continue
            file_path = str(chunk.get("file_path") or "").strip()
            content = str(chunk.get("content") or "").strip()
            if not (file_path and content):
                continue

            document_id, document_name = _parse_document_identity(file_path)
            # Chunks without an id get a synthetic one derived from their rank.
            chunk_key = str(chunk.get("chunk_id") or "").strip() or f"path-{rank}"
            clipped_content = _truncate_text(
                content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH
            )
            candidates.append(
                {
                    "code": f"knowledge.{document_id or 'unknown'}.{chunk_key}",
                    "candidate_id": chunk_key,
                    "title": document_name or "知识库文档",
                    "content": clipped_content,
                    "excerpt": _build_query_focused_excerpt(
                        clipped_content,
                        query_terms=query_terms,
                        max_length=MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH,
                    ),
                    "document_id": document_id,
                    "document_name": document_name or Path(file_path).name,
                    "version": None,
                    "updated_at": None,
                    "score": max(1, 100 - rank),
                    "tags": tags_by_path.get(file_path, [])[:5],
                    "evidence": [chunk_key],
                    "file_path": file_path,
                    "_rank": rank,
                }
            )

        # NOTE(review): the heuristic score below only drives ordering and
        # selection; the "score" field of each hit stays rank-based.
        def ordering(candidate: dict[str, Any]) -> tuple[int, int]:
            relevance = _score_knowledge_hit(
                candidate,
                query_terms=query_terms,
                prefers_tabular_evidence=prefers_tabular_evidence,
            )
            # Highest relevance first; the earlier original rank breaks ties.
            return (-relevance, int(candidate.get("_rank") or 0))

        candidates.sort(key=ordering)

        # "_rank" is internal bookkeeping and is stripped from the output.
        return [
            {key: value for key, value in candidate.items() if key != "_rank"}
            for candidate in candidates[: max(1, limit)]
        ]

    @staticmethod
    def _serialize_status(status_obj: Any) -> dict[str, Any]:
        if status_obj is None:
            return {}
        if hasattr(status_obj, "__dict__"):
            payload = dict(status_obj.__dict__)
        elif isinstance(status_obj, dict):
            payload = dict(status_obj)
        else:
            payload = {}
        payload["status"] = KnowledgeRagService._status_value(status_obj)
        payload["error_msg"] = KnowledgeRagService._status_error(status_obj)
        payload["query_ready"] = KnowledgeRagService.is_query_ready_status(status_obj)
        return payload

    @staticmethod
    def _status_value(status_obj: Any) -> str:
        raw_status = getattr(status_obj, "status", None)
        if raw_status is None and isinstance(status_obj, dict):
            raw_status = status_obj.get("status")
        normalized = str(raw_status or "").strip().lower()
        if "." in normalized:
            normalized = normalized.split(".")[-1].strip()
        if ":" in normalized and normalized.endswith(">"):
            normalized = normalized.split(":")[0].strip("<> '\"")
        return normalized

    @staticmethod
    def _status_error(status_obj: Any) -> str:
        value = getattr(status_obj, "error_msg", None)
        if value is None and isinstance(status_obj, dict):
            value = status_obj.get("error_msg")
        return str(value or "").strip()

    @staticmethod
    def is_query_ready_status(status_obj: Any) -> bool:
        status_text = KnowledgeRagService._status_value(status_obj)
        if status_text in {"failed", "error", "aborted"}:
            return False
        if status_text == "processed":
            return True
        if status_text in {"pending", "processing", "preprocessed"}:
            return False

        chunks_count = getattr(status_obj, "chunks_count", None)
        if chunks_count is None and isinstance(status_obj, dict):
            chunks_count = status_obj.get("chunks_count")
        try:
            if int(chunks_count or 0) > 0:
                return True
        except (TypeError, ValueError):
            pass

        chunks_list = getattr(status_obj, "chunks_list", None)
        if chunks_list is None and isinstance(status_obj, dict):
            chunks_list = status_obj.get("chunks_list")
        return bool(chunks_list)


def shutdown_knowledge_rag_runtime() -> None:
    """Finalize all cached runtimes on the runtime executor and wait for it."""
    pending = _runtime_executor.submit(_shutdown_runtime_instances)
    pending.result()


def _shutdown_runtime_instances() -> None:
    """Finalize every cached runtime and empty both cache dictionaries.

    A failing ``finalize()`` is logged as a warning and does not prevent the
    remaining runtimes from being finalized or the caches from being cleared.
    """
    with _runtime_lock:
        cached_runtimes = list(_runtime_instances.values())
        for cached in cached_runtimes:
            try:
                cached.finalize()
            except Exception as exc:  # pragma: no cover - best effort cleanup
                logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
        for cache in (_runtime_instances, _runtime_signatures):
            cache.clear()


def _parse_document_identity(file_path: str) -> tuple[str, str]:
    path = Path(str(file_path or "").strip())
    name = path.name
    if "__" not in name:
        return "", name
    document_id, document_name = name.split("__", maxsplit=1)
    return document_id.strip(), document_name.strip()


def _build_excerpt(text: str, *, max_length: int = 180) -> str:
    normalized = " ".join(str(text or "").split()).strip()
    if len(normalized) <= max_length:
        return normalized
    return f"{normalized[: max_length - 3].rstrip()}..."


def _build_query_focused_excerpt(
    text: str,
    *,
    query_terms: list[str],
    max_length: int = 180,
) -> str:
    normalized = " ".join(str(text or "").split()).strip()
    if not normalized:
        return ""

    lowered = normalized.lower()
    match_positions = [
        lowered.find(term) for term in query_terms if term and lowered.find(term) >= 0
    ]
    if not match_positions:
        return _build_excerpt(normalized, max_length=max_length)

    start = max(0, min(match_positions) - max_length // 3)
    end = min(len(normalized), start + max_length)
    snippet = normalized[start:end].strip()
    if start > 0:
        snippet = f"...{snippet.lstrip()}"
    if end < len(normalized):
        snippet = f"{snippet.rstrip()}..."
    return snippet


def _truncate_text(text: str, *, max_length: int) -> str:
    normalized = str(text or "").strip()
    if len(normalized) <= max_length:
        return normalized
    return f"{normalized[: max_length - 3].rstrip()}..."


def _resolve_default_qdrant_url() -> str:
    """Pick the default Qdrant URL.

    Returns the container URL when the ``qdrant`` hostname resolves, and the
    loopback URL otherwise.
    """
    return CONTAINER_QDRANT_URL if _hostname_resolves("qdrant") else DEFAULT_QDRANT_URL


def _hostname_resolves(hostname: str) -> bool:
    """Return ``True`` when ``socket.getaddrinfo`` can resolve ``hostname``.

    Any ``OSError`` raised by the lookup is reported as ``False``.
    """
    try:
        socket.getaddrinfo(hostname, None)
    except OSError:
        resolved = False
    else:
        resolved = True
    return resolved


def _extract_query_terms(query: str) -> list[str]:
    """Derive at most ``MAX_QUERY_TERMS`` lower-cased search terms from a query.

    Terms are collected in priority order, without duplicates, stopwords or
    single-character entries:

    1. ASCII alphanumeric tokens of two or more characters.
    2. For each run of 2-20 CJK characters: the text just before a
       standard/amount marker, anchor phrases found in the run,
       anchor-containing windows from the run's tail, and finally either the
       whole run (four characters or fewer) or its 4/3/2-character windows.

    Args:
        query: Raw user query; ``None`` or blank yields ``[]``.

    Returns:
        The collected terms, never more than ``MAX_QUERY_TERMS`` entries.
    """
    normalized_query = str(query or "").strip().lower()
    if not normalized_query:
        return []

    terms: list[str] = []
    seen: set[str] = set()

    def remember(term: str) -> None:
        # Record a term unless it is blank, a duplicate, a stopword or too short.
        normalized_term = str(term or "").strip().lower()
        if (
            not normalized_term
            or normalized_term in seen
            or normalized_term in QUERY_TERM_STOPWORDS
            or len(normalized_term) < 2
        ):
            return
        seen.add(normalized_term)
        terms.append(normalized_term)

    for item in re.findall(r"[a-z0-9][a-z0-9_\-]{1,}", normalized_query):
        remember(item)

    for block in re.findall(r"[\u4e00-\u9fff]{2,20}", normalized_query):
        # The subject in front of a "standard/amount" marker is the most
        # specific part of such a question, so its suffixes go in first.
        for marker in ("标准", "金额", "限额", "额度"):
            marker_index = block.find(marker)
            if marker_index <= 0:
                continue
            subject = block[:marker_index]
            for width in (6, 4, 3, 2):
                remember(subject[-width:])
        for anchor in QUERY_ANCHOR_TERMS:
            if anchor in block:
                remember(anchor)
        tail = block[-14:]
        for size in (8, 7, 6, 5, 4):
            for start in range(0, len(tail) - size + 1):
                piece = tail[start : start + size]
                if any(anchor in piece for anchor in QUERY_ANCHOR_TERMS):
                    remember(piece)
                    if len(terms) >= MAX_QUERY_TERMS:
                        # Earlier uncapped steps may already have pushed the
                        # list past the limit, so cap on this exit as well.
                        return terms[:MAX_QUERY_TERMS]
        if len(block) <= 4:
            remember(block)
            continue
        for size in (4, 3, 2):
            for start in range(0, len(block) - size + 1):
                remember(block[start : start + size])
                if len(terms) >= MAX_QUERY_TERMS:
                    return terms[:MAX_QUERY_TERMS]

    return terms[:MAX_QUERY_TERMS]


def _score_knowledge_hit(
    item: dict[str, Any],
    *,
    query_terms: list[str],
    prefers_tabular_evidence: bool,
) -> int:
    """Compute a heuristic relevance score for one candidate hit.

    The score starts from the retrieval rank and is adjusted by query-term
    matches (with extra weight for title matches), by a structured-appendix
    heading near the start of the content, by structural cues in the content,
    and by a penalty for "no information" style statements.

    Args:
        item: Candidate hit carrying ``_rank``, ``title``, ``content``,
            ``excerpt`` and ``tags``.
        query_terms: Lower-cased terms extracted from the query.
        prefers_tabular_evidence: Whether the query hints at tables/standards.

    Returns:
        The integer score; higher means more relevant.
    """
    position = max(1, int(item.get("_rank") or 1))
    title_text = str(item.get("title") or item.get("document_name") or "").lower()
    body_text = str(item.get("content") or "").lower()
    excerpt_text = str(item.get("excerpt") or "").lower()
    tag_text = " ".join(str(value).lower() for value in list(item.get("tags") or [])[:5])
    # Only the first 1200 content characters take part in term matching.
    searchable = "\n".join([title_text, excerpt_text, tag_text, body_text[:1200]])

    matched = [term for term in query_terms if term in searchable]
    has_match = bool(matched)
    title_matches = [term for term in matched if term in title_text]

    total = max(1, 120 - position * 4)
    total += 8 * len(matched)
    total += 6 * len(title_matches)
    # Long, non-generic title matches earn a bonus that grows with length.
    for term in title_matches:
        if len(term) >= 4 and term not in GENERIC_TITLE_TERMS:
            total += (len(term) - 3) * 12

    appendix_heading = _leading_structured_appendix_marker(body_text)
    if appendix_heading == "# 章节导航":
        total -= 24
    elif appendix_heading == "# 重点章节摘录":
        total += 4 if has_match else -12
    elif appendix_heading == "# 问答线索补充":
        if not has_match:
            total -= 20
        elif prefers_tabular_evidence:
            total += 2
        else:
            total += 8
    elif appendix_heading == "# 结构化表格补充":
        if not has_match:
            total -= 18
        elif prefers_tabular_evidence:
            total += 16
        else:
            total += 6

    # Structural cues only count when at least one query term matched.
    if has_match:
        if prefers_tabular_evidence and ("|" in body_text or "表" in body_text):
            total += 10
        if "：" in body_text or ":" in body_text:
            total += 10
        if "\n" in body_text:
            total += 4
        if any(cue in body_text for cue in ("附表", "第", "条")):
            total += 4
        if not prefers_tabular_evidence and any(
            cue in body_text for cue in ("第", "条", "：", "-", "•")
        ):
            total += 4

    if title_text and any(term in title_text for term in query_terms):
        total += 6
    if re.search(r"没有.{0,8}(信息|规定|说明|依据)", body_text):
        total -= 12

    return total


def _leading_structured_appendix_marker(content: str) -> str:
    """Return the first configured appendix heading found near the start.

    Markers are tried in their configured order; one qualifies when it occurs
    within ``STRUCTURED_APPENDIX_LEADING_WINDOW`` characters of the
    left-stripped content. Returns ``""`` when none qualifies.
    """
    head = str(content or "").lstrip()
    return next(
        (
            marker
            for marker in STRUCTURED_APPENDIX_LEADING_MARKERS
            if 0 <= head.find(marker) <= STRUCTURED_APPENDIX_LEADING_WINDOW
        ),
        "",
    )