feat: 增强知识库功能，优化索引和RAG检索

2026-05-18 02:49:39 +00:00
parent 55e0591a5e
commit 4414ffb34c
18 changed files with 5656 additions and 4659 deletions
--- a/server/src/app/services/knowledge_rag.py
+++ b/server/src/app/services/knowledge_rag.py
@@ -33,6 +33,7 @@ DEFAULT_LIGHTRAG_QUERY_MODE = "naive"
 DEFAULT_LLM_TIMEOUT_SECONDS = 180
 DEFAULT_EMBEDDING_TIMEOUT_SECONDS = 120
 MAX_KNOWLEDGE_HIT_CONTENT_LENGTH = 2200
+MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH = 220
 MAX_QUERY_TERMS = 12
 QUERY_TERM_STOPWORDS = {
    "什么",
@@ -62,6 +63,13 @@ TABLE_OR_STANDARD_QUERY_HINTS = (
    "档位",
    "额度",
 )
+# Appendix headings that `_leading_structured_appendix_marker` searches for.
+# They are checked in this order and the first one found inside the leading
+# window is the one reported.
+STRUCTURED_APPENDIX_LEADING_MARKERS = (
+    "# 章节导航",  # "Chapter navigation"
+    "# 重点章节摘录",  # "Key chapter excerpts"
+    "# 问答线索补充",  # "Q&A hints supplement"
+    "# 结构化表格补充",  # "Structured table supplement"
+)
+# Largest character offset (measured after leading whitespace is stripped) at
+# which one of the headings above still counts as "leading" the content.
+STRUCTURED_APPENDIX_LEADING_WINDOW = 220

 _runtime_lock = threading.RLock()
 _runtime_instance: _LightRagRuntime | None = None
@@ -830,7 +838,11 @@ class KnowledgeRagService:
            document_id, document_name = _parse_document_identity(file_path)
            normalized_chunk_id = chunk_id or f"path-{rank}"
            normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
-            excerpt = _build_excerpt(normalized_content, max_length=220)
+            excerpt = _build_query_focused_excerpt(
+                normalized_content,
+                query_terms=query_terms,
+                max_length=MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH,
+            )
            candidates.append(
                {
                    "code": f"knowledge.{document_id or 'unknown'}.{normalized_chunk_id}",
@@ -907,8 +919,12 @@ class KnowledgeRagService:
    @staticmethod
    def is_query_ready_status(status_obj: Any) -> bool:
        status_text = KnowledgeRagService._status_value(status_obj)
+        if status_text in {"failed", "error", "aborted"}:
+            return False
        if status_text == "processed":
            return True
+        if status_text in {"pending", "processing", "preprocessed"}:
+            return False

        chunks_count = getattr(status_obj, "chunks_count", None)
        if chunks_count is None and isinstance(status_obj, dict):
@@ -1168,6 +1184,35 @@ def _build_excerpt(text: str, *, max_length: int = 180) -> str:
    return f"{normalized[: max_length - 3].rstrip()}..."


+def _build_query_focused_excerpt(
+    text: str,
+    *,
+    query_terms: list[str],
+    max_length: int = 180,
+) -> str:
+    """Return a short excerpt of ``text`` positioned around the earliest query hit.
+
+    Runs of whitespace are collapsed to single spaces before anything else.
+    Terms are matched case-insensitively. When no term occurs in the text the
+    work is delegated to ``_build_excerpt``. When a term does occur and the
+    whole text already fits in ``max_length`` it is returned unchanged;
+    otherwise a window starting roughly one third of the budget before the
+    earliest hit is cut out and marked with ``...`` on each truncated side.
+
+    Args:
+        text: Source text; ``None`` and other falsy values yield ``""``.
+        query_terms: Terms to look for; empty entries are ignored.
+        max_length: Upper bound for the returned excerpt, ellipses included
+            (honoured for budgets of at least 7 characters).
+
+    Returns:
+        The excerpt, or ``""`` when ``text`` is empty after normalisation.
+    """
+    normalized = " ".join(str(text or "").split())
+    if not normalized:
+        return ""
+
+    # NOTE: str.lower() may change the length of a few characters, so offsets
+    # found in ``lowered`` are approximate for such text.
+    lowered = normalized.lower()
+    match_positions = []
+    for term in query_terms or ():
+        if not term:
+            continue
+        # Lower-case the term too; the haystack has already been lower-cased.
+        position = lowered.find(term.lower())
+        if position >= 0:
+            match_positions.append(position)
+    if not match_positions:
+        return _build_excerpt(normalized, max_length=max_length)
+
+    total = len(normalized)
+    if total <= max_length:
+        # Everything fits: trimming the head would only throw context away.
+        return normalized
+
+    ellipsis = "..."
+    # Reserve room for both ellipses so the result stays within max_length.
+    window = max(1, max_length - 2 * len(ellipsis))
+    start = max(0, min(match_positions) - window // 3)
+    end = min(total, start + window)
+    # If the window was clipped by the end of the text, spend the unused
+    # budget on extra leading context instead of returning a short excerpt.
+    start = max(0, end - window)
+
+    snippet = normalized[start:end].strip()
+    if start > 0:
+        snippet = f"{ellipsis}{snippet}"
+    if end < total:
+        snippet = f"{snippet}{ellipsis}"
+    return snippet
+
+
 def _truncate_text(text: str, *, max_length: int) -> str:
    normalized = str(text or "").strip()
    if len(normalized) <= max_length:
@@ -1243,19 +1288,43 @@ def _score_knowledge_hit(
    score += len(matched_terms) * 8
    score += sum(1 for term in matched_terms if term in title) * 6

-    if "结构化表格补充" in content:
-        score += 18
-    if "问答线索补充" in content:
-        score += 16 if not prefers_tabular_evidence else 8
-    if "重点章节摘录" in content:
+    leading_appendix_marker = _leading_structured_appendix_marker(content)
+    if leading_appendix_marker == "# 章节导航":
+        score -= 24
+    elif leading_appendix_marker == "# 重点章节摘录":
+        score += 4 if matched_terms else -12
+    elif leading_appendix_marker == "# 问答线索补充":
+        score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
+    elif leading_appendix_marker == "# 结构化表格补充":
+        if prefers_tabular_evidence and matched_terms:
+            score += 16
+        elif matched_terms:
+            score += 6
+        else:
+            score -= 18
+
+    if prefers_tabular_evidence and matched_terms and ("|" in content or "表" in content):
        score += 10
-    if "章节导航" in content:
+    if matched_terms and any(marker in content for marker in ("：", ":")):
+        score += 10
+    if matched_terms and "\n" in content:
        score += 4
-    if prefers_tabular_evidence and ("|" in content or "表" in content or "结构化表格补充" in content):
-        score += 12
-    if not prefers_tabular_evidence and any(marker in content for marker in ("第", "条", "：", "-", "•")):
+    if matched_terms and any(marker in content for marker in ("附表", "第", "条")):
+        score += 4
+    if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("第", "条", "：", "-", "•")):
        score += 4
    if title and any(term in title for term in query_terms):
        score += 6
+    if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
+        score -= 12

    return score
+
+
+def _leading_structured_appendix_marker(content: str) -> str:
+    """Return the structured-appendix heading that leads ``content``, if any.
+
+    Leading whitespace is ignored. Headings are tried in the order given by
+    ``STRUCTURED_APPENDIX_LEADING_MARKERS``; the first one whose first
+    occurrence lies at an offset no greater than
+    ``STRUCTURED_APPENDIX_LEADING_WINDOW`` is returned. An empty string is
+    returned when none qualifies.
+    """
+    body = str(content or "").lstrip()
+    return next(
+        (
+            heading
+            for heading in STRUCTURED_APPENDIX_LEADING_MARKERS
+            if 0 <= body.find(heading) <= STRUCTURED_APPENDIX_LEADING_WINDOW
+        ),
+        "",
+    )