feat: 增强知识库功能,优化索引和RAG检索

This commit is contained in:
caoxiaozhu
2026-05-18 02:49:39 +00:00
parent 55e0591a5e
commit 4414ffb34c
18 changed files with 5656 additions and 4659 deletions

View File

@@ -856,7 +856,13 @@ class KnowledgeService:
status_payload = status_map.get(document_id) or {}
rag_status = str(status_payload.get("status") or "").strip().lower()
if bool(status_payload.get("query_ready")):
linked_run_status = self._resolve_linked_ingest_run_status(entry)
if (
linked_run_status == AgentRunStatus.FAILED.value
and rag_status in {"pending", "processing", "preprocessed"}
):
desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
elif bool(status_payload.get("query_ready")):
desired_status = KNOWLEDGE_INGEST_STATUS_INGESTED
elif rag_status in {"pending", "processing", "preprocessed"}:
desired_status = KNOWLEDGE_INGEST_STATUS_SYNCING
@@ -1007,12 +1013,22 @@ class KnowledgeService:
probe_entry = {"ingest_status_updated_at": heartbeat_at}
return not self._is_syncing_status_stale(probe_entry)
return not self._is_syncing_status_stale(entry)
def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
for entry in index["documents"]:
if entry["id"] == document_id:
return entry
return not self._is_syncing_status_stale(entry)
def _resolve_linked_ingest_run_status(self, entry: dict[str, Any]) -> str:
    """Return the status of the agent run linked to a knowledge entry.

    The run id is read from ``entry["ingest_agent_run_id"]`` and looked up
    as an ``AgentRun`` row through ``self.db``.

    Args:
        entry: Knowledge index entry (a plain dict).

    Returns:
        The run's status with surrounding whitespace removed, or ``""`` when
        the entry carries no run id, no database session is attached, or no
        matching run exists.
    """
    linked_run_id = str(entry.get("ingest_agent_run_id") or "").strip()
    # Check the id first so ``self.db`` is only touched when a lookup is
    # actually possible.
    if not linked_run_id:
        return ""
    session = self.db
    if session is None:
        return ""

    lookup = select(AgentRun).where(AgentRun.run_id == linked_run_id)
    linked_run = session.scalar(lookup)
    return "" if linked_run is None else str(linked_run.status or "").strip()
def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
    """Return the entry in ``index["documents"]`` whose ``id`` is ``document_id``.

    Args:
        index: Knowledge index mapping with a ``"documents"`` list of entries.
        document_id: Identifier to look for.

    Returns:
        The first matching entry (the same dict object held by the index).

    Raises:
        FileNotFoundError: If no entry has the requested id.
    """
    found = next(
        (candidate for candidate in index["documents"] if candidate["id"] == document_id),
        None,
    )
    if found is None:
        raise FileNotFoundError(document_id)
    return found
def _resolve_document_path(self, entry: dict[str, Any]) -> Path:

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import threading
from concurrent.futures import Future, ThreadPoolExecutor
from datetime import UTC, datetime
from time import perf_counter
@@ -18,6 +19,7 @@ from app.services.knowledge import (
from app.services.knowledge_rag import KnowledgeRagService
logger = get_logger("app.services.knowledge_index_tasks")
HEARTBEAT_INTERVAL_SECONDS = 10
class KnowledgeIndexTaskManager:
@@ -58,6 +60,15 @@ class KnowledgeIndexTaskManager:
session_factory = get_session_factory()
db = session_factory()
started = perf_counter()
heartbeat_stop = threading.Event()
heartbeat_thread: threading.Thread | None = None
tool_call_id = ""
tool_request_json = {
"agent": AgentName.HERMES.value,
"folder": folder,
"document_ids": document_ids,
"force": force,
}
try:
run_service = AgentRunService(db)
@@ -84,6 +95,44 @@ class KnowledgeIndexTaskManager:
},
},
)
tool_call = run_service.record_tool_call(
run_id=agent_run_id,
tool_type=AgentToolType.LLM.value,
tool_name="lightrag.index_documents",
request_json=tool_request_json,
response_json={"phase": "indexing"},
status="running",
duration_ms=0,
error_message=None,
)
tool_call_id = tool_call.id
def heartbeat_worker() -> None:
    """Periodically stamp a heartbeat on the agent run until told to stop.

    Every ``HEARTBEAT_INTERVAL_SECONDS`` the worker opens a fresh session
    from ``session_factory``, merges the current UTC timestamp into the
    run's route JSON and closes the session again. The loop ends as soon as
    ``heartbeat_stop`` is set. A failing update is logged and the loop
    carries on.
    """
    while True:
        # ``wait`` returns True once the stop event is set; otherwise it
        # times out after one interval and another heartbeat is due.
        if heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
            return

        session = session_factory()
        try:
            service = AgentRunService(session)
            service.merge_route_json(
                agent_run_id,
                {
                    "job_type": "knowledge_index_sync",
                    "phase": "indexing",
                    "heartbeat_at": datetime.now(UTC).isoformat(),
                },
            )
        except Exception:
            logger.exception(
                "Knowledge index heartbeat update failed run_id=%s",
                agent_run_id,
            )
        finally:
            session.close()
heartbeat_thread = threading.Thread(
target=heartbeat_worker,
name=f"knowledge-index-heartbeat-{agent_run_id}",
daemon=True,
)
heartbeat_thread.start()
response = rag_service.index_documents(document_ids=document_ids, force=force)
succeeded_document_ids = [
@@ -117,16 +166,11 @@ class KnowledgeIndexTaskManager:
duration_ms = int((perf_counter() - started) * 1000)
tool_status = "succeeded" if not failed_document_ids else "failed"
run_service.record_tool_call(
run_id=agent_run_id,
tool_type=AgentToolType.LLM.value,
tool_name="lightrag.index_documents",
request_json={
"agent": AgentName.HERMES.value,
"folder": folder,
"document_ids": document_ids,
"force": force,
},
heartbeat_stop.set()
if heartbeat_thread is not None:
heartbeat_thread.join(timeout=1)
run_service.update_tool_call(
tool_call_id,
response_json=response,
status=tool_status,
duration_ms=duration_ms,
@@ -166,22 +210,29 @@ class KnowledgeIndexTaskManager:
finished_at=datetime.now(UTC),
)
except Exception as exc:
heartbeat_stop.set()
if heartbeat_thread is not None:
heartbeat_thread.join(timeout=1)
try:
AgentRunService(db).record_tool_call(
run_id=agent_run_id,
tool_type=AgentToolType.LLM.value,
tool_name="lightrag.index_documents",
request_json={
"agent": AgentName.HERMES.value,
"folder": folder,
"document_ids": document_ids,
"force": force,
},
response_json={"error": str(exc)},
status="failed",
duration_ms=int((perf_counter() - started) * 1000),
error_message=str(exc),
)
if tool_call_id:
AgentRunService(db).update_tool_call(
tool_call_id,
response_json={"error": str(exc)},
status="failed",
duration_ms=int((perf_counter() - started) * 1000),
error_message=str(exc),
)
else:
AgentRunService(db).record_tool_call(
run_id=agent_run_id,
tool_type=AgentToolType.LLM.value,
tool_name="lightrag.index_documents",
request_json=tool_request_json,
response_json={"error": str(exc)},
status="failed",
duration_ms=int((perf_counter() - started) * 1000),
error_message=str(exc),
)
KnowledgeService(db=db).set_document_ingest_statuses(
document_ids,
KNOWLEDGE_INGEST_STATUS_FAILED,
@@ -210,6 +261,9 @@ class KnowledgeIndexTaskManager:
logger.exception("Knowledge index task finalization failed run_id=%s", agent_run_id)
logger.exception("Knowledge index task failed run_id=%s", agent_run_id)
finally:
heartbeat_stop.set()
if heartbeat_thread is not None and heartbeat_thread.is_alive():
heartbeat_thread.join(timeout=1)
db.close()

View File

@@ -83,24 +83,23 @@ class KnowledgeNormalizationService:
if rendered:
normalized_tables.append(f"## {candidate.title}\n\n{rendered}")
parts: list[str] = []
appendix_parts: list[str] = []
if section_appendix:
parts.append(section_appendix)
appendix_parts.append(section_appendix)
if answer_clue_appendix:
parts.append(answer_clue_appendix)
appendix_parts.append(answer_clue_appendix)
if normalized_tables:
appendix = "\n\n".join(normalized_tables)
parts.append(
appendix_parts.append(
"# 结构化表格补充\n\n"
"以下表格由知识归纳阶段依据原文重新整理,供问答检索时优先理解行列关系。\n\n"
f"{appendix}"
)
if not parts:
if not appendix_parts:
return normalized_text
parts.append(f"# 原文\n\n{normalized_text}")
return "\n\n".join(parts)
return "\n\n".join([normalized_text, *appendix_parts])
@staticmethod
def _extract_table_candidates(text: str) -> list[TableCandidate]:

View File

@@ -33,6 +33,7 @@ DEFAULT_LIGHTRAG_QUERY_MODE = "naive"
DEFAULT_LLM_TIMEOUT_SECONDS = 180
DEFAULT_EMBEDDING_TIMEOUT_SECONDS = 120
MAX_KNOWLEDGE_HIT_CONTENT_LENGTH = 2200
MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH = 220
MAX_QUERY_TERMS = 12
QUERY_TERM_STOPWORDS = {
"什么",
@@ -62,6 +63,13 @@ TABLE_OR_STANDARD_QUERY_HINTS = (
"档位",
"额度",
)
# Markdown heading lines that `_leading_structured_appendix_marker` looks for
# near the start of a hit's content. The tuple order matters: that helper
# returns the first entry (in this order) found inside the leading window.
# NOTE(review): these presumably mirror the appendix headings written by the
# knowledge normalization step — confirm they stay in sync.
STRUCTURED_APPENDIX_LEADING_MARKERS = (
"# 章节导航",
"# 重点章节摘录",
"# 问答线索补充",
"# 结构化表格补充",
)
# Largest character offset (after stripping leading whitespace) at which a
# marker still counts as "leading" in `_leading_structured_appendix_marker`.
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
_runtime_lock = threading.RLock()
_runtime_instance: _LightRagRuntime | None = None
@@ -830,7 +838,11 @@ class KnowledgeRagService:
document_id, document_name = _parse_document_identity(file_path)
normalized_chunk_id = chunk_id or f"path-{rank}"
normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
excerpt = _build_excerpt(normalized_content, max_length=220)
excerpt = _build_query_focused_excerpt(
normalized_content,
query_terms=query_terms,
max_length=MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH,
)
candidates.append(
{
"code": f"knowledge.{document_id or 'unknown'}.{normalized_chunk_id}",
@@ -907,8 +919,12 @@ class KnowledgeRagService:
@staticmethod
def is_query_ready_status(status_obj: Any) -> bool:
status_text = KnowledgeRagService._status_value(status_obj)
if status_text in {"failed", "error", "aborted"}:
return False
if status_text == "processed":
return True
if status_text in {"pending", "processing", "preprocessed"}:
return False
chunks_count = getattr(status_obj, "chunks_count", None)
if chunks_count is None and isinstance(status_obj, dict):
@@ -1168,6 +1184,35 @@ def _build_excerpt(text: str, *, max_length: int = 180) -> str:
return f"{normalized[: max_length - 3].rstrip()}..."
def _build_query_focused_excerpt(
    text: str,
    *,
    query_terms: list[str],
    max_length: int = 180,
) -> str:
    """Return a one-line excerpt of ``text`` positioned around the first query hit.

    Whitespace runs in ``text`` are collapsed to single spaces. Query terms
    are matched case-insensitively. When at least one term occurs, the
    excerpt is a window that starts shortly before the earliest hit and is
    marked with ``...`` on every side where text was cut off.

    Args:
        text: Source text; ``None`` or empty input yields ``""``.
        query_terms: Terms to look for; empty terms are ignored.
        max_length: Upper bound for the excerpt length, ellipses included.

    Returns:
        ``""`` for empty text, the result of ``_build_excerpt`` when no term
        matches, the whole normalized text when it already fits, otherwise
        the hit-focused window.
    """
    normalized = " ".join(str(text or "").split())
    if not normalized:
        return ""

    lowered = normalized.lower()
    # The haystack is lowercased, so the terms must be too; otherwise a
    # mixed-case term can never match. Each term is searched only once.
    hit_positions = [
        position
        for term in (query_terms or ())
        if term and (position := lowered.find(term.lower())) >= 0
    ]
    if not hit_positions:
        return _build_excerpt(normalized, max_length=max_length)
    if len(normalized) <= max_length:
        # Everything fits: cutting around the hit would only drop context.
        return normalized

    dots = "..."
    # Reserve room for an ellipsis on both sides so the final string stays
    # within max_length, matching the budget `_build_excerpt` honours.
    window = max(1, max_length - 2 * len(dots))
    start = max(0, min(hit_positions) - window // 3)
    end = min(len(normalized), start + window)
    # If the window ran into the end of the text, slide it back so the
    # available budget is still used in full.
    start = max(0, end - window)

    snippet = normalized[start:end].strip()
    prefix = dots if start > 0 else ""
    suffix = dots if end < len(normalized) else ""
    return f"{prefix}{snippet}{suffix}"
def _truncate_text(text: str, *, max_length: int) -> str:
normalized = str(text or "").strip()
if len(normalized) <= max_length:
@@ -1243,19 +1288,43 @@ def _score_knowledge_hit(
score += len(matched_terms) * 8
score += sum(1 for term in matched_terms if term in title) * 6
if "结构化表格补充" in content:
score += 18
if "问答线索补充" in content:
score += 16 if not prefers_tabular_evidence else 8
if "重点章节摘录" in content:
leading_appendix_marker = _leading_structured_appendix_marker(content)
if leading_appendix_marker == "# 章节导航":
score -= 24
elif leading_appendix_marker == "# 重点章节摘录":
score += 4 if matched_terms else -12
elif leading_appendix_marker == "# 问答线索补充":
score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
elif leading_appendix_marker == "# 结构化表格补充":
if prefers_tabular_evidence and matched_terms:
score += 16
elif matched_terms:
score += 6
else:
score -= 18
if prefers_tabular_evidence and matched_terms and ("|" in content or "" in content):
score += 10
if "章节导航" in content:
if matched_terms and any(marker in content for marker in ("", ":")):
score += 10
if matched_terms and "\n" in content:
score += 4
if prefers_tabular_evidence and ("|" in content or "" in content or "结构化表格补充" in content):
score += 12
if not prefers_tabular_evidence and any(marker in content for marker in ("", "", "", "-", "")):
if matched_terms and any(marker in content for marker in ("", "", "")):
score += 4
if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("", "", "", "-", "")):
score += 4
if title and any(term in title for term in query_terms):
score += 6
if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
score -= 12
return score
def _leading_structured_appendix_marker(content: str) -> str:
    """Return the structured-appendix heading found near the start of ``content``.

    Leading whitespace is ignored. A marker qualifies when its first
    occurrence lies at an offset of at most
    ``STRUCTURED_APPENDIX_LEADING_WINDOW``. Markers are tried in the order
    of ``STRUCTURED_APPENDIX_LEADING_MARKERS``, so when several qualify the
    one listed first in that tuple wins, not the one appearing earliest in
    the text.

    Returns:
        The matching marker string, or ``""`` when none qualifies.
    """
    stripped = str(content or "").lstrip()
    return next(
        (
            heading
            for heading in STRUCTURED_APPENDIX_LEADING_MARKERS
            if 0 <= stripped.find(heading) <= STRUCTURED_APPENDIX_LEADING_WINDOW
        ),
        "",
    )