feat: 增强知识库功能,优化索引和RAG检索
This commit is contained in:
@@ -856,7 +856,13 @@ class KnowledgeService:
|
||||
|
||||
status_payload = status_map.get(document_id) or {}
|
||||
rag_status = str(status_payload.get("status") or "").strip().lower()
|
||||
if bool(status_payload.get("query_ready")):
|
||||
linked_run_status = self._resolve_linked_ingest_run_status(entry)
|
||||
if (
|
||||
linked_run_status == AgentRunStatus.FAILED.value
|
||||
and rag_status in {"pending", "processing", "preprocessed"}
|
||||
):
|
||||
desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
|
||||
elif bool(status_payload.get("query_ready")):
|
||||
desired_status = KNOWLEDGE_INGEST_STATUS_INGESTED
|
||||
elif rag_status in {"pending", "processing", "preprocessed"}:
|
||||
desired_status = KNOWLEDGE_INGEST_STATUS_SYNCING
|
||||
@@ -1007,12 +1013,22 @@ class KnowledgeService:
|
||||
probe_entry = {"ingest_status_updated_at": heartbeat_at}
|
||||
return not self._is_syncing_status_stale(probe_entry)
|
||||
|
||||
return not self._is_syncing_status_stale(entry)
|
||||
|
||||
def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
|
||||
for entry in index["documents"]:
|
||||
if entry["id"] == document_id:
|
||||
return entry
|
||||
return not self._is_syncing_status_stale(entry)
|
||||
|
||||
def _resolve_linked_ingest_run_status(self, entry: dict[str, Any]) -> str:
|
||||
agent_run_id = str(entry.get("ingest_agent_run_id") or "").strip()
|
||||
if not agent_run_id or self.db is None:
|
||||
return ""
|
||||
|
||||
run = self.db.scalar(select(AgentRun).where(AgentRun.run_id == agent_run_id))
|
||||
if run is None:
|
||||
return ""
|
||||
return str(run.status or "").strip()
|
||||
|
||||
def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
|
||||
for entry in index["documents"]:
|
||||
if entry["id"] == document_id:
|
||||
return entry
|
||||
raise FileNotFoundError(document_id)
|
||||
|
||||
def _resolve_document_path(self, entry: dict[str, Any]) -> Path:
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import threading
|
||||
from concurrent.futures import Future, ThreadPoolExecutor
|
||||
from datetime import UTC, datetime
|
||||
from time import perf_counter
|
||||
@@ -18,6 +19,7 @@ from app.services.knowledge import (
|
||||
from app.services.knowledge_rag import KnowledgeRagService
|
||||
|
||||
logger = get_logger("app.services.knowledge_index_tasks")
|
||||
HEARTBEAT_INTERVAL_SECONDS = 10
|
||||
|
||||
|
||||
class KnowledgeIndexTaskManager:
|
||||
@@ -58,6 +60,15 @@ class KnowledgeIndexTaskManager:
|
||||
session_factory = get_session_factory()
|
||||
db = session_factory()
|
||||
started = perf_counter()
|
||||
heartbeat_stop = threading.Event()
|
||||
heartbeat_thread: threading.Thread | None = None
|
||||
tool_call_id = ""
|
||||
tool_request_json = {
|
||||
"agent": AgentName.HERMES.value,
|
||||
"folder": folder,
|
||||
"document_ids": document_ids,
|
||||
"force": force,
|
||||
}
|
||||
|
||||
try:
|
||||
run_service = AgentRunService(db)
|
||||
@@ -84,6 +95,44 @@ class KnowledgeIndexTaskManager:
|
||||
},
|
||||
},
|
||||
)
|
||||
tool_call = run_service.record_tool_call(
|
||||
run_id=agent_run_id,
|
||||
tool_type=AgentToolType.LLM.value,
|
||||
tool_name="lightrag.index_documents",
|
||||
request_json=tool_request_json,
|
||||
response_json={"phase": "indexing"},
|
||||
status="running",
|
||||
duration_ms=0,
|
||||
error_message=None,
|
||||
)
|
||||
tool_call_id = tool_call.id
|
||||
|
||||
def heartbeat_worker() -> None:
    """Periodically stamp a heartbeat on the agent run until told to stop.

    Every ``HEARTBEAT_INTERVAL_SECONDS`` the worker opens its own session,
    merges a ``heartbeat_at`` timestamp into the run's route JSON, and
    closes the session again. A failed update is logged and the loop
    carries on; the worker returns once ``heartbeat_stop`` is set.
    """
    while True:
        # Event.wait() returns True as soon as the stop flag is set and
        # False when the interval elapses without it being set.
        if heartbeat_stop.wait(HEARTBEAT_INTERVAL_SECONDS):
            return

        # A dedicated session per tick; presumably this keeps the worker
        # off the session used by the indexing thread (verify with caller).
        tick_session = session_factory()
        try:
            tick_service = AgentRunService(tick_session)
            route_patch = {
                "job_type": "knowledge_index_sync",
                "phase": "indexing",
                "heartbeat_at": datetime.now(UTC).isoformat(),
            }
            tick_service.merge_route_json(agent_run_id, route_patch)
        except Exception:
            # Best effort: one failed heartbeat must not end the loop.
            logger.exception(
                "Knowledge index heartbeat update failed run_id=%s",
                agent_run_id,
            )
        finally:
            tick_session.close()
|
||||
|
||||
heartbeat_thread = threading.Thread(
|
||||
target=heartbeat_worker,
|
||||
name=f"knowledge-index-heartbeat-{agent_run_id}",
|
||||
daemon=True,
|
||||
)
|
||||
heartbeat_thread.start()
|
||||
|
||||
response = rag_service.index_documents(document_ids=document_ids, force=force)
|
||||
succeeded_document_ids = [
|
||||
@@ -117,16 +166,11 @@ class KnowledgeIndexTaskManager:
|
||||
|
||||
duration_ms = int((perf_counter() - started) * 1000)
|
||||
tool_status = "succeeded" if not failed_document_ids else "failed"
|
||||
run_service.record_tool_call(
|
||||
run_id=agent_run_id,
|
||||
tool_type=AgentToolType.LLM.value,
|
||||
tool_name="lightrag.index_documents",
|
||||
request_json={
|
||||
"agent": AgentName.HERMES.value,
|
||||
"folder": folder,
|
||||
"document_ids": document_ids,
|
||||
"force": force,
|
||||
},
|
||||
heartbeat_stop.set()
|
||||
if heartbeat_thread is not None:
|
||||
heartbeat_thread.join(timeout=1)
|
||||
run_service.update_tool_call(
|
||||
tool_call_id,
|
||||
response_json=response,
|
||||
status=tool_status,
|
||||
duration_ms=duration_ms,
|
||||
@@ -166,22 +210,29 @@ class KnowledgeIndexTaskManager:
|
||||
finished_at=datetime.now(UTC),
|
||||
)
|
||||
except Exception as exc:
|
||||
heartbeat_stop.set()
|
||||
if heartbeat_thread is not None:
|
||||
heartbeat_thread.join(timeout=1)
|
||||
try:
|
||||
AgentRunService(db).record_tool_call(
|
||||
run_id=agent_run_id,
|
||||
tool_type=AgentToolType.LLM.value,
|
||||
tool_name="lightrag.index_documents",
|
||||
request_json={
|
||||
"agent": AgentName.HERMES.value,
|
||||
"folder": folder,
|
||||
"document_ids": document_ids,
|
||||
"force": force,
|
||||
},
|
||||
response_json={"error": str(exc)},
|
||||
status="failed",
|
||||
duration_ms=int((perf_counter() - started) * 1000),
|
||||
error_message=str(exc),
|
||||
)
|
||||
if tool_call_id:
|
||||
AgentRunService(db).update_tool_call(
|
||||
tool_call_id,
|
||||
response_json={"error": str(exc)},
|
||||
status="failed",
|
||||
duration_ms=int((perf_counter() - started) * 1000),
|
||||
error_message=str(exc),
|
||||
)
|
||||
else:
|
||||
AgentRunService(db).record_tool_call(
|
||||
run_id=agent_run_id,
|
||||
tool_type=AgentToolType.LLM.value,
|
||||
tool_name="lightrag.index_documents",
|
||||
request_json=tool_request_json,
|
||||
response_json={"error": str(exc)},
|
||||
status="failed",
|
||||
duration_ms=int((perf_counter() - started) * 1000),
|
||||
error_message=str(exc),
|
||||
)
|
||||
KnowledgeService(db=db).set_document_ingest_statuses(
|
||||
document_ids,
|
||||
KNOWLEDGE_INGEST_STATUS_FAILED,
|
||||
@@ -210,6 +261,9 @@ class KnowledgeIndexTaskManager:
|
||||
logger.exception("Knowledge index task finalization failed run_id=%s", agent_run_id)
|
||||
logger.exception("Knowledge index task failed run_id=%s", agent_run_id)
|
||||
finally:
|
||||
heartbeat_stop.set()
|
||||
if heartbeat_thread is not None and heartbeat_thread.is_alive():
|
||||
heartbeat_thread.join(timeout=1)
|
||||
db.close()
|
||||
|
||||
|
||||
|
||||
@@ -83,24 +83,23 @@ class KnowledgeNormalizationService:
|
||||
if rendered:
|
||||
normalized_tables.append(f"## {candidate.title}\n\n{rendered}")
|
||||
|
||||
parts: list[str] = []
|
||||
appendix_parts: list[str] = []
|
||||
if section_appendix:
|
||||
parts.append(section_appendix)
|
||||
appendix_parts.append(section_appendix)
|
||||
if answer_clue_appendix:
|
||||
parts.append(answer_clue_appendix)
|
||||
appendix_parts.append(answer_clue_appendix)
|
||||
if normalized_tables:
|
||||
appendix = "\n\n".join(normalized_tables)
|
||||
parts.append(
|
||||
appendix_parts.append(
|
||||
"# 结构化表格补充\n\n"
|
||||
"以下表格由知识归纳阶段依据原文重新整理,供问答检索时优先理解行列关系。\n\n"
|
||||
f"{appendix}"
|
||||
)
|
||||
|
||||
if not parts:
|
||||
if not appendix_parts:
|
||||
return normalized_text
|
||||
|
||||
parts.append(f"# 原文\n\n{normalized_text}")
|
||||
return "\n\n".join(parts)
|
||||
return "\n\n".join([normalized_text, *appendix_parts])
|
||||
|
||||
@staticmethod
|
||||
def _extract_table_candidates(text: str) -> list[TableCandidate]:
|
||||
|
||||
@@ -33,6 +33,7 @@ DEFAULT_LIGHTRAG_QUERY_MODE = "naive"
|
||||
DEFAULT_LLM_TIMEOUT_SECONDS = 180
|
||||
DEFAULT_EMBEDDING_TIMEOUT_SECONDS = 120
|
||||
MAX_KNOWLEDGE_HIT_CONTENT_LENGTH = 2200
|
||||
MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH = 220
|
||||
MAX_QUERY_TERMS = 12
|
||||
QUERY_TERM_STOPWORDS = {
|
||||
"什么",
|
||||
@@ -62,6 +63,13 @@ TABLE_OR_STANDARD_QUERY_HINTS = (
|
||||
"档位",
|
||||
"额度",
|
||||
)
|
||||
STRUCTURED_APPENDIX_LEADING_MARKERS = (
|
||||
"# 章节导航",
|
||||
"# 重点章节摘录",
|
||||
"# 问答线索补充",
|
||||
"# 结构化表格补充",
|
||||
)
|
||||
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
|
||||
|
||||
_runtime_lock = threading.RLock()
|
||||
_runtime_instance: _LightRagRuntime | None = None
|
||||
@@ -830,7 +838,11 @@ class KnowledgeRagService:
|
||||
document_id, document_name = _parse_document_identity(file_path)
|
||||
normalized_chunk_id = chunk_id or f"path-{rank}"
|
||||
normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
|
||||
excerpt = _build_excerpt(normalized_content, max_length=220)
|
||||
excerpt = _build_query_focused_excerpt(
|
||||
normalized_content,
|
||||
query_terms=query_terms,
|
||||
max_length=MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH,
|
||||
)
|
||||
candidates.append(
|
||||
{
|
||||
"code": f"knowledge.{document_id or 'unknown'}.{normalized_chunk_id}",
|
||||
@@ -907,8 +919,12 @@ class KnowledgeRagService:
|
||||
@staticmethod
|
||||
def is_query_ready_status(status_obj: Any) -> bool:
|
||||
status_text = KnowledgeRagService._status_value(status_obj)
|
||||
if status_text in {"failed", "error", "aborted"}:
|
||||
return False
|
||||
if status_text == "processed":
|
||||
return True
|
||||
if status_text in {"pending", "processing", "preprocessed"}:
|
||||
return False
|
||||
|
||||
chunks_count = getattr(status_obj, "chunks_count", None)
|
||||
if chunks_count is None and isinstance(status_obj, dict):
|
||||
@@ -1168,6 +1184,35 @@ def _build_excerpt(text: str, *, max_length: int = 180) -> str:
|
||||
return f"{normalized[: max_length - 3].rstrip()}..."
|
||||
|
||||
|
||||
def _build_query_focused_excerpt(
|
||||
text: str,
|
||||
*,
|
||||
query_terms: list[str],
|
||||
max_length: int = 180,
|
||||
) -> str:
|
||||
normalized = " ".join(str(text or "").split()).strip()
|
||||
if not normalized:
|
||||
return ""
|
||||
|
||||
lowered = normalized.lower()
|
||||
match_positions = [
|
||||
lowered.find(term)
|
||||
for term in query_terms
|
||||
if term and lowered.find(term) >= 0
|
||||
]
|
||||
if not match_positions:
|
||||
return _build_excerpt(normalized, max_length=max_length)
|
||||
|
||||
start = max(0, min(match_positions) - max_length // 3)
|
||||
end = min(len(normalized), start + max_length)
|
||||
snippet = normalized[start:end].strip()
|
||||
if start > 0:
|
||||
snippet = f"...{snippet.lstrip()}"
|
||||
if end < len(normalized):
|
||||
snippet = f"{snippet.rstrip()}..."
|
||||
return snippet
|
||||
|
||||
|
||||
def _truncate_text(text: str, *, max_length: int) -> str:
|
||||
normalized = str(text or "").strip()
|
||||
if len(normalized) <= max_length:
|
||||
@@ -1243,19 +1288,43 @@ def _score_knowledge_hit(
|
||||
score += len(matched_terms) * 8
|
||||
score += sum(1 for term in matched_terms if term in title) * 6
|
||||
|
||||
if "结构化表格补充" in content:
|
||||
score += 18
|
||||
if "问答线索补充" in content:
|
||||
score += 16 if not prefers_tabular_evidence else 8
|
||||
if "重点章节摘录" in content:
|
||||
leading_appendix_marker = _leading_structured_appendix_marker(content)
|
||||
if leading_appendix_marker == "# 章节导航":
|
||||
score -= 24
|
||||
elif leading_appendix_marker == "# 重点章节摘录":
|
||||
score += 4 if matched_terms else -12
|
||||
elif leading_appendix_marker == "# 问答线索补充":
|
||||
score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
|
||||
elif leading_appendix_marker == "# 结构化表格补充":
|
||||
if prefers_tabular_evidence and matched_terms:
|
||||
score += 16
|
||||
elif matched_terms:
|
||||
score += 6
|
||||
else:
|
||||
score -= 18
|
||||
|
||||
if prefers_tabular_evidence and matched_terms and ("|" in content or "表" in content):
|
||||
score += 10
|
||||
if "章节导航" in content:
|
||||
if matched_terms and any(marker in content for marker in (":", ":")):
|
||||
score += 10
|
||||
if matched_terms and "\n" in content:
|
||||
score += 4
|
||||
if prefers_tabular_evidence and ("|" in content or "表" in content or "结构化表格补充" in content):
|
||||
score += 12
|
||||
if not prefers_tabular_evidence and any(marker in content for marker in ("第", "条", ":", "-", "•")):
|
||||
if matched_terms and any(marker in content for marker in ("附表", "第", "条")):
|
||||
score += 4
|
||||
if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("第", "条", ":", "-", "•")):
|
||||
score += 4
|
||||
if title and any(term in title for term in query_terms):
|
||||
score += 6
|
||||
if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
|
||||
score -= 12
|
||||
|
||||
return score
|
||||
|
||||
|
||||
def _leading_structured_appendix_marker(content: str) -> str:
    """Return the structured-appendix heading found near the start of ``content``.

    Leading whitespace is dropped, then each heading in
    ``STRUCTURED_APPENDIX_LEADING_MARKERS`` is tried in tuple order. The
    first heading whose first occurrence lies at an offset no greater than
    ``STRUCTURED_APPENDIX_LEADING_WINDOW`` is returned.

    Returns:
        The matching heading string, or ``""`` when none is found that
        close to the start.
    """
    head = str(content or "").lstrip()
    return next(
        (
            candidate
            for candidate in STRUCTURED_APPENDIX_LEADING_MARKERS
            if 0 <= head.find(candidate) <= STRUCTURED_APPENDIX_LEADING_WINDOW
        ),
        "",
    )
|
||||
|
||||
Reference in New Issue
Block a user