feat: 增强知识库索引与设置页面模块化拆分

扩展知识库索引任务和 RAG 检索支持增量入库和文档去重,
优化本体检测和规则匹配精度,
前端设置页面拆分为 LLM、邮件和 Hermes 员工同步子面板并重构样式,
新增日志详情组件和知识入库日志模型,补充单元测试覆盖。
This commit is contained in:
caoxiaozhu
2026-05-22 23:47:28 +08:00
parent 88ff04bef8
commit 5b388d08c0
84 changed files with 10170 additions and 2599 deletions

View File

@@ -12,24 +12,15 @@ from sqlalchemy.orm import Session
from app.core.config import get_settings
from app.core.logging import get_logger
from app.db.session import get_session_factory
from app.services.knowledge_ingest_log import (
build_document_graph_summary,
build_ingest_document_summary,
build_ingest_status_summary,
)
from app.services.knowledge_rag_runtime import (
DEFAULT_EMBEDDING_TIMEOUT_SECONDS,
DEFAULT_LIGHTRAG_QUERY_MODE,
DEFAULT_LLM_TIMEOUT_SECONDS,
KnowledgeRagError,
RuntimeModelConfig,
_LightRagRuntime,
_build_ali_rerank_request,
_build_azure_deployment_base,
_build_headers,
_ensure_path,
_extract_chat_text,
_extract_embedding_vectors,
_extract_error_message,
_extract_rerank_results,
_normalize_endpoint,
_parse_json_body,
_send_json_request,
)
from app.services.settings import SettingsService
@@ -76,11 +67,9 @@ STRUCTURED_APPENDIX_LEADING_MARKERS = (
"# 结构化表格补充",
)
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
_runtime_lock = threading.RLock()
_runtime_instance: _LightRagRuntime | None = None
_runtime_signature: tuple[Any, ...] | None = None
_runtime_instances: dict[int, _LightRagRuntime] = {}
_runtime_signatures: dict[int, tuple[Any, ...]] = {}
class KnowledgeRagService:
@@ -147,7 +136,11 @@ class KnowledgeRagService:
"query": normalized_query,
"record_count": len(hits),
"hits": hits,
"references": [str(item.get("code") or "").strip() for item in hits if str(item.get("code") or "").strip()],
"references": [
str(item.get("code") or "").strip()
for item in hits
if str(item.get("code") or "").strip()
],
"raw_references": references,
"metadata": raw.get("metadata") if isinstance(raw, dict) else {},
"message": f"已从知识库中检索到 {len(hits)} 条相关内容。",
@@ -172,6 +165,7 @@ class KnowledgeRagService:
)
texts: list[str] = []
file_paths: list[str] = []
document_summaries: list[dict[str, Any]] = []
runtime = self._get_runtime()
existing_statuses = runtime.get_document_statuses(normalized_ids)
@@ -182,12 +176,29 @@ class KnowledgeRagService:
try:
runtime.delete_document(document_id)
except Exception as exc:
logger.warning("Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc)
logger.warning(
"Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc
)
text = knowledge_service.extract_document_text(document_id)
raw_text = text
if normalization_service is not None:
text = normalization_service.build_enriched_text(text)
texts.append(text)
file_paths.append(str((knowledge_service.library_root / entry["folder"] / entry["stored_name"]).resolve()))
file_paths.append(
str(
(
knowledge_service.library_root / entry["folder"] / entry["stored_name"]
).resolve()
)
)
document_summaries.append(
build_ingest_document_summary(
document_id=document_id,
entry=entry,
raw_text=raw_text,
indexed_text=text,
)
)
track_id = runtime.insert_documents(
texts=texts,
@@ -198,10 +209,32 @@ class KnowledgeRagService:
statuses = runtime.get_document_statuses(normalized_ids)
succeeded_document_ids: list[str] = []
failed_documents: list[dict[str, str]] = []
summary_by_id = {
str(item.get("document_id") or "").strip(): item
for item in document_summaries
if str(item.get("document_id") or "").strip()
}
for document_id in normalized_ids:
status_obj = statuses.get(document_id)
status_text = self._status_value(status_obj)
status_payload = self._serialize_status(status_obj)
workspace = (
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
or DEFAULT_LIGHTRAG_WORKSPACE
)
graph_summary = build_document_graph_summary(
self.storage_root,
workspace=workspace,
document_id=document_id,
)
if document_id in summary_by_id:
summary_by_id[document_id].update(
build_ingest_status_summary(
status_payload=status_payload,
graph_summary=graph_summary,
)
)
if self.is_query_ready_status(status_obj):
succeeded_document_ids.append(document_id)
continue
@@ -218,13 +251,18 @@ class KnowledgeRagService:
"requested_document_ids": normalized_ids,
"succeeded_document_ids": succeeded_document_ids,
"failed_documents": failed_documents,
"document_summaries": [
summary_by_id.get(document_id, {}) for document_id in normalized_ids
],
"status_snapshot": {
document_id: self._serialize_status(status_obj)
for document_id, status_obj in statuses.items()
},
}
def get_document_status_map(self, document_ids: list[str] | None = None) -> dict[str, dict[str, Any]]:
def get_document_status_map(
self, document_ids: list[str] | None = None
) -> dict[str, dict[str, Any]]:
target_ids = [str(item).strip() for item in document_ids or [] if str(item).strip()]
if not target_ids:
return {}
@@ -248,28 +286,32 @@ class KnowledgeRagService:
logger.warning("Delete LightRAG document ignored doc_id=%s: %s", normalized_id, exc)
def _get_runtime(self) -> _LightRagRuntime:
global _runtime_instance, _runtime_signature
signature, runtime_kwargs = self._build_runtime_signature()
thread_id = threading.get_ident()
with _runtime_lock:
if _runtime_instance is not None and _runtime_signature == signature:
return _runtime_instance
runtime = _runtime_instances.get(thread_id)
if runtime is not None and _runtime_signatures.get(thread_id) == signature:
return runtime
if _runtime_instance is not None:
if runtime is not None:
try:
_runtime_instance.finalize()
runtime.finalize()
except Exception as exc: # pragma: no cover - best effort cleanup
logger.warning("Finalize previous LightRAG runtime failed: %s", exc)
_runtime_instance = _LightRagRuntime(**runtime_kwargs)
_runtime_signature = signature
return _runtime_instance
runtime = _LightRagRuntime(**runtime_kwargs)
_runtime_instances[thread_id] = runtime
_runtime_signatures[thread_id] = signature
return runtime
def _build_runtime_signature(self) -> tuple[tuple[Any, ...], dict[str, Any]]:
configs = self._load_runtime_configs()
settings = get_settings()
working_dir = (self.storage_root / "knowledge" / ".lightrag").resolve()
workspace = os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip() or DEFAULT_LIGHTRAG_WORKSPACE
workspace = (
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
or DEFAULT_LIGHTRAG_WORKSPACE
)
qdrant_url = os.environ.get("QDRANT_URL", "").strip() or _resolve_default_qdrant_url()
qdrant_api_key = os.environ.get("QDRANT_API_KEY", "").strip()
@@ -318,7 +360,9 @@ class KnowledgeRagService:
try:
settings_service = SettingsService(session)
main = self._normalize_runtime_model(settings_service.get_runtime_model_config("main"))
embedding = self._normalize_runtime_model(settings_service.get_runtime_model_config("embedding"))
embedding = self._normalize_runtime_model(
settings_service.get_runtime_model_config("embedding")
)
try:
backup_raw = settings_service.get_runtime_model_config("backup")
backup = self._normalize_runtime_model(backup_raw)
@@ -405,7 +449,9 @@ class KnowledgeRagService:
document_id, document_name = _parse_document_identity(file_path)
normalized_chunk_id = chunk_id or f"path-{rank}"
normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
normalized_content = _truncate_text(
content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH
)
excerpt = _build_query_focused_excerpt(
normalized_content,
query_terms=query_terms,
@@ -510,17 +556,14 @@ class KnowledgeRagService:
def shutdown_knowledge_rag_runtime() -> None:
global _runtime_instance, _runtime_signature
with _runtime_lock:
if _runtime_instance is None:
return
try:
_runtime_instance.finalize()
except Exception as exc: # pragma: no cover - best effort cleanup
logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
_runtime_instance = None
_runtime_signature = None
for runtime in list(_runtime_instances.values()):
try:
runtime.finalize()
except Exception as exc: # pragma: no cover - best effort cleanup
logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
_runtime_instances.clear()
_runtime_signatures.clear()
def _parse_document_identity(file_path: str) -> tuple[str, str]:
@@ -551,9 +594,7 @@ def _build_query_focused_excerpt(
lowered = normalized.lower()
match_positions = [
lowered.find(term)
for term in query_terms
if term and lowered.find(term) >= 0
lowered.find(term) for term in query_terms if term and lowered.find(term) >= 0
]
if not match_positions:
return _build_excerpt(normalized, max_length=max_length)
@@ -649,7 +690,9 @@ def _score_knowledge_hit(
elif leading_appendix_marker == "# 重点章节摘录":
score += 4 if matched_terms else -12
elif leading_appendix_marker == "# 问答线索补充":
score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
score += (
8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
)
elif leading_appendix_marker == "# 结构化表格补充":
if prefers_tabular_evidence and matched_terms:
score += 16
@@ -666,7 +709,11 @@ def _score_knowledge_hit(
score += 4
if matched_terms and any(marker in content for marker in ("附表", "", "")):
score += 4
if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("", "", "", "-", "")):
if (
not prefers_tabular_evidence
and matched_terms
and any(marker in content for marker in ("", "", "", "-", ""))
):
score += 4
if title and any(term in title for term in query_terms):
score += 6