feat: 增强知识库索引与设置页面模块化拆分
扩展知识库索引任务和 RAG 检索,支持增量入库和文档去重;优化本体检测和规则匹配精度;前端设置页面拆分为 LLM、邮件和 Hermes 员工同步子面板并重构样式;新增日志详情组件和知识入库日志模型;补充单元测试覆盖。
This commit is contained in:
@@ -12,24 +12,15 @@ from sqlalchemy.orm import Session
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging import get_logger
|
||||
from app.db.session import get_session_factory
|
||||
from app.services.knowledge_ingest_log import (
|
||||
build_document_graph_summary,
|
||||
build_ingest_document_summary,
|
||||
build_ingest_status_summary,
|
||||
)
|
||||
from app.services.knowledge_rag_runtime import (
|
||||
DEFAULT_EMBEDDING_TIMEOUT_SECONDS,
|
||||
DEFAULT_LIGHTRAG_QUERY_MODE,
|
||||
DEFAULT_LLM_TIMEOUT_SECONDS,
|
||||
KnowledgeRagError,
|
||||
RuntimeModelConfig,
|
||||
_LightRagRuntime,
|
||||
_build_ali_rerank_request,
|
||||
_build_azure_deployment_base,
|
||||
_build_headers,
|
||||
_ensure_path,
|
||||
_extract_chat_text,
|
||||
_extract_embedding_vectors,
|
||||
_extract_error_message,
|
||||
_extract_rerank_results,
|
||||
_normalize_endpoint,
|
||||
_parse_json_body,
|
||||
_send_json_request,
|
||||
)
|
||||
from app.services.settings import SettingsService
|
||||
|
||||
@@ -76,11 +67,9 @@ STRUCTURED_APPENDIX_LEADING_MARKERS = (
|
||||
"# 结构化表格补充",
|
||||
)
|
||||
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
|
||||
|
||||
|
||||
_runtime_lock = threading.RLock()
|
||||
_runtime_instance: _LightRagRuntime | None = None
|
||||
_runtime_signature: tuple[Any, ...] | None = None
|
||||
_runtime_instances: dict[int, _LightRagRuntime] = {}
|
||||
_runtime_signatures: dict[int, tuple[Any, ...]] = {}
|
||||
|
||||
|
||||
class KnowledgeRagService:
|
||||
@@ -147,7 +136,11 @@ class KnowledgeRagService:
|
||||
"query": normalized_query,
|
||||
"record_count": len(hits),
|
||||
"hits": hits,
|
||||
"references": [str(item.get("code") or "").strip() for item in hits if str(item.get("code") or "").strip()],
|
||||
"references": [
|
||||
str(item.get("code") or "").strip()
|
||||
for item in hits
|
||||
if str(item.get("code") or "").strip()
|
||||
],
|
||||
"raw_references": references,
|
||||
"metadata": raw.get("metadata") if isinstance(raw, dict) else {},
|
||||
"message": f"已从知识库中检索到 {len(hits)} 条相关内容。",
|
||||
@@ -172,6 +165,7 @@ class KnowledgeRagService:
|
||||
)
|
||||
texts: list[str] = []
|
||||
file_paths: list[str] = []
|
||||
document_summaries: list[dict[str, Any]] = []
|
||||
|
||||
runtime = self._get_runtime()
|
||||
existing_statuses = runtime.get_document_statuses(normalized_ids)
|
||||
@@ -182,12 +176,29 @@ class KnowledgeRagService:
|
||||
try:
|
||||
runtime.delete_document(document_id)
|
||||
except Exception as exc:
|
||||
logger.warning("Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc)
|
||||
logger.warning(
|
||||
"Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc
|
||||
)
|
||||
text = knowledge_service.extract_document_text(document_id)
|
||||
raw_text = text
|
||||
if normalization_service is not None:
|
||||
text = normalization_service.build_enriched_text(text)
|
||||
texts.append(text)
|
||||
file_paths.append(str((knowledge_service.library_root / entry["folder"] / entry["stored_name"]).resolve()))
|
||||
file_paths.append(
|
||||
str(
|
||||
(
|
||||
knowledge_service.library_root / entry["folder"] / entry["stored_name"]
|
||||
).resolve()
|
||||
)
|
||||
)
|
||||
document_summaries.append(
|
||||
build_ingest_document_summary(
|
||||
document_id=document_id,
|
||||
entry=entry,
|
||||
raw_text=raw_text,
|
||||
indexed_text=text,
|
||||
)
|
||||
)
|
||||
|
||||
track_id = runtime.insert_documents(
|
||||
texts=texts,
|
||||
@@ -198,10 +209,32 @@ class KnowledgeRagService:
|
||||
statuses = runtime.get_document_statuses(normalized_ids)
|
||||
succeeded_document_ids: list[str] = []
|
||||
failed_documents: list[dict[str, str]] = []
|
||||
summary_by_id = {
|
||||
str(item.get("document_id") or "").strip(): item
|
||||
for item in document_summaries
|
||||
if str(item.get("document_id") or "").strip()
|
||||
}
|
||||
|
||||
for document_id in normalized_ids:
|
||||
status_obj = statuses.get(document_id)
|
||||
status_text = self._status_value(status_obj)
|
||||
status_payload = self._serialize_status(status_obj)
|
||||
workspace = (
|
||||
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
|
||||
or DEFAULT_LIGHTRAG_WORKSPACE
|
||||
)
|
||||
graph_summary = build_document_graph_summary(
|
||||
self.storage_root,
|
||||
workspace=workspace,
|
||||
document_id=document_id,
|
||||
)
|
||||
if document_id in summary_by_id:
|
||||
summary_by_id[document_id].update(
|
||||
build_ingest_status_summary(
|
||||
status_payload=status_payload,
|
||||
graph_summary=graph_summary,
|
||||
)
|
||||
)
|
||||
if self.is_query_ready_status(status_obj):
|
||||
succeeded_document_ids.append(document_id)
|
||||
continue
|
||||
@@ -218,13 +251,18 @@ class KnowledgeRagService:
|
||||
"requested_document_ids": normalized_ids,
|
||||
"succeeded_document_ids": succeeded_document_ids,
|
||||
"failed_documents": failed_documents,
|
||||
"document_summaries": [
|
||||
summary_by_id.get(document_id, {}) for document_id in normalized_ids
|
||||
],
|
||||
"status_snapshot": {
|
||||
document_id: self._serialize_status(status_obj)
|
||||
for document_id, status_obj in statuses.items()
|
||||
},
|
||||
}
|
||||
|
||||
def get_document_status_map(self, document_ids: list[str] | None = None) -> dict[str, dict[str, Any]]:
|
||||
def get_document_status_map(
|
||||
self, document_ids: list[str] | None = None
|
||||
) -> dict[str, dict[str, Any]]:
|
||||
target_ids = [str(item).strip() for item in document_ids or [] if str(item).strip()]
|
||||
if not target_ids:
|
||||
return {}
|
||||
@@ -248,28 +286,32 @@ class KnowledgeRagService:
|
||||
logger.warning("Delete LightRAG document ignored doc_id=%s: %s", normalized_id, exc)
|
||||
|
||||
def _get_runtime(self) -> _LightRagRuntime:
|
||||
global _runtime_instance, _runtime_signature
|
||||
|
||||
signature, runtime_kwargs = self._build_runtime_signature()
|
||||
thread_id = threading.get_ident()
|
||||
with _runtime_lock:
|
||||
if _runtime_instance is not None and _runtime_signature == signature:
|
||||
return _runtime_instance
|
||||
runtime = _runtime_instances.get(thread_id)
|
||||
if runtime is not None and _runtime_signatures.get(thread_id) == signature:
|
||||
return runtime
|
||||
|
||||
if _runtime_instance is not None:
|
||||
if runtime is not None:
|
||||
try:
|
||||
_runtime_instance.finalize()
|
||||
runtime.finalize()
|
||||
except Exception as exc: # pragma: no cover - best effort cleanup
|
||||
logger.warning("Finalize previous LightRAG runtime failed: %s", exc)
|
||||
|
||||
_runtime_instance = _LightRagRuntime(**runtime_kwargs)
|
||||
_runtime_signature = signature
|
||||
return _runtime_instance
|
||||
runtime = _LightRagRuntime(**runtime_kwargs)
|
||||
_runtime_instances[thread_id] = runtime
|
||||
_runtime_signatures[thread_id] = signature
|
||||
return runtime
|
||||
|
||||
def _build_runtime_signature(self) -> tuple[tuple[Any, ...], dict[str, Any]]:
|
||||
configs = self._load_runtime_configs()
|
||||
settings = get_settings()
|
||||
working_dir = (self.storage_root / "knowledge" / ".lightrag").resolve()
|
||||
workspace = os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip() or DEFAULT_LIGHTRAG_WORKSPACE
|
||||
workspace = (
|
||||
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
|
||||
or DEFAULT_LIGHTRAG_WORKSPACE
|
||||
)
|
||||
qdrant_url = os.environ.get("QDRANT_URL", "").strip() or _resolve_default_qdrant_url()
|
||||
qdrant_api_key = os.environ.get("QDRANT_API_KEY", "").strip()
|
||||
|
||||
@@ -318,7 +360,9 @@ class KnowledgeRagService:
|
||||
try:
|
||||
settings_service = SettingsService(session)
|
||||
main = self._normalize_runtime_model(settings_service.get_runtime_model_config("main"))
|
||||
embedding = self._normalize_runtime_model(settings_service.get_runtime_model_config("embedding"))
|
||||
embedding = self._normalize_runtime_model(
|
||||
settings_service.get_runtime_model_config("embedding")
|
||||
)
|
||||
try:
|
||||
backup_raw = settings_service.get_runtime_model_config("backup")
|
||||
backup = self._normalize_runtime_model(backup_raw)
|
||||
@@ -405,7 +449,9 @@ class KnowledgeRagService:
|
||||
|
||||
document_id, document_name = _parse_document_identity(file_path)
|
||||
normalized_chunk_id = chunk_id or f"path-{rank}"
|
||||
normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
|
||||
normalized_content = _truncate_text(
|
||||
content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH
|
||||
)
|
||||
excerpt = _build_query_focused_excerpt(
|
||||
normalized_content,
|
||||
query_terms=query_terms,
|
||||
@@ -510,17 +556,14 @@ class KnowledgeRagService:
|
||||
|
||||
|
||||
def shutdown_knowledge_rag_runtime() -> None:
|
||||
global _runtime_instance, _runtime_signature
|
||||
|
||||
with _runtime_lock:
|
||||
if _runtime_instance is None:
|
||||
return
|
||||
try:
|
||||
_runtime_instance.finalize()
|
||||
except Exception as exc: # pragma: no cover - best effort cleanup
|
||||
logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
|
||||
_runtime_instance = None
|
||||
_runtime_signature = None
|
||||
for runtime in list(_runtime_instances.values()):
|
||||
try:
|
||||
runtime.finalize()
|
||||
except Exception as exc: # pragma: no cover - best effort cleanup
|
||||
logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
|
||||
_runtime_instances.clear()
|
||||
_runtime_signatures.clear()
|
||||
|
||||
|
||||
def _parse_document_identity(file_path: str) -> tuple[str, str]:
|
||||
@@ -551,9 +594,7 @@ def _build_query_focused_excerpt(
|
||||
|
||||
lowered = normalized.lower()
|
||||
match_positions = [
|
||||
lowered.find(term)
|
||||
for term in query_terms
|
||||
if term and lowered.find(term) >= 0
|
||||
lowered.find(term) for term in query_terms if term and lowered.find(term) >= 0
|
||||
]
|
||||
if not match_positions:
|
||||
return _build_excerpt(normalized, max_length=max_length)
|
||||
@@ -649,7 +690,9 @@ def _score_knowledge_hit(
|
||||
elif leading_appendix_marker == "# 重点章节摘录":
|
||||
score += 4 if matched_terms else -12
|
||||
elif leading_appendix_marker == "# 问答线索补充":
|
||||
score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
|
||||
score += (
|
||||
8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
|
||||
)
|
||||
elif leading_appendix_marker == "# 结构化表格补充":
|
||||
if prefers_tabular_evidence and matched_terms:
|
||||
score += 16
|
||||
@@ -666,7 +709,11 @@ def _score_knowledge_hit(
|
||||
score += 4
|
||||
if matched_terms and any(marker in content for marker in ("附表", "第", "条")):
|
||||
score += 4
|
||||
if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("第", "条", ":", "-", "•")):
|
||||
if (
|
||||
not prefers_tabular_evidence
|
||||
and matched_terms
|
||||
and any(marker in content for marker in ("第", "条", ":", "-", "•"))
|
||||
):
|
||||
score += 4
|
||||
if title and any(term in title for term in query_terms):
|
||||
score += 6
|
||||
|
||||
Reference in New Issue
Block a user