2026-05-22 10:42:31 +08:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
2026-05-17 08:38:41 +00:00
|
|
|
|
import os
|
|
|
|
|
|
import re
|
|
|
|
|
|
import socket
|
|
|
|
|
|
import threading
|
2026-05-27 17:31:27 +08:00
|
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
2026-05-17 08:38:41 +00:00
|
|
|
|
from pathlib import Path
|
2026-05-27 17:31:27 +08:00
|
|
|
|
from typing import Any, Callable
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
|
|
|
|
|
|
|
|
from app.core.config import get_settings
|
|
|
|
|
|
from app.core.logging import get_logger
|
|
|
|
|
|
from app.db.session import get_session_factory
|
2026-05-22 23:47:28 +08:00
|
|
|
|
from app.services.knowledge_ingest_log import (
|
|
|
|
|
|
build_document_graph_summary,
|
|
|
|
|
|
build_ingest_document_summary,
|
|
|
|
|
|
build_ingest_status_summary,
|
|
|
|
|
|
)
|
2026-05-23 19:54:42 +08:00
|
|
|
|
from app.services.knowledge_rag_local import query_local_text_chunks
|
2026-05-22 10:42:31 +08:00
|
|
|
|
from app.services.knowledge_rag_runtime import (
|
|
|
|
|
|
KnowledgeRagError,
|
|
|
|
|
|
RuntimeModelConfig,
|
|
|
|
|
|
_LightRagRuntime,
|
|
|
|
|
|
)
|
|
|
|
|
|
from app.services.settings import SettingsService
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger shared by every function and method in this file.
logger = get_logger("app.services.knowledge_rag")
|
|
|
|
|
|
|
2026-05-17 08:38:41 +00:00
|
|
|
|
# Qdrant endpoints: the loopback address and the URL for a host named "qdrant".
DEFAULT_QDRANT_URL = "http://127.0.0.1:6333"
CONTAINER_QDRANT_URL = "http://qdrant:6333"

# Workspace name used when the LIGHTRAG_WORKSPACE environment variable is unset or blank.
DEFAULT_LIGHTRAG_WORKSPACE = "x_financial_knowledge"

# Upper bounds (in characters) for the content and excerpt stored on each hit.
MAX_KNOWLEDGE_HIT_CONTENT_LENGTH = 2200
MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH = 220

# NOTE(review): presumably caps how many terms are extracted from a query;
# the consumer is outside this span - confirm.
MAX_QUERY_TERMS = 12

# Filler words that carry no retrieval signal on their own.
QUERY_TERM_STOPWORDS = {
    "什么",
    "多少",
    "哪些",
    "怎么",
    "如何",
    "请问",
    "一下",
    "关于",
    "规定",
    "标准",
    "可以",
    "是否",
    "一个",
    "哪些人",
}

# Substrings that mark a query as asking for tabular / standard-style evidence.
TABLE_OR_STANDARD_QUERY_HINTS = (
    "表",
    "表格",
    "清单",
    "明细",
    "目录",
    "科目",
    "标准",
    "金额",
    "限额",
    "补贴",
    "住宿",
    "餐费",
    "交通",
    "报销",
    "档位",
    "额度",
)

# NOTE(review): looks like phrases treated as strong anchors when scoring
# hits; the scoring code is outside this span - confirm.
QUERY_ANCHOR_TERMS = (
    "财务基础知识手册",
    "基础知识手册",
    "会计科目",
    "常用会计科目",
    "财务报表",
    "主要税种",
    "税种",
    "标准",
    "清单",
    "明细",
    "流程",
)

# NOTE(review): presumably title fragments too generic to count as a match - confirm.
GENERIC_TITLE_TERMS = {"远光软件", "股份有限", "有限公司"}

# Headings of the structured appendix sections, and the size of the leading
# window (in characters) associated with them.
# NOTE(review): how the window is applied is not visible in this span - confirm.
STRUCTURED_APPENDIX_LEADING_MARKERS = (
    "# 章节导航",
    "# 重点章节摘录",
    "# 问答线索补充",
    "# 结构化表格补充",
)
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
|
2026-05-22 10:42:31 +08:00
|
|
|
|
_runtime_lock = threading.RLock()
|
2026-05-27 17:31:27 +08:00
|
|
|
|
_runtime_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="knowledge-rag-runtime")
|
|
|
|
|
|
_runtime_instances: dict[str, _LightRagRuntime] = {}
|
|
|
|
|
|
_runtime_signatures: dict[str, tuple[Any, ...]] = {}
|
|
|
|
|
|
_RUNTIME_CACHE_KEY = "lightrag"
|
2026-05-17 08:38:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
2026-05-22 10:42:31 +08:00
|
|
|
|
class KnowledgeRagService:
    """Knowledge-base retrieval and indexing on top of LightRAG.

    Queries are first answered from the locally stored text chunks; the shared
    LightRAG runtime is only consulted when the local result is not confident.
    All runtime access goes through ``_run_runtime_operation``.
    """

    def __init__(self, db: Session | None = None, storage_root: Path | None = None) -> None:
        """Store the optional DB session and resolve the storage root.

        Args:
            db: Session used for settings lookups and query rewriting; when
                omitted, settings are loaded through a short-lived session.
            storage_root: Storage root override; defaults to the configured
                ``resolved_storage_root_dir``.
        """
        self.db = db
        self.storage_root = Path(storage_root or get_settings().resolved_storage_root_dir)

    @staticmethod
    def _resolve_workspace() -> str:
        """Return the LightRAG workspace name from the environment, or the default when unset/blank."""
        return (
            os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
            or DEFAULT_LIGHTRAG_WORKSPACE
        )

    def query_knowledge(
        self,
        query: str,
        *,
        conversation_history: list[dict[str, str]] | None = None,
        limit: int = 5,
    ) -> dict[str, Any]:
        """Search the knowledge base and return a ``knowledge_search`` payload.

        Args:
            query: The user's question; blank input short-circuits with a prompt message.
            conversation_history: Prior turns; when given, the query is rewritten
                into a standalone question before searching.
            limit: Maximum number of hits to return (at least one slot is kept).

        Returns:
            A dict with ``result_type``, ``query``, ``record_count``, ``hits``,
            ``references`` and ``message``; non-blank queries also carry
            ``raw_references``, and successful ones ``metadata``.
        """
        normalized_query = str(query or "").strip()
        if not normalized_query:
            return {
                "result_type": "knowledge_search",
                "query": "",
                "record_count": 0,
                "hits": [],
                "references": [],
                "message": "请先输入要检索的知识库问题。",
            }

        rewritten_query = normalized_query
        if conversation_history:
            rewritten_query = self._rewrite_query(normalized_query, conversation_history)

        local_result = query_local_text_chunks(
            lightrag_root=(self.storage_root / "knowledge" / ".lightrag").resolve(),
            workspace=self._resolve_workspace(),
            query=rewritten_query,
            limit=limit,
        )

        runtime_hits: list[dict[str, Any]] = []
        runtime_references: list[str] = []
        if not local_result.confident:
            # Best effort: a runtime failure must not hide the local hits.
            try:
                raw = self._run_runtime_operation(
                    lambda runtime: runtime.query_data(
                        rewritten_query,
                        conversation_history=conversation_history,
                    )
                )
                data = raw.get("data") if isinstance(raw, dict) else {}
                chunks = list(data.get("chunks") or []) if isinstance(data, dict) else []
                entities = list(data.get("entities") or []) if isinstance(data, dict) else []
                runtime_references = list(data.get("references") or []) if isinstance(data, dict) else []
                runtime_hits = self._build_hits_from_query_data(
                    query=rewritten_query,
                    chunks=chunks,
                    entities=entities,
                    limit=limit,
                )
            except Exception as exc:
                logger.warning("Knowledge query failed: %s", exc)

        merged_hits = self._merge_hits(local_result.hits, runtime_hits, limit=limit)

        if not merged_hits:
            return {
                "result_type": "knowledge_search",
                "query": rewritten_query,
                "record_count": 0,
                "hits": [],
                "references": [],
                "raw_references": runtime_references,
                "message": "当前知识库中没有检索到与本次问题直接匹配的内容。",
            }

        return {
            "result_type": "knowledge_search",
            "query": rewritten_query,
            "record_count": len(merged_hits),
            "hits": merged_hits,
            "references": [
                str(item.get("code") or "").strip()
                for item in merged_hits
                if str(item.get("code") or "").strip()
            ],
            "raw_references": runtime_references,
            "metadata": {
                "retrieval_strategy": "fusion" if runtime_hits else "local_text_chunks",
                "local_total_chunks": local_result.total_chunks,
                "local_best_score": local_result.best_score,
            },
            "message": f"已从知识库中联合检索到 {len(merged_hits)} 条相关内容。",
        }

    @staticmethod
    def _merge_hits(
        local_hits: list[dict[str, Any]],
        runtime_hits: list[dict[str, Any]],
        *,
        limit: int,
    ) -> list[dict[str, Any]]:
        """Fuse local and runtime hits by ``code`` and return the top ``max(1, limit)``.

        Scores are coerced to ``int`` in place. A runtime hit that duplicates a
        local one raises the local score to at least ``runtime score + 20`` and
        donates its tags when the local hit has none.
        """
        merged: dict[str, dict[str, Any]] = {}
        for hit in local_hits:
            hit["score"] = int(hit.get("score") or 0)
            merged[hit["code"]] = hit

        for hit in runtime_hits:
            code = hit["code"]
            runtime_score = int(hit.get("score") or 0)
            if code not in merged:
                hit["score"] = runtime_score
                merged[code] = hit
                continue
            existing = merged[code]
            # Found by both retrievers: reward the agreement.
            existing["score"] = max(existing["score"], runtime_score + 20)
            if not existing.get("tags") and hit.get("tags"):
                existing["tags"] = hit["tags"]

        ranked = sorted(
            merged.values(),
            key=lambda item: int(item.get("score") or 0),
            reverse=True,
        )
        return ranked[: max(1, limit)]

    def _rewrite_query(self, query: str, conversation_history: list[dict[str, str]]) -> str:
        """Ask the chat model to rewrite ``query`` as a standalone question.

        Falls back to the original ``query`` when no DB session is available,
        when the model call fails, or when the rewritten text (after stripping
        whitespace) is not between 3 and 79 characters long.
        """
        if not self.db:
            return query

        # Imported lazily, as in the original code.
        # NOTE(review): presumably to avoid a circular import - confirm.
        from app.services.runtime_chat import RuntimeChatService

        try:
            chat_service = RuntimeChatService(self.db)
            messages: list[dict[str, Any]] = [{"role": "system", "content": "你是一个查询重写助手。你的任务是根据用户的多轮对话历史,将用户的最后一次提问重写为一句独立、完整的查询语句,以便于在知识库中进行向量检索。只输出重写后的句子,不要任何解释。"}]
            # Only the six most recent turns are sent as context.
            for msg in conversation_history[-6:]:
                messages.append({"role": msg.get("role", "user"), "content": msg.get("content", "")})
            messages.append({"role": "user", "content": f"当前提问:{query}\n\n请重写当前提问。"})

            rewritten = chat_service.complete(
                messages,
                max_tokens=60,
                temperature=0.1,
                timeout_seconds=10,
            )
            # Surrounding whitespace would distort both the length check and
            # the downstream search, so drop it first.
            if isinstance(rewritten, str):
                rewritten = rewritten.strip()

            if rewritten and 2 < len(rewritten) < 80:
                logger.info("Query rewritten: '%s' -> '%s'", query, rewritten)
                return rewritten
        except Exception as exc:
            logger.warning("Query rewrite failed: %s", exc)

        return query

    def index_documents(
        self,
        *,
        document_ids: list[str],
        force: bool = False,
    ) -> dict[str, Any]:
        """Extract, optionally enrich, and insert documents into LightRAG.

        Args:
            document_ids: IDs of the knowledge documents to index.
            force: When true, documents that already have a status in the
                runtime are deleted before being re-inserted.

        Returns:
            A dict with the insert ``track_id``, the requested / succeeded IDs,
            the failed documents, per-document summaries and a status snapshot.

        Raises:
            ValueError: If no non-blank document ID was supplied.
        """
        normalized_ids = [str(item).strip() for item in document_ids if str(item).strip()]
        if not normalized_ids:
            raise ValueError("没有可供索引的知识文档。")

        from app.services.knowledge import KnowledgeService
        from app.services.knowledge_normalizer import KnowledgeNormalizationService

        knowledge_service = KnowledgeService(storage_root=self.storage_root, db=self.db)
        normalization_service = (
            KnowledgeNormalizationService(self.db) if self.db is not None else None
        )
        texts: list[str] = []
        file_paths: list[str] = []
        document_summaries: list[dict[str, Any]] = []

        existing_statuses = self._run_runtime_operation(
            lambda runtime: runtime.get_document_statuses(normalized_ids)
        )

        for document_id in normalized_ids:
            entry = knowledge_service.get_document_entry(document_id)
            if force and document_id in existing_statuses:
                try:
                    # target_id is bound as a default so the lambda keeps this
                    # iteration's document ID.
                    self._run_runtime_operation(
                        lambda runtime, target_id=document_id: runtime.delete_document(target_id)
                    )
                except Exception as exc:
                    logger.warning(
                        "Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc
                    )
            text = knowledge_service.extract_document_text(document_id)
            raw_text = text
            if normalization_service is not None:
                text = normalization_service.build_enriched_text(text)
            texts.append(text)
            file_paths.append(
                str(
                    (
                        knowledge_service.library_root / entry["folder"] / entry["stored_name"]
                    ).resolve()
                )
            )
            document_summaries.append(
                build_ingest_document_summary(
                    document_id=document_id,
                    entry=entry,
                    raw_text=raw_text,
                    indexed_text=text,
                )
            )

        track_id = self._run_runtime_operation(
            lambda runtime: runtime.insert_documents(
                texts=texts,
                document_ids=normalized_ids,
                file_paths=file_paths,
            )
        )

        statuses = self._run_runtime_operation(
            lambda runtime: runtime.get_document_statuses(normalized_ids)
        )
        succeeded_document_ids: list[str] = []
        failed_documents: list[dict[str, str]] = []
        summary_by_id = {
            str(item.get("document_id") or "").strip(): item
            for item in document_summaries
            if str(item.get("document_id") or "").strip()
        }

        # The workspace does not depend on the document, so resolve it once.
        workspace = self._resolve_workspace()
        for document_id in normalized_ids:
            status_obj = statuses.get(document_id)
            status_text = self._status_value(status_obj)
            status_payload = self._serialize_status(status_obj)
            graph_summary = build_document_graph_summary(
                self.storage_root,
                workspace=workspace,
                document_id=document_id,
            )
            if document_id in summary_by_id:
                summary_by_id[document_id].update(
                    build_ingest_status_summary(
                        status_payload=status_payload,
                        graph_summary=graph_summary,
                    )
                )
            if self.is_query_ready_status(status_obj):
                succeeded_document_ids.append(document_id)
                continue
            failed_documents.append(
                {
                    "document_id": document_id,
                    "status": status_text or "unknown",
                    "error": self._status_error(status_obj),
                }
            )

        return {
            "track_id": track_id,
            "requested_document_ids": normalized_ids,
            "succeeded_document_ids": succeeded_document_ids,
            "failed_documents": failed_documents,
            "document_summaries": [
                summary_by_id.get(document_id, {}) for document_id in normalized_ids
            ],
            "status_snapshot": {
                document_id: self._serialize_status(status_obj)
                for document_id, status_obj in statuses.items()
            },
        }

    def get_document_status_map(
        self, document_ids: list[str] | None = None
    ) -> dict[str, dict[str, Any]]:
        """Return the serialized runtime status per document ID.

        Returns an empty dict when no IDs are given or when the runtime lookup
        fails (the failure is logged, not raised).
        """
        target_ids = [str(item).strip() for item in document_ids or [] if str(item).strip()]
        if not target_ids:
            return {}
        try:
            statuses = self._run_runtime_operation(
                lambda runtime: runtime.get_document_statuses(target_ids)
            )
        except Exception as exc:
            logger.warning("Load LightRAG document statuses failed: %s", exc)
            return {}
        return {
            document_id: self._serialize_status(status_obj)
            for document_id, status_obj in statuses.items()
        }

    def delete_document(self, document_id: str) -> None:
        """Delete one document from the runtime; blank IDs are ignored and failures only logged."""
        normalized_id = str(document_id or "").strip()
        if not normalized_id:
            return
        try:
            self._run_runtime_operation(
                lambda runtime: runtime.delete_document(normalized_id)
            )
        except Exception as exc:
            logger.warning("Delete LightRAG document ignored doc_id=%s: %s", normalized_id, exc)

    def _run_runtime_operation(self, operation: Callable[[_LightRagRuntime], Any]) -> Any:
        """Run ``operation`` against the shared runtime on the runtime executor.

        The configuration signature is built on the calling thread, then the
        operation is submitted to the module's executor and this call blocks
        until it finishes, re-raising whatever the operation raised.

        NOTE(review): the executor has a single worker, so calling this method
        from inside an ``operation`` would wait on itself - keep operations flat.
        """
        signature, runtime_kwargs = self._build_runtime_signature()
        return _runtime_executor.submit(
            self._execute_runtime_operation,
            signature,
            runtime_kwargs,
            operation,
        ).result()

    def _execute_runtime_operation(
        self,
        signature: tuple[Any, ...],
        runtime_kwargs: dict[str, Any],
        operation: Callable[[_LightRagRuntime], Any],
    ) -> Any:
        """Resolve the runtime matching ``signature`` and apply ``operation`` to it."""
        return operation(self._get_runtime(signature=signature, runtime_kwargs=runtime_kwargs))

    def _get_runtime(
        self,
        *,
        signature: tuple[Any, ...] | None = None,
        runtime_kwargs: dict[str, Any] | None = None,
    ) -> _LightRagRuntime:
        """Return the cached runtime, rebuilding it when the configuration changed.

        Args:
            signature: Configuration signature to match against the cache.
            runtime_kwargs: Constructor arguments for a new runtime.
                When either argument is missing, both are recomputed.
        """
        if signature is None or runtime_kwargs is None:
            signature, runtime_kwargs = self._build_runtime_signature()

        with _runtime_lock:
            runtime = _runtime_instances.get(_RUNTIME_CACHE_KEY)
            if runtime is not None and _runtime_signatures.get(_RUNTIME_CACHE_KEY) == signature:
                return runtime

            if runtime is not None:
                # Evict before finalizing: if building the replacement below
                # raises, the cache must not keep handing out a finalized
                # runtime under its old signature.
                _runtime_instances.pop(_RUNTIME_CACHE_KEY, None)
                _runtime_signatures.pop(_RUNTIME_CACHE_KEY, None)
                try:
                    runtime.finalize()
                except Exception as exc:  # pragma: no cover - best effort cleanup
                    logger.warning("Finalize previous LightRAG runtime failed: %s", exc)

            runtime = _LightRagRuntime(**runtime_kwargs)
            _runtime_instances[_RUNTIME_CACHE_KEY] = runtime
            _runtime_signatures[_RUNTIME_CACHE_KEY] = signature
            return runtime

    def _build_runtime_signature(self) -> tuple[tuple[Any, ...], dict[str, Any]]:
        """Collect the runtime configuration.

        Returns:
            ``(signature, runtime_kwargs)``: a tuple of every value that
            influences the runtime (compared against the cached signature) and
            the keyword arguments used to construct a ``_LightRagRuntime``.
        """
        configs = self._load_runtime_configs()
        settings = get_settings()
        working_dir = (self.storage_root / "knowledge" / ".lightrag").resolve()
        workspace = self._resolve_workspace()
        qdrant_url = os.environ.get("QDRANT_URL", "").strip() or _resolve_default_qdrant_url()
        qdrant_api_key = os.environ.get("QDRANT_API_KEY", "").strip()

        def model_fields(config: RuntimeModelConfig | None) -> tuple[str, str, str, str]:
            # A missing optional slot contributes four empty strings.
            if not config:
                return ("", "", "", "")
            return (config.provider, config.model, config.endpoint, config.api_key)

        signature = (
            str(working_dir),
            workspace,
            qdrant_url,
            qdrant_api_key,
            *model_fields(configs["main"]),
            *model_fields(configs["backup"]),
            *model_fields(configs["embedding"]),
            *model_fields(configs["reranker"]),
            str(settings.resolved_storage_root_dir),
        )

        return signature, {
            "working_dir": working_dir,
            "workspace": workspace,
            "qdrant_url": qdrant_url,
            "qdrant_api_key": qdrant_api_key,
            "primary_chat": configs["main"],
            "backup_chat": configs["backup"],
            "embedding": configs["embedding"],
            "reranker": configs["reranker"],
        }

    def _load_runtime_configs(self) -> dict[str, RuntimeModelConfig | None]:
        """Load the ``main``, ``backup``, ``embedding`` and ``reranker`` model slots.

        ``main`` and ``embedding`` are mandatory; ``backup`` and ``reranker``
        become ``None`` when they cannot be loaded or are incompletely
        configured. A session opened here is closed before returning.

        Raises:
            KnowledgeRagError: If a mandatory slot lacks endpoint/model, or
                lacks an API key while its provider is not ``"Ollama"``.
        """
        session = self.db
        owned_session = session is None
        if owned_session:
            session = get_session_factory()()

        try:
            settings_service = SettingsService(session)
            main = self._normalize_runtime_model(settings_service.get_runtime_model_config("main"))
            embedding = self._normalize_runtime_model(
                settings_service.get_runtime_model_config("embedding")
            )
            backup = self._load_optional_runtime_model(settings_service, "backup")
            reranker = self._load_optional_runtime_model(settings_service, "reranker")

            if not main.endpoint or not main.model:
                raise KnowledgeRagError("主对话模型未配置,无法初始化 LightRAG。")
            if main.provider != "Ollama" and not main.api_key:
                raise KnowledgeRagError("主对话模型缺少 API Key,无法初始化 LightRAG。")
            if not embedding.endpoint or not embedding.model:
                raise KnowledgeRagError("Embedding 模型未配置,无法初始化 LightRAG。")
            if embedding.provider != "Ollama" and not embedding.api_key:
                raise KnowledgeRagError("Embedding 模型缺少 API Key,无法初始化 LightRAG。")
            return {
                "main": main,
                "backup": backup,
                "embedding": embedding,
                "reranker": reranker,
            }
        finally:
            if owned_session and session is not None:
                session.close()

    @staticmethod
    def _load_optional_runtime_model(settings_service: Any, slot: str) -> RuntimeModelConfig | None:
        """Load an optional model slot, returning ``None`` when it is unusable.

        A slot is unusable when loading it raises, when endpoint or model is
        empty, or when the API key is empty for a provider other than ``"Ollama"``.
        """
        try:
            config = KnowledgeRagService._normalize_runtime_model(
                settings_service.get_runtime_model_config(slot)
            )
        except Exception as exc:
            # Optional slot: absence is expected, so record it quietly.
            logger.debug("Optional runtime model slot %s unavailable: %s", slot, exc)
            return None
        if (
            not config.endpoint
            or not config.model
            or (config.provider != "Ollama" and not config.api_key)
        ):
            return None
        return config

    @staticmethod
    def _normalize_runtime_model(payload: dict[str, str]) -> RuntimeModelConfig:
        """Convert a settings payload into a ``RuntimeModelConfig`` with trimmed string fields.

        A missing payload is treated as empty so callers get their own
        "not configured" handling instead of an ``AttributeError``.
        """
        payload = payload or {}
        return RuntimeModelConfig(
            slot=str(payload.get("slot") or "").strip(),
            provider=str(payload.get("provider") or "").strip(),
            model=str(payload.get("model") or "").strip(),
            endpoint=str(payload.get("endpoint") or "").strip(),
            api_key=str(payload.get("apiKey") or "").strip(),
            capability=str(payload.get("capability") or "").strip(),
        )

    @staticmethod
    def _build_hits_from_query_data(
        *,
        query: str,
        chunks: list[dict[str, Any]],
        entities: list[dict[str, Any]],
        limit: int,
    ) -> list[dict[str, Any]]:
        """Turn runtime chunks and entities into ranked hit dictionaries.

        Args:
            query: The (possibly rewritten) query used for excerpts and scoring.
            chunks: Chunk dicts; entries without ``file_path`` or ``content`` are skipped.
            entities: Entity dicts; their names become tags for chunks that
                share the same ``file_path``.
            limit: Maximum number of hits (at least one slot is kept).

        Returns:
            Up to ``max(1, limit)`` hits ordered by ``_score_knowledge_hit``,
            ties broken in favour of the earlier runtime rank.
        """
        entity_tags_by_path: dict[str, list[str]] = {}

        for entity in entities:
            if not isinstance(entity, dict):
                continue
            file_path = str(entity.get("file_path") or "").strip()
            entity_name = str(entity.get("entity_name") or "").strip()
            if not file_path or not entity_name:
                continue
            entity_tags_by_path.setdefault(file_path, [])
            if entity_name not in entity_tags_by_path[file_path]:
                entity_tags_by_path[file_path].append(entity_name)

        query_terms = _extract_query_terms(query)
        prefers_tabular_evidence = any(hint in query for hint in TABLE_OR_STANDARD_QUERY_HINTS)
        candidates: list[dict[str, Any]] = []
        for rank, chunk in enumerate(chunks, start=1):
            if not isinstance(chunk, dict):
                continue
            file_path = str(chunk.get("file_path") or "").strip()
            chunk_id = str(chunk.get("chunk_id") or "").strip()
            content = str(chunk.get("content") or "").strip()
            if not file_path or not content:
                continue

            document_id, document_name = _parse_document_identity(file_path)
            # Chunks without an ID get a positional placeholder.
            normalized_chunk_id = chunk_id or f"path-{rank}"
            normalized_content = _truncate_text(
                content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH
            )
            excerpt = _build_query_focused_excerpt(
                normalized_content,
                query_terms=query_terms,
                max_length=MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH,
            )
            candidates.append(
                {
                    "code": f"knowledge.{document_id or 'unknown'}.{normalized_chunk_id}",
                    "candidate_id": normalized_chunk_id,
                    "title": document_name or "知识库文档",
                    "content": normalized_content,
                    "excerpt": excerpt,
                    "document_id": document_id,
                    "document_name": document_name or Path(file_path).name,
                    "version": None,
                    "updated_at": None,
                    "score": max(1, 100 - rank),
                    "tags": entity_tags_by_path.get(file_path, [])[:5],
                    "evidence": [normalized_chunk_id],
                    "file_path": file_path,
                    "_rank": rank,
                }
            )

        ranked = sorted(
            candidates,
            key=lambda item: (
                _score_knowledge_hit(
                    item,
                    query_terms=query_terms,
                    prefers_tabular_evidence=prefers_tabular_evidence,
                ),
                # Negated so that, under reverse=True, lower ranks win ties.
                -int(item.get("_rank") or 0),
            ),
            reverse=True,
        )

        hits: list[dict[str, Any]] = []
        for item in ranked[: max(1, limit)]:
            normalized = dict(item)
            # "_rank" is an internal sort aid and is not part of the payload.
            normalized.pop("_rank", None)
            hits.append(normalized)
        return hits

    @staticmethod
    def _serialize_status(status_obj: Any) -> dict[str, Any]:
        """Return a dict view of a status object with normalized ``status``, ``error_msg`` and ``query_ready``.

        ``None`` yields an empty dict. Objects contribute their ``__dict__``,
        plain dicts are copied, anything else starts from an empty payload.
        """
        if status_obj is None:
            return {}
        if hasattr(status_obj, "__dict__"):
            payload = dict(status_obj.__dict__)
        elif isinstance(status_obj, dict):
            payload = dict(status_obj)
        else:
            payload = {}
        payload["status"] = KnowledgeRagService._status_value(status_obj)
        payload["error_msg"] = KnowledgeRagService._status_error(status_obj)
        payload["query_ready"] = KnowledgeRagService.is_query_ready_status(status_obj)
        return payload

    @staticmethod
    def _status_value(status_obj: Any) -> str:
        """Return the lowercase status name read from an attribute or dict key ``status``.

        Dotted forms keep only the part after the last dot, and a trailing
        ``name: value>`` form is reduced to ``name``.
        """
        raw_status = getattr(status_obj, "status", None)
        if raw_status is None and isinstance(status_obj, dict):
            raw_status = status_obj.get("status")
        normalized = str(raw_status or "").strip().lower()
        if "." in normalized:
            normalized = normalized.split(".")[-1].strip()
        if ":" in normalized and normalized.endswith(">"):
            normalized = normalized.split(":")[0].strip("<> '\"")
        return normalized

    @staticmethod
    def _status_error(status_obj: Any) -> str:
        """Return the trimmed ``error_msg`` of a status object or dict, or ``""``."""
        value = getattr(status_obj, "error_msg", None)
        if value is None and isinstance(status_obj, dict):
            value = status_obj.get("error_msg")
        return str(value or "").strip()

    @staticmethod
    def is_query_ready_status(status_obj: Any) -> bool:
        """Tell whether a document status means its content can be queried.

        ``processed`` is ready; ``failed``/``error``/``aborted`` and
        ``pending``/``processing``/``preprocessed`` are not. Any other status
        is ready only if the object reports a positive ``chunks_count`` or a
        non-empty ``chunks_list``.
        """
        status_text = KnowledgeRagService._status_value(status_obj)
        if status_text in {"failed", "error", "aborted"}:
            return False
        if status_text == "processed":
            return True
        if status_text in {"pending", "processing", "preprocessed"}:
            return False

        chunks_count = getattr(status_obj, "chunks_count", None)
        if chunks_count is None and isinstance(status_obj, dict):
            chunks_count = status_obj.get("chunks_count")
        try:
            if int(chunks_count or 0) > 0:
                return True
        except (TypeError, ValueError):
            # A non-numeric count is ignored; fall through to chunks_list.
            pass

        chunks_list = getattr(status_obj, "chunks_list", None)
        if chunks_list is None and isinstance(status_obj, dict):
            chunks_list = status_obj.get("chunks_list")
        return bool(chunks_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-22 10:42:31 +08:00
|
|
|
|
def shutdown_knowledge_rag_runtime() -> None:
    """Finalize every cached runtime, running the cleanup on the runtime executor.

    Blocks until the cleanup has finished.
    """
    _runtime_executor.submit(_shutdown_runtime_instances).result()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _shutdown_runtime_instances() -> None:
    """Finalize all cached runtimes and empty both cache dictionaries.

    A runtime whose ``finalize`` raises is logged and skipped so the remaining
    ones are still cleaned up.
    """
    with _runtime_lock:
        # Iterate over a snapshot of the cached runtimes.
        for runtime in list(_runtime_instances.values()):
            try:
                runtime.finalize()
            except Exception as exc:  # pragma: no cover - best effort cleanup
                logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
        _runtime_instances.clear()
        _runtime_signatures.clear()
|
2026-05-22 10:42:31 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_document_identity(file_path: str) -> tuple[str, str]:
|
|
|
|
|
|
path = Path(str(file_path or "").strip())
|
|
|
|
|
|
name = path.name
|
|
|
|
|
|
if "__" not in name:
|
|
|
|
|
|
return "", name
|
|
|
|
|
|
document_id, document_name = name.split("__", maxsplit=1)
|
|
|
|
|
|
return document_id.strip(), document_name.strip()
|
2026-05-17 08:38:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_excerpt(text: str, *, max_length: int = 180) -> str:
|
|
|
|
|
|
normalized = " ".join(str(text or "").split()).strip()
|
|
|
|
|
|
if len(normalized) <= max_length:
|
|
|
|
|
|
return normalized
|
|
|
|
|
|
return f"{normalized[: max_length - 3].rstrip()}..."
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-18 02:49:39 +00:00
|
|
|
|
def _build_query_focused_excerpt(
|
|
|
|
|
|
text: str,
|
|
|
|
|
|
*,
|
|
|
|
|
|
query_terms: list[str],
|
|
|
|
|
|
max_length: int = 180,
|
|
|
|
|
|
) -> str:
|
|
|
|
|
|
normalized = " ".join(str(text or "").split()).strip()
|
|
|
|
|
|
if not normalized:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
lowered = normalized.lower()
|
|
|
|
|
|
match_positions = [
|
2026-05-22 23:47:28 +08:00
|
|
|
|
lowered.find(term) for term in query_terms if term and lowered.find(term) >= 0
|
2026-05-18 02:49:39 +00:00
|
|
|
|
]
|
|
|
|
|
|
if not match_positions:
|
|
|
|
|
|
return _build_excerpt(normalized, max_length=max_length)
|
|
|
|
|
|
|
|
|
|
|
|
start = max(0, min(match_positions) - max_length // 3)
|
|
|
|
|
|
end = min(len(normalized), start + max_length)
|
|
|
|
|
|
snippet = normalized[start:end].strip()
|
|
|
|
|
|
if start > 0:
|
|
|
|
|
|
snippet = f"...{snippet.lstrip()}"
|
|
|
|
|
|
if end < len(normalized):
|
|
|
|
|
|
snippet = f"{snippet.rstrip()}..."
|
|
|
|
|
|
return snippet
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-17 08:38:41 +00:00
|
|
|
|
def _truncate_text(text: str, *, max_length: int) -> str:
|
|
|
|
|
|
normalized = str(text or "").strip()
|
|
|
|
|
|
if len(normalized) <= max_length:
|
|
|
|
|
|
return normalized
|
|
|
|
|
|
return f"{normalized[: max_length - 3].rstrip()}..."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_default_qdrant_url() -> str:
    """Pick the default Qdrant URL based on whether the ``qdrant`` host resolves.

    Returns ``CONTAINER_QDRANT_URL`` when the hostname ``"qdrant"`` resolves,
    otherwise ``DEFAULT_QDRANT_URL``.
    """
    return CONTAINER_QDRANT_URL if _hostname_resolves("qdrant") else DEFAULT_QDRANT_URL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _hostname_resolves(hostname: str) -> bool:
|
|
|
|
|
|
try:
|
|
|
|
|
|
socket.getaddrinfo(hostname, None)
|
|
|
|
|
|
except OSError:
|
|
|
|
|
|
return False
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_query_terms(query: str) -> list[str]:
    """Derive a bounded, de-duplicated list of lowercase search terms from *query*.

    Terms are collected in this order:

    1. ASCII tokens (letters, digits, ``_`` and ``-``; at least two characters).
    2. For each run of 2-20 CJK characters:
       a. suffixes (6, 4, 3, 2 characters) of the text preceding one of the
          markers "标准", "金额", "限额", "额度";
       b. every entry of ``QUERY_ANCHOR_TERMS`` contained in the run;
       c. 8- to 4-character windows over the last 14 characters of the run
          that contain an anchor term;
       d. the whole run when it has at most 4 characters, otherwise every
          4-, 3- and 2-character window of the run.

    Terms that are empty, shorter than two characters, already collected, or
    listed in ``QUERY_TERM_STOPWORDS`` are skipped.

    Returns:
        At most ``MAX_QUERY_TERMS`` terms, in collection order.
    """
    normalized_query = str(query or "").strip().lower()
    if not normalized_query:
        return []

    terms: list[str] = []
    seen: set[str] = set()

    def remember(term: str) -> None:
        """Record *term* unless it is empty, too short, a stopword or a duplicate."""
        normalized_term = str(term or "").strip().lower()
        if (
            not normalized_term
            or normalized_term in seen
            or normalized_term in QUERY_TERM_STOPWORDS
            or len(normalized_term) < 2
        ):
            return
        seen.add(normalized_term)
        terms.append(normalized_term)

    for item in re.findall(r"[a-z0-9][a-z0-9_\-]{1,}", normalized_query):
        remember(item)

    for block in re.findall(r"[\u4e00-\u9fff]{2,20}", normalized_query):
        for marker in ("标准", "金额", "限额", "额度"):
            marker_index = block.find(marker)
            # <= 0 also skips a marker at the very start: there is no subject
            # text in front of it.
            if marker_index <= 0:
                continue
            subject = block[:marker_index]
            for width in (6, 4, 3, 2):
                remember(subject[-width:])

        for anchor in QUERY_ANCHOR_TERMS:
            if anchor in block:
                remember(anchor)

        tail = block[-14:]
        for size in (8, 7, 6, 5, 4):
            for start in range(0, len(tail) - size + 1):
                piece = tail[start : start + size]
                if any(anchor in piece for anchor in QUERY_ANCHOR_TERMS):
                    remember(piece)
                if len(terms) >= MAX_QUERY_TERMS:
                    # The uncapped passes above may already have pushed the
                    # list past the limit, so slice on the early exit too.
                    return terms[:MAX_QUERY_TERMS]

        if len(block) <= 4:
            remember(block)
            continue

        for size in (4, 3, 2):
            for start in range(0, len(block) - size + 1):
                remember(block[start : start + size])
                if len(terms) >= MAX_QUERY_TERMS:
                    return terms[:MAX_QUERY_TERMS]

    return terms[:MAX_QUERY_TERMS]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _score_knowledge_hit(
    item: dict[str, Any],
    *,
    query_terms: list[str],
    prefers_tabular_evidence: bool,
) -> int:
    """Compute a heuristic relevance score for one knowledge hit.

    The score starts from the hit's ``_rank`` (``120 - rank * 4``, floored at
    1) and is then adjusted by:

    * +8 per query term found in the title, excerpt, first five tags or the
      first 1200 characters of content; +6 more per term found in the title;
      and ``(len - 3) * 12`` for title terms of four or more characters that
      are not in ``GENERIC_TITLE_TERMS``;
    * a bonus or penalty depending on which structured-appendix marker (if
      any) leads the content, whether any term matched, and whether tabular
      evidence is preferred;
    * small bonuses for structural cues in the content (table characters,
      colons, newlines, clause markers) when at least one term matched;
    * +6 when any query term appears in a non-empty title;
    * -12 when the content matches the "没有…信息/规定/说明/依据" pattern.

    Args:
        item: Hit record; reads ``_rank``, ``title``, ``document_name``,
            ``content``, ``excerpt`` and ``tags``.
        query_terms: Terms to look for; compared against lowercased text.
        prefers_tabular_evidence: Whether table-like evidence is favoured.

    Returns:
        The integer score; it can drop below the base when penalties apply.
    """
    position = max(1, int(item.get("_rank") or 1))
    title = str(item.get("title") or item.get("document_name") or "").lower()
    content = str(item.get("content") or "").lower()
    excerpt = str(item.get("excerpt") or "").lower()
    tag_text = " ".join(str(tag).lower() for tag in list(item.get("tags") or [])[:5])
    searchable = "\n".join((title, excerpt, tag_text, content[:1200]))

    score = max(1, 120 - position * 4)

    hits = [term for term in query_terms if term in searchable]
    # The title is part of the searchable text, so every query term found in
    # the title is also a hit.
    title_hits = [term for term in hits if term in title]
    has_hits = bool(hits)

    score += 8 * len(hits)
    score += 6 * len(title_hits)
    score += sum(
        12 * (len(term) - 3)
        for term in title_hits
        if len(term) >= 4 and term not in GENERIC_TITLE_TERMS
    )

    appendix = _leading_structured_appendix_marker(content)
    if appendix == "# 章节导航":
        score -= 24
    elif appendix == "# 重点章节摘录":
        score += 4 if has_hits else -12
    elif appendix == "# 问答线索补充":
        if not has_hits:
            score -= 20
        elif prefers_tabular_evidence:
            score += 2
        else:
            score += 8
    elif appendix == "# 结构化表格补充":
        if not has_hits:
            score -= 18
        elif prefers_tabular_evidence:
            score += 16
        else:
            score += 6

    if has_hits:
        if prefers_tabular_evidence and ("|" in content or "表" in content):
            score += 10
        # NOTE(review): both tuple entries look identical here; one was
        # presumably a full-width colon - confirm against the repository.
        if any(marker in content for marker in (":", ":")):
            score += 10
        if "\n" in content:
            score += 4
        if any(marker in content for marker in ("附表", "第", "条")):
            score += 4
        if not prefers_tabular_evidence and any(
            marker in content for marker in ("第", "条", ":", "-", "•")
        ):
            score += 4

    if title and title_hits:
        score += 6
    if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
        score -= 12

    return score
|
2026-05-18 02:49:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _leading_structured_appendix_marker(content: str) -> str:
    """Return the first configured appendix marker found near the start of *content*.

    Leading whitespace is ignored. Markers are tried in the order given by
    ``STRUCTURED_APPENDIX_LEADING_MARKERS``; one qualifies when its first
    occurrence begins at an offset no greater than
    ``STRUCTURED_APPENDIX_LEADING_WINDOW``.

    Returns:
        The qualifying marker, or an empty string when none qualifies.
    """
    head_trimmed = str(content or "").lstrip()
    return next(
        (
            candidate
            for candidate in STRUCTURED_APPENDIX_LEADING_MARKERS
            if 0 <= head_trimmed.find(candidate) <= STRUCTURED_APPENDIX_LEADING_WINDOW
        ),
        "",
    )
|