feat: 增强知识库索引与设置页面模块化拆分
扩展知识库索引任务和 RAG 检索支持增量入库和文档去重,优化本体检测和规则匹配精度,前端设置页面拆分为 LLM、邮件和 Hermes 员工同步子面板并重构样式,新增日志详情组件和知识入库日志模型,补充单元测试覆盖。
This commit is contained in:
224
server/src/app/services/knowledge_ingest_log.py
Normal file
224
server/src/app/services/knowledge_ingest_log.py
Normal file
@@ -0,0 +1,224 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Maximum length, in characters, of the excerpt and summary previews.
MAX_INGEST_LOG_TEXT_PREVIEW = 180

# Upper bound on the number of section headings collected per document.
MAX_INGEST_LOG_SECTIONS = 12

# Upper bounds on the per-document lists kept in a log entry.
MAX_INGEST_LOG_CHUNKS = 24
MAX_INGEST_LOG_ENTITIES = 24
MAX_INGEST_LOG_RELATIONS = 24

# Matches a whole line that is either a Markdown heading with one to four
# leading "#" characters followed by whitespace and text, or a Chinese
# numbered heading such as "第三章 ..." / "第12条".
INGEST_SECTION_HEADING_PATTERN = re.compile(
    r"^(?:#{1,4}\s+.+|第[一二三四五六七八九十百零0-9]+[章节条]\s*.*)$"
)
|
||||
|
||||
|
||||
def build_ingest_document_summary(
    *,
    document_id: str,
    entry: dict[str, Any],
    raw_text: str,
    indexed_text: str,
) -> dict[str, Any]:
    """Build the initial ingest-log summary for a single document.

    Descriptive fields are copied from ``entry`` as stripped strings (missing
    or falsy values become ``""``), the character counts of the raw and
    indexed text are recorded, and section headings are extracted from the
    indexed text. Chunk, entity and relation fields are returned as empty
    placeholders (zero counts and empty lists).

    Args:
        document_id: Identifier stored unchanged under ``"document_id"``.
        entry: Mapping read for ``original_name``, ``folder``, ``extension``
            and ``mime_type``.
        raw_text: Source text; only its length is recorded.
        indexed_text: Indexed text; its length is recorded and its section
            headings are extracted.

    Returns:
        A new dict holding the document summary.
    """

    def entry_text(key: str) -> str:
        # Falsy values (None, "", 0) collapse to an empty string.
        return str(entry.get(key) or "").strip()

    raw_length = len(str(raw_text or ""))
    indexed_body = str(indexed_text or "")
    found_sections = _extract_ingest_sections(indexed_body)

    identity: dict[str, Any] = {
        "document_id": document_id,
        "name": entry_text("original_name"),
        "folder": entry_text("folder"),
        "extension": entry_text("extension"),
        "mime_type": entry_text("mime_type"),
    }
    text_metrics: dict[str, Any] = {
        "text_chars": raw_length,
        "indexed_text_chars": len(indexed_body),
        "section_count": len(found_sections),
        "sections": found_sections,
    }
    # Graph and chunk details start out empty in this summary.
    placeholders: dict[str, Any] = {
        "chunk_count": 0,
        "chunk_ids": [],
        "chunks": [],
        "entity_count": 0,
        "relation_count": 0,
        "entities": [],
        "relations": [],
    }
    return {**identity, **text_metrics, **placeholders}
|
||||
|
||||
|
||||
def build_ingest_status_summary(
    *,
    status_payload: dict[str, Any],
    graph_summary: dict[str, Any],
) -> dict[str, Any]:
    """Merge a status payload and a graph summary into one log summary.

    Args:
        status_payload: Mapping read for ``status``, ``query_ready``,
            ``chunks_count`` and ``chunks_list``.
        graph_summary: Mapping whose items are copied into the result after
            the status fields.

    Returns:
        A new dict with ``lightrag_status``, ``query_ready``, ``chunk_count``
        and ``chunk_ids`` (capped at ``MAX_INGEST_LOG_CHUNKS`` entries),
        followed by every item of ``graph_summary``.
    """
    all_chunk_ids = _normalize_chunk_ids(status_payload)
    total_chunks = _resolve_chunk_count(status_payload, all_chunk_ids)
    status_label = str(status_payload.get("status") or "").strip()

    status_fields: dict[str, Any] = {
        "lightrag_status": status_label,
        "query_ready": bool(status_payload.get("query_ready")),
        "chunk_count": total_chunks,
        "chunk_ids": all_chunk_ids[:MAX_INGEST_LOG_CHUNKS],
    }
    # Graph fields are unpacked last, so they win if a key appears in both.
    return {**status_fields, **graph_summary}
|
||||
|
||||
|
||||
def build_document_graph_summary(
    storage_root: Path,
    *,
    workspace: str,
    document_id: str,
) -> dict[str, Any]:
    """Summarise one document's entities, relations and chunks from disk.

    Reads ``kv_store_full_entities.json``, ``kv_store_full_relations.json``
    and ``kv_store_text_chunks.json`` from the resolved directory
    ``<storage_root>/knowledge/.lightrag/<workspace>`` through
    ``_load_json_file`` and keeps only the data recorded for ``document_id``.

    Args:
        storage_root: Base directory containing the ``knowledge`` folder.
        workspace: Workspace directory name; surrounding whitespace is
            stripped.
        document_id: Identifier used to select the document's records.

    Returns:
        A dict with ``entity_count`` and ``relation_count`` (sizes of the
        full lists) plus ``entities``, ``relations`` and ``chunks``, each
        capped at its ``MAX_INGEST_LOG_*`` limit.
    """
    lightrag_root = Path(storage_root) / "knowledge" / ".lightrag"
    store_dir = (lightrag_root / str(workspace).strip()).resolve()

    def read_store(filename: str) -> dict[str, Any]:
        return _load_json_file(store_dir / filename)

    entity_store = read_store("kv_store_full_entities.json")
    relation_store = read_store("kv_store_full_relations.json")
    chunk_store = read_store("kv_store_text_chunks.json")

    entity_names = _normalize_document_entities(entity_store, document_id)
    relation_edges = _normalize_document_relations(relation_store, document_id)
    chunk_rows = _normalize_document_chunks(chunk_store, document_id)

    # Counts describe the complete lists; only the lists themselves are capped.
    summary: dict[str, Any] = {
        "entity_count": len(entity_names),
        "relation_count": len(relation_edges),
    }
    summary["entities"] = entity_names[:MAX_INGEST_LOG_ENTITIES]
    summary["relations"] = relation_edges[:MAX_INGEST_LOG_RELATIONS]
    summary["chunks"] = chunk_rows[:MAX_INGEST_LOG_CHUNKS]
    return summary
|
||||
|
||||
|
||||
def _extract_ingest_sections(text: str) -> list[dict[str, str]]:
    """Collect heading lines together with a short excerpt after each one.

    A line counts as a heading when, after stripping, it is non-empty, at
    most 90 characters long, and matches ``INGEST_SECTION_HEADING_PATTERN``.
    Scanning stops once ``MAX_INGEST_LOG_SECTIONS`` headings were collected.

    Args:
        text: Document text to scan; falsy values are treated as ``""``.

    Returns:
        A list of ``{"title", "excerpt"}`` dicts in document order. The title
        is the heading without leading ``#`` characters; the excerpt is built
        from the lines that follow the heading.
    """
    stripped_lines = [raw.strip() for raw in str(text or "").splitlines()]
    found: list[dict[str, str]] = []
    for position, candidate in enumerate(stripped_lines):
        if len(found) >= MAX_INGEST_LOG_SECTIONS:
            break
        # Blank lines and lines longer than 90 characters are never headings.
        if not candidate or len(candidate) > 90:
            continue
        if not INGEST_SECTION_HEADING_PATTERN.match(candidate):
            continue
        heading = candidate.lstrip("#").strip()
        following = stripped_lines[position + 1 :]
        found.append(
            {"title": heading, "excerpt": _find_following_excerpt(following)}
        )
    return found
|
||||
|
||||
|
||||
def _find_following_excerpt(lines: list[str]) -> str:
    """Build a preview from the lines that follow a heading.

    Empty lines are skipped. Collection stops at the next line matching
    ``INGEST_SECTION_HEADING_PATTERN`` or as soon as the space-joined text
    reaches ``MAX_INGEST_LOG_TEXT_PREVIEW`` characters.

    Args:
        lines: The lines after the heading, in order.

    Returns:
        The collected text, passed through ``_truncate_text`` with
        ``MAX_INGEST_LOG_TEXT_PREVIEW`` as the limit.
    """
    preview = ""
    for current in lines:
        if not current:
            continue
        if INGEST_SECTION_HEADING_PATTERN.match(current):
            break
        # Skipped lines are never empty, so this equals a space-join of them.
        preview = f"{preview} {current}" if preview else current
        if len(preview) >= MAX_INGEST_LOG_TEXT_PREVIEW:
            break
    return _truncate_text(preview, max_length=MAX_INGEST_LOG_TEXT_PREVIEW)
|
||||
|
||||
|
||||
def _normalize_chunk_ids(status_payload: dict[str, Any]) -> list[str]:
    """Return the cleaned chunk ids listed under ``"chunks_list"``.

    Args:
        status_payload: Mapping that may carry a ``chunks_list`` entry.

    Returns:
        The stripped string form of every truthy item whose text is not
        blank, in original order; ``[]`` when ``chunks_list`` is not a list.
    """
    raw_ids = status_payload.get("chunks_list")
    if not isinstance(raw_ids, list):
        return []

    cleaned_ids: list[str] = []
    for raw_id in raw_ids:
        # Falsy items (None, "", 0) collapse to "" and are dropped.
        text = str(raw_id or "").strip()
        if text:
            cleaned_ids.append(text)
    return cleaned_ids
|
||||
|
||||
|
||||
def _resolve_chunk_count(status_payload: dict[str, Any], chunk_ids: list[str]) -> int:
    """Return the chunk count reported by the payload, or a fallback.

    Args:
        status_payload: Mapping that may carry a ``chunks_count`` entry.
        chunk_ids: Chunk ids whose length is used as the fallback count.

    Returns:
        ``int(chunks_count)`` when that value is truthy and convertible;
        otherwise ``len(chunk_ids)``.
    """
    fallback = len(chunk_ids)
    try:
        return int(status_payload.get("chunks_count") or fallback)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is raised by int(float("inf")); json.loads accepts
        # "Infinity", so such a value can arrive from a decoded payload.
        return fallback
|
||||
|
||||
|
||||
def _load_json_file(path: Path) -> dict[str, Any]:
    """Load a JSON object from ``path``, returning ``{}`` on any failure.

    This is a best-effort reader: a missing or unreadable file, bytes that
    are not valid UTF-8, malformed JSON, and a top-level value that is not a
    JSON object all produce an empty dict instead of an exception.

    Args:
        path: Location of the JSON file, read as UTF-8 text.

    Returns:
        The decoded dict, or ``{}`` when it cannot be loaded.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers missing and unreadable files. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (invalid UTF-8 bytes).
        return {}
    return payload if isinstance(payload, dict) else {}
|
||||
|
||||
|
||||
def _normalize_document_entities(payload: dict[str, Any], document_id: str) -> list[str]:
    """Return the de-duplicated entity names stored for ``document_id``.

    Args:
        payload: Mapping from document id to a record that may hold an
            ``entity_names`` list.
        document_id: Key of the record to read.

    Returns:
        The names cleaned by ``_dedupe_text_items``; ``[]`` when the payload
        or the record is not a dict, or ``entity_names`` is not a list.
    """
    if not isinstance(payload, dict):
        return []
    record = payload.get(document_id)
    # A record that is not a mapping is treated as having no entities.
    names = record.get("entity_names") if isinstance(record, dict) else []
    return _dedupe_text_items(names) if isinstance(names, list) else []
|
||||
|
||||
|
||||
def _normalize_document_relations(
    payload: dict[str, Any], document_id: str
) -> list[dict[str, str]]:
    """Return the unique source/target pairs stored for ``document_id``.

    Args:
        payload: Mapping from document id to a record that may hold a
            ``relation_pairs`` list.
        document_id: Key of the record to read.

    Returns:
        One ``{"source", "target", "type"}`` dict per distinct pair, in
        first-seen order. Entries that are not a list or tuple of at least
        two items, or whose source or target is blank after stripping, are
        skipped. ``[]`` is returned when the payload or record is not a dict
        or ``relation_pairs`` is not a list.
    """
    record = payload.get(document_id) if isinstance(payload, dict) else None
    if not isinstance(record, dict):
        return []
    raw_pairs = record.get("relation_pairs")
    if not isinstance(raw_pairs, list):
        return []

    # Dict keys keep insertion order, so the first occurrence of a pair wins.
    unique_edges: dict[tuple[str, str], None] = {}
    for raw_pair in raw_pairs:
        if not (isinstance(raw_pair, (list, tuple)) and len(raw_pair) >= 2):
            continue
        edge = (str(raw_pair[0] or "").strip(), str(raw_pair[1] or "").strip())
        if all(edge):
            unique_edges.setdefault(edge, None)
    return [
        {"source": source, "target": target, "type": "关联"}
        for source, target in unique_edges
    ]
|
||||
|
||||
|
||||
def _normalize_document_chunks(
    payload: dict[str, Any], document_id: str
) -> list[dict[str, Any]]:
    """Return summary rows for the chunks that belong to ``document_id``.

    Args:
        payload: Mapping from chunk id to a chunk record. Records that are
            not dicts, or whose stripped ``full_doc_id`` differs from
            ``document_id``, are ignored.
        document_id: Document whose chunks are selected.

    Returns:
        Dicts with ``id`` (the record's ``_id`` or, failing that, its key),
        ``order`` (from ``chunk_order_index``), ``tokens`` and ``summary``,
        sorted by ``(order, id)``.
    """
    matching: list[dict[str, Any]] = []
    for key, record in payload.items():
        belongs = (
            isinstance(record, dict)
            and str(record.get("full_doc_id") or "").strip() == document_id
        )
        if not belongs:
            continue
        body = str(record.get("content") or "").strip()
        row: dict[str, Any] = {
            "id": str(record.get("_id") or key).strip(),
            "order": _to_int(record.get("chunk_order_index")),
            "tokens": _to_int(record.get("tokens")),
            "summary": _build_chunk_summary(body),
        }
        matching.append(row)
    # The id is the tie-breaker for chunks that share an order index.
    matching.sort(key=lambda chunk: (chunk["order"], chunk["id"]))
    return matching
|
||||
|
||||
|
||||
def _build_chunk_summary(content: str) -> str:
    """Pick one representative line of ``content`` as a short summary.

    The first non-blank line with at least 12 characters is preferred; when
    no line is that long the first non-blank line is used, and ``""`` when
    there are no non-blank lines at all.

    Args:
        content: Chunk text; falsy values are treated as ``""``.

    Returns:
        The chosen line passed through ``_truncate_text`` with
        ``MAX_INGEST_LOG_TEXT_PREVIEW`` as the limit.
    """
    candidates = [
        part.strip() for part in str(content or "").splitlines() if part.strip()
    ]
    chosen = candidates[0] if candidates else ""
    for candidate in candidates:
        if len(candidate) >= 12:
            chosen = candidate
            break
    return _truncate_text(chosen, max_length=MAX_INGEST_LOG_TEXT_PREVIEW)
|
||||
|
||||
|
||||
def _dedupe_text_items(items: list[Any]) -> list[str]:
    """Return the distinct non-blank texts of ``items`` in first-seen order.

    Args:
        items: Values to convert; each becomes ``str(item or "").strip()``,
            so falsy items turn into ``""`` and are dropped.

    Returns:
        The stripped strings without blanks or repeats.
    """
    cleaned = (str(entry or "").strip() for entry in items)
    # Dict keys keep first-insertion order, giving an order-preserving dedupe.
    return list(dict.fromkeys(value for value in cleaned if value))
|
||||
|
||||
|
||||
def _to_int(value: Any) -> int:
    """Convert ``value`` to an int, returning 0 when that is not possible.

    Args:
        value: Any value; falsy values (``None``, ``""``, ``0``) yield 0.

    Returns:
        ``int(value)`` when the conversion succeeds, otherwise 0.
    """
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is raised by int(float("inf")); json.loads accepts
        # "Infinity", so such a value can come out of a decoded JSON store.
        return 0
|
||||
|
||||
|
||||
def _truncate_text(text: str, *, max_length: int) -> str:
    """Collapse whitespace in ``text`` and cap it at ``max_length`` characters.

    Runs of whitespace (including newlines) become single spaces and the
    ends are trimmed. Text longer than ``max_length`` is cut and suffixed
    with ``"..."`` so that the result, marker included, fits the limit.

    Args:
        text: Text to shorten; falsy values are treated as ``""``.
        max_length: Maximum length of the returned string.

    Returns:
        The normalized text, shortened when necessary. When ``max_length`` is
        smaller than the marker itself the text is hard-cut without a marker
        (and is ``""`` for a non-positive limit), so the limit always holds.
    """
    marker = "..."
    # split()/join already trims both ends, so no extra strip() is needed.
    normalized = " ".join(str(text or "").split())
    if len(normalized) <= max_length:
        return normalized
    if max_length < len(marker):
        # A negative slice bound would count from the end of the string and
        # return more than max_length characters, so clamp and hard-cut.
        return normalized[: max(max_length, 0)]
    return f"{normalized[: max_length - len(marker)].rstrip()}{marker}"
|
||||
Reference in New Issue
Block a user