# server/src/app/services/knowledge_ingest_log.py

from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any

# Upper bounds on how many items of each kind are kept in an ingest log entry.
MAX_INGEST_LOG_CHUNKS = 24
MAX_INGEST_LOG_ENTITIES = 24
MAX_INGEST_LOG_RELATIONS = 24
MAX_INGEST_LOG_SECTIONS = 12
# Maximum character length of section excerpts and chunk summaries.
MAX_INGEST_LOG_TEXT_PREVIEW = 180

# Matches a whole line that is either a Markdown heading with one to four
# leading "#" characters followed by whitespace and text, or a Chinese numbered
# heading: "第" + Chinese/Arabic numerals + one of "章", "节", "条".
INGEST_SECTION_HEADING_PATTERN = re.compile(
    r"^(?:#{1,4}\s+.+|第[一二三四五六七八九十百零0-9]+[章节条]\s*.*)$"
)


def build_ingest_document_summary(
    *,
    document_id: str,
    entry: dict[str, Any],
    raw_text: str,
    indexed_text: str,
) -> dict[str, Any]:
    """Build the initial ingest-log summary for a single document.

    Args:
        document_id: Identifier stored verbatim under ``"document_id"``.
        entry: Metadata mapping; the ``original_name``, ``folder``,
            ``extension`` and ``mime_type`` keys are read, stringified and
            stripped (missing or falsy values become ``""``).
        raw_text: Source text; only its length is recorded.
        indexed_text: Text that is scanned for section headings; its length
            is recorded as well.

    Returns:
        A dict with the document metadata, character counts, the detected
        sections, and zeroed/empty chunk, entity and relation fields.
    """
    raw = str(raw_text or "")
    indexed = str(indexed_text or "")
    detected_sections = _extract_ingest_sections(indexed)

    summary: dict[str, Any] = {"document_id": document_id}

    # Output key -> key to read from the metadata entry.
    metadata_fields = (
        ("name", "original_name"),
        ("folder", "folder"),
        ("extension", "extension"),
        ("mime_type", "mime_type"),
    )
    for output_key, entry_key in metadata_fields:
        summary[output_key] = str(entry.get(entry_key) or "").strip()

    summary.update(
        text_chars=len(raw),
        indexed_text_chars=len(indexed),
        section_count=len(detected_sections),
        sections=detected_sections,
    )
    # Chunk and graph details start out empty in this summary.
    summary.update(
        chunk_count=0,
        chunk_ids=[],
        chunks=[],
        entity_count=0,
        relation_count=0,
        entities=[],
        relations=[],
    )
    return summary


def build_ingest_status_summary(
    *,
    status_payload: dict[str, Any],
    graph_summary: dict[str, Any],
) -> dict[str, Any]:
    """Combine a status payload and a graph summary into one log summary.

    Args:
        status_payload: Mapping from which ``status``, ``query_ready``,
            ``chunks_list`` and ``chunks_count`` are read.
        graph_summary: Mapping merged on top of the status fields; any key it
            shares with them takes precedence.

    Returns:
        A dict with ``lightrag_status``, ``query_ready``, ``chunk_count`` and
        at most ``MAX_INGEST_LOG_CHUNKS`` entries in ``chunk_ids``, followed
        by the graph summary entries.
    """
    normalized_ids = _normalize_chunk_ids(status_payload)
    total_chunks = _resolve_chunk_count(status_payload, normalized_ids)

    status_fields: dict[str, Any] = {
        "lightrag_status": str(status_payload.get("status") or "").strip(),
        "query_ready": bool(status_payload.get("query_ready")),
        "chunk_count": total_chunks,
        "chunk_ids": normalized_ids[:MAX_INGEST_LOG_CHUNKS],
    }
    return {**status_fields, **graph_summary}


def build_document_graph_summary(
    storage_root: Path,
    *,
    workspace: str,
    document_id: str,
) -> dict[str, Any]:
    """Summarize the stored entities, relations and chunks of one document.

    Reads three JSON stores from
    ``<storage_root>/knowledge/.lightrag/<workspace>`` and keeps only the data
    belonging to ``document_id``. Unreadable or malformed stores are treated
    as empty by the loader.

    Args:
        storage_root: Base directory that contains the ``knowledge`` folder.
        workspace: Workspace sub-directory name; stringified and stripped.
        document_id: Document whose graph data is summarized.

    Returns:
        A dict with the total ``entity_count`` and ``relation_count`` plus the
        ``entities``, ``relations`` and ``chunks`` lists, each capped by its
        ``MAX_INGEST_LOG_*`` limit.
    """
    lightrag_root = Path(storage_root).joinpath("knowledge", ".lightrag")
    store_dir = (lightrag_root / str(workspace).strip()).resolve()

    store_names = (
        "kv_store_full_entities.json",
        "kv_store_full_relations.json",
        "kv_store_text_chunks.json",
    )
    entity_store, relation_store, chunk_store = (
        _load_json_file(store_dir / store_name) for store_name in store_names
    )

    doc_entities = _normalize_document_entities(entity_store, document_id)
    doc_relations = _normalize_document_relations(relation_store, document_id)
    doc_chunks = _normalize_document_chunks(chunk_store, document_id)

    # Counts describe the full lists; the lists themselves are capped.
    summary: dict[str, Any] = {}
    summary["entity_count"] = len(doc_entities)
    summary["relation_count"] = len(doc_relations)
    summary["entities"] = doc_entities[:MAX_INGEST_LOG_ENTITIES]
    summary["relations"] = doc_relations[:MAX_INGEST_LOG_RELATIONS]
    summary["chunks"] = doc_chunks[:MAX_INGEST_LOG_CHUNKS]
    return summary


def _extract_ingest_sections(text: str) -> list[dict[str, str]]:
    """Collect section headings and a short excerpt of the text below each.

    A line counts as a heading when, after stripping, it is non-empty, at
    most 90 characters long and matches ``INGEST_SECTION_HEADING_PATTERN``.
    Scanning stops once ``MAX_INGEST_LOG_SECTIONS`` headings were collected.

    Args:
        text: Document text to scan; falsy values are treated as empty.

    Returns:
        A list of ``{"title", "excerpt"}`` dicts in document order. The title
        is the heading without leading ``#`` characters.
    """
    stripped_lines = [raw.strip() for raw in str(text or "").splitlines()]
    found: list[dict[str, str]] = []

    # ``next_position`` is the index of the line right after the candidate.
    for next_position, candidate in enumerate(stripped_lines, start=1):
        if len(found) >= MAX_INGEST_LOG_SECTIONS:
            break
        looks_like_heading = (
            bool(candidate)
            and len(candidate) <= 90
            and INGEST_SECTION_HEADING_PATTERN.match(candidate) is not None
        )
        if not looks_like_heading:
            continue
        heading_title = candidate.lstrip("#").strip()
        body_excerpt = _find_following_excerpt(stripped_lines[next_position:])
        found.append({"title": heading_title, "excerpt": body_excerpt})
    return found


def _find_following_excerpt(lines: list[str]) -> str:
    """Join the lines that follow a heading into a truncated excerpt.

    Empty lines are skipped. Collection stops at the next line matching
    ``INGEST_SECTION_HEADING_PATTERN`` or once the space-joined text reaches
    ``MAX_INGEST_LOG_TEXT_PREVIEW`` characters.

    Args:
        lines: The (already stripped) lines after the heading.

    Returns:
        The collected text, truncated to ``MAX_INGEST_LOG_TEXT_PREVIEW``.
    """
    pieces: list[str] = []
    joined_length = 0  # length of " ".join(pieces), tracked incrementally
    for entry in lines:
        if not entry:
            continue
        if INGEST_SECTION_HEADING_PATTERN.match(entry):
            break
        # One separating space is needed for every piece after the first.
        joined_length += len(entry) + (1 if pieces else 0)
        pieces.append(entry)
        if joined_length >= MAX_INGEST_LOG_TEXT_PREVIEW:
            break
    excerpt = " ".join(pieces)
    return _truncate_text(excerpt, max_length=MAX_INGEST_LOG_TEXT_PREVIEW)


def _normalize_chunk_ids(status_payload: dict[str, Any]) -> list[str]:
    """Return the stripped, non-empty chunk ids listed in a status payload.

    Args:
        status_payload: Mapping whose ``chunks_list`` value is read.

    Returns:
        The ids as stripped strings in their original order. Falsy items and
        items that are blank after stripping are dropped. An empty list is
        returned when ``chunks_list`` is not a list.
    """
    raw_ids = status_payload.get("chunks_list")
    if not isinstance(raw_ids, list):
        return []

    cleaned_ids: list[str] = []
    for raw_id in raw_ids:
        # Falsy items (None, 0, "") collapse to "" and are skipped below.
        candidate = str(raw_id or "").strip()
        if candidate:
            cleaned_ids.append(candidate)
    return cleaned_ids


def _resolve_chunk_count(status_payload: dict[str, Any], chunk_ids: list[str]) -> int:
    """Return the chunk count reported by the payload, or a fallback.

    Args:
        status_payload: Mapping whose ``chunks_count`` value is read.
        chunk_ids: Normalized chunk ids; their number is the fallback.

    Returns:
        ``chunks_count`` converted with ``int()``. When the value is missing,
        falsy, or cannot be converted, ``len(chunk_ids)`` is returned instead.
    """
    fallback = len(chunk_ids)
    try:
        return int(status_payload.get("chunks_count") or fallback)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of an infinite float, which json.loads can
        # produce from the non-standard "Infinity" literal.
        return fallback


def _load_json_file(path: Path) -> dict[str, Any]:
    """Load a JSON object from ``path``, falling back to an empty dict.

    This is a best-effort reader: a missing or unreadable file, bytes that
    are not valid UTF-8, malformed JSON, and a top-level value that is not a
    JSON object all yield ``{}`` instead of raising.

    Args:
        path: Location of the JSON file, read as UTF-8.

    Returns:
        The decoded top-level object, or ``{}`` on any of the failures above.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers missing/unreadable paths (FileNotFoundError is a
        # subclass). ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for bytes that are not valid UTF-8.
        return {}
    return payload if isinstance(payload, dict) else {}


def _normalize_document_entities(payload: dict[str, Any], document_id: str) -> list[str]:
    """Return the de-duplicated entity names stored for one document.

    Args:
        payload: Mapping from document id to a record with ``entity_names``.
        document_id: Key of the record to read.

    Returns:
        The stripped, unique entity names in first-seen order, or ``[]`` when
        the record or its ``entity_names`` list is missing or has the wrong
        type.
    """
    record: Any = {}
    if isinstance(payload, dict):
        record = payload.get(document_id)
    if not isinstance(record, dict):
        # No usable record for this document: nothing to report.
        return []

    names = record.get("entity_names")
    if isinstance(names, list):
        return _dedupe_text_items(names)
    return []


def _normalize_document_relations(
    payload: dict[str, Any], document_id: str
) -> list[dict[str, str]]:
    """Return the unique (source, target) relations stored for one document.

    Args:
        payload: Mapping from document id to a record with ``relation_pairs``.
        document_id: Key of the record to read.

    Returns:
        One ``{"source", "target", "type"}`` dict per distinct pair, in
        first-seen order. Pairs that are not a list/tuple of at least two
        items, or whose source or target is blank, are skipped. ``[]`` is
        returned when ``relation_pairs`` is not a list.
    """
    record = payload.get(document_id) if isinstance(payload, dict) else {}
    raw_pairs = record.get("relation_pairs") if isinstance(record, dict) else []
    if not isinstance(raw_pairs, list):
        return []

    # Insertion-ordered dict keyed by (source, target) doubles as the
    # "already seen" set and as the ordered result.
    unique_edges: dict[tuple[str, str], dict[str, str]] = {}
    for candidate in raw_pairs:
        if not (isinstance(candidate, (list, tuple)) and len(candidate) >= 2):
            continue
        head, tail = (str(endpoint or "").strip() for endpoint in candidate[:2])
        if head and tail and (head, tail) not in unique_edges:
            unique_edges[(head, tail)] = {"source": head, "target": tail, "type": "关联"}
    return list(unique_edges.values())


def _normalize_document_chunks(payload: dict[str, Any], document_id: str) -> list[dict[str, Any]]:
    """Return ordered summaries of the stored chunks of one document.

    Args:
        payload: Mapping from chunk id to a chunk record. Records are matched
            on their ``full_doc_id`` field.
        document_id: Document whose chunks are selected.

    Returns:
        One ``{"id", "order", "tokens", "summary"}`` dict per matching chunk,
        sorted by ``order`` and then ``id``. ``[]`` is returned when
        ``payload`` is not a dict, matching the other document normalizers.
    """
    # Guard against a malformed store instead of failing on ``.items()``.
    if not isinstance(payload, dict):
        return []

    chunks: list[dict[str, Any]] = []
    for chunk_id, raw_chunk in payload.items():
        if not isinstance(raw_chunk, dict):
            continue
        if str(raw_chunk.get("full_doc_id") or "").strip() != document_id:
            continue
        content = str(raw_chunk.get("content") or "").strip()
        chunks.append(
            {
                # Prefer the record's own "_id"; fall back to the mapping key.
                "id": str(raw_chunk.get("_id") or chunk_id).strip(),
                "order": _to_int(raw_chunk.get("chunk_order_index")),
                "tokens": _to_int(raw_chunk.get("tokens")),
                "summary": _build_chunk_summary(content),
            }
        )
    return sorted(chunks, key=lambda item: (item["order"], item["id"]))


def _build_chunk_summary(content: str) -> str:
    """Pick a representative line of a chunk and truncate it for the log.

    The first non-blank line with at least 12 characters is used; when no
    line is that long, the first non-blank line is used, and ``""`` when the
    content has no non-blank line at all.

    Args:
        content: Chunk text; falsy values are treated as empty.

    Returns:
        The chosen line truncated to ``MAX_INGEST_LOG_TEXT_PREVIEW``.
    """
    stripped = (raw.strip() for raw in str(content or "").splitlines())
    meaningful = [segment for segment in stripped if segment]

    chosen = ""
    if meaningful:
        chosen = meaningful[0]
        for candidate in meaningful:
            if len(candidate) >= 12:
                chosen = candidate
                break
    return _truncate_text(chosen, max_length=MAX_INGEST_LOG_TEXT_PREVIEW)


def _dedupe_text_items(items: list[Any]) -> list[str]:
    """Return the stripped, non-empty, unique strings of ``items``.

    Args:
        items: Values to normalize; each is stringified and stripped, with
            falsy values treated as empty.

    Returns:
        The distinct non-empty strings in first-seen order.
    """
    cleaned = (str(entry or "").strip() for entry in items)
    # dict.fromkeys keeps the first occurrence of every key, in order.
    return list(dict.fromkeys(value for value in cleaned if value))


def _to_int(value: Any) -> int:
    """Convert ``value`` to an int, returning 0 when that is not possible.

    Args:
        value: Any value; falsy values are treated as 0.

    Returns:
        ``int(value)``, or 0 when the value is falsy or the conversion fails.
    """
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of an infinite float, which json.loads can
        # produce from the non-standard "Infinity" literal.
        return 0


def _truncate_text(text: str, *, max_length: int) -> str:
    """Collapse whitespace in ``text`` and cap it at ``max_length`` characters.

    Args:
        text: Text to shorten; falsy values are treated as empty.
        max_length: Maximum length of the returned string.

    Returns:
        The whitespace-normalized text when it fits. Otherwise a prefix
        followed by ``"..."`` so that the result is at most ``max_length``
        characters long. When ``max_length`` is below 3 there is no room for
        the ellipsis, so a plain prefix of ``max_length`` characters (empty
        for non-positive limits) is returned.
    """
    normalized = " ".join(str(text or "").split())
    if len(normalized) <= max_length:
        return normalized
    if max_length < 3:
        # ``max_length - 3`` would be negative and slice from the end of the
        # string, producing a result far longer than the limit.
        return normalized[: max(max_length, 0)]
    return f"{normalized[: max_length - 3].rstrip()}..."
feat: 增强知识库索引与设置页面模块化拆分扩展知识库索引任务和 RAG 检索支持增量入库和文档去重，优化本体检测和规则匹配精度，前端设置页面拆分为 LLM、邮件和 Hermes 员工同步子面板并重构样式，新增日志详情组件和知识入库日志模型，补充单元测试覆盖。 2026-05-22 23:47:28 +08:00			`from __future__ import annotations`

			`import json`
			`import re`
			`from pathlib import Path`
			`from typing import Any`

			`MAX_INGEST_LOG_CHUNKS = 24`
			`MAX_INGEST_LOG_ENTITIES = 24`
			`MAX_INGEST_LOG_RELATIONS = 24`
			`MAX_INGEST_LOG_SECTIONS = 12`
			`MAX_INGEST_LOG_TEXT_PREVIEW = 180`

			`INGEST_SECTION_HEADING_PATTERN = re.compile(`
			`r"^(?:#{1,4}\s+.+\|第[一二三四五六七八九十百零0-9]+[章节条]\s.)$"`
			`)`


			`def build_ingest_document_summary(`
			`*,`
			`document_id: str,`
			`entry: dict[str, Any],`
			`raw_text: str,`
			`indexed_text: str,`
			`) -> dict[str, Any]:`
			`raw_text_value = str(raw_text or "")`
			`indexed_text_value = str(indexed_text or "")`
			`sections = _extract_ingest_sections(indexed_text_value)`
			`return {`
			`"document_id": document_id,`
			`"name": str(entry.get("original_name") or "").strip(),`
			`"folder": str(entry.get("folder") or "").strip(),`
			`"extension": str(entry.get("extension") or "").strip(),`
			`"mime_type": str(entry.get("mime_type") or "").strip(),`
			`"text_chars": len(raw_text_value),`
			`"indexed_text_chars": len(indexed_text_value),`
			`"section_count": len(sections),`
			`"sections": sections,`
			`"chunk_count": 0,`
			`"chunk_ids": [],`
			`"chunks": [],`
			`"entity_count": 0,`
			`"relation_count": 0,`
			`"entities": [],`
			`"relations": [],`
			`}`


			`def build_ingest_status_summary(`
			`*,`
			`status_payload: dict[str, Any],`
			`graph_summary: dict[str, Any],`
			`) -> dict[str, Any]:`
			`chunk_ids = _normalize_chunk_ids(status_payload)`
			`chunk_count = _resolve_chunk_count(status_payload, chunk_ids)`
			`return {`
			`"lightrag_status": str(status_payload.get("status") or "").strip(),`
			`"query_ready": bool(status_payload.get("query_ready")),`
			`"chunk_count": chunk_count,`
			`"chunk_ids": chunk_ids[:MAX_INGEST_LOG_CHUNKS],`
			`**graph_summary,`
			`}`


			`def build_document_graph_summary(`
			`storage_root: Path,`
			`*,`
			`workspace: str,`
			`document_id: str,`
			`) -> dict[str, Any]:`
			`workspace_dir = (`
			`Path(storage_root) / "knowledge" / ".lightrag" / str(workspace).strip()`
			`).resolve()`
			`entities_payload = _load_json_file(workspace_dir / "kv_store_full_entities.json")`
			`relations_payload = _load_json_file(workspace_dir / "kv_store_full_relations.json")`
			`chunks_payload = _load_json_file(workspace_dir / "kv_store_text_chunks.json")`

			`entities = _normalize_document_entities(entities_payload, document_id)`
			`relations = _normalize_document_relations(relations_payload, document_id)`
			`chunks = _normalize_document_chunks(chunks_payload, document_id)`
			`return {`
			`"entity_count": len(entities),`
			`"relation_count": len(relations),`
			`"entities": entities[:MAX_INGEST_LOG_ENTITIES],`
			`"relations": relations[:MAX_INGEST_LOG_RELATIONS],`
			`"chunks": chunks[:MAX_INGEST_LOG_CHUNKS],`
			`}`


			`def _extract_ingest_sections(text: str) -> list[dict[str, str]]:`
			`sections: list[dict[str, str]] = []`
			`lines = [line.strip() for line in str(text or "").splitlines()]`
			`for index, line in enumerate(lines):`
			`if len(sections) >= MAX_INGEST_LOG_SECTIONS:`
			`break`
			`if not line or len(line) > 90 or not INGEST_SECTION_HEADING_PATTERN.match(line):`
			`continue`
			`sections.append(`
			`{`
			`"title": line.lstrip("#").strip(),`
			`"excerpt": _find_following_excerpt(lines[index + 1 :]),`
			`}`
			`)`
			`return sections`


			`def _find_following_excerpt(lines: list[str]) -> str:`
			`collected: list[str] = []`
			`for line in lines:`
			`if not line:`
			`continue`
			`if INGEST_SECTION_HEADING_PATTERN.match(line):`
			`break`
			`collected.append(line)`
			`if len(" ".join(collected)) >= MAX_INGEST_LOG_TEXT_PREVIEW:`
			`break`
			`return _truncate_text(" ".join(collected), max_length=MAX_INGEST_LOG_TEXT_PREVIEW)`


			`def _normalize_chunk_ids(status_payload: dict[str, Any]) -> list[str]:`
			`chunks_list = status_payload.get("chunks_list")`
			`if not isinstance(chunks_list, list):`
			`return []`
			`return [str(item).strip() for item in chunks_list if str(item or "").strip()]`


			`def _resolve_chunk_count(status_payload: dict[str, Any], chunk_ids: list[str]) -> int:`
			`try:`
			`return int(status_payload.get("chunks_count") or len(chunk_ids))`
			`except (TypeError, ValueError):`
			`return len(chunk_ids)`


			`def _load_json_file(path: Path) -> dict[str, Any]:`
			`try:`
			`payload = json.loads(path.read_text(encoding="utf-8"))`
			`except (FileNotFoundError, json.JSONDecodeError, OSError):`
			`return {}`
			`return payload if isinstance(payload, dict) else {}`


			`def _normalize_document_entities(payload: dict[str, Any], document_id: str) -> list[str]:`
			`document_payload = payload.get(document_id) if isinstance(payload, dict) else {}`
			`entity_names = (`
			`document_payload.get("entity_names") if isinstance(document_payload, dict) else []`
			`)`
			`if not isinstance(entity_names, list):`
			`return []`
			`return _dedupe_text_items(entity_names)`


			`def _normalize_document_relations(`
			`payload: dict[str, Any], document_id: str`
			`) -> list[dict[str, str]]:`
			`document_payload = payload.get(document_id) if isinstance(payload, dict) else {}`
			`relation_pairs = (`
			`document_payload.get("relation_pairs") if isinstance(document_payload, dict) else []`
			`)`
			`if not isinstance(relation_pairs, list):`
			`return []`

			`relations: list[dict[str, str]] = []`
			`seen: set[tuple[str, str]] = set()`
			`for pair in relation_pairs:`
			`if not isinstance(pair, (list, tuple)) or len(pair) < 2:`
			`continue`
			`source = str(pair[0] or "").strip()`
			`target = str(pair[1] or "").strip()`
			`if not source or not target or (source, target) in seen:`
			`continue`
			`seen.add((source, target))`
			`relations.append({"source": source, "target": target, "type": "关联"})`
			`return relations`


			`def _normalize_document_chunks(payload: dict[str, Any], document_id: str) -> list[dict[str, Any]]:`
			`chunks: list[dict[str, Any]] = []`
			`for chunk_id, raw_chunk in payload.items():`
			`if not isinstance(raw_chunk, dict):`
			`continue`
			`if str(raw_chunk.get("full_doc_id") or "").strip() != document_id:`
			`continue`
			`content = str(raw_chunk.get("content") or "").strip()`
			`chunks.append(`
			`{`
			`"id": str(raw_chunk.get("_id") or chunk_id).strip(),`
			`"order": _to_int(raw_chunk.get("chunk_order_index")),`
			`"tokens": _to_int(raw_chunk.get("tokens")),`
			`"summary": _build_chunk_summary(content),`
			`}`
			`)`
			`return sorted(chunks, key=lambda item: (item["order"], item["id"]))`


			`def _build_chunk_summary(content: str) -> str:`
			`lines = [line.strip() for line in str(content or "").splitlines() if line.strip()]`
			`text = next((line for line in lines if len(line) >= 12), lines[0] if lines else "")`
			`return _truncate_text(text, max_length=MAX_INGEST_LOG_TEXT_PREVIEW)`


			`def _dedupe_text_items(items: list[Any]) -> list[str]:`
			`deduped: list[str] = []`
			`seen: set[str] = set()`
			`for item in items:`
			`text = str(item or "").strip()`
			`if not text or text in seen:`
			`continue`
			`seen.add(text)`
			`deduped.append(text)`
			`return deduped`


			`def _to_int(value: Any) -> int:`
			`try:`
			`return int(value or 0)`
			`except (TypeError, ValueError):`
			`return 0`


			`def _truncate_text(text: str, *, max_length: int) -> str:`
			`normalized = " ".join(str(text or "").split()).strip()`
			`if len(normalized) <= max_length:`
			`return normalized`
			`return f"{normalized[: max_length - 3].rstrip()}..."`