"""Build compact, size-capped summaries of ingested documents for ingest logs."""

from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any

# Upper bounds on how many items (or characters) are kept in a summary.
MAX_INGEST_LOG_CHUNKS = 24
MAX_INGEST_LOG_ENTITIES = 24
MAX_INGEST_LOG_RELATIONS = 24
MAX_INGEST_LOG_SECTIONS = 12
MAX_INGEST_LOG_TEXT_PREVIEW = 180

# A heading is either a Markdown heading with 1-4 leading "#" characters or a
# Chinese numbered heading such as "第一章 ...", "第2节 ..." or "第三条 ...".
INGEST_SECTION_HEADING_PATTERN = re.compile(
    r"^(?:#{1,4}\s+.+|第[一二三四五六七八九十百零0-9]+[章节条]\s*.*)$"
)


def build_ingest_document_summary(
    *,
    document_id: str,
    entry: dict[str, Any],
    raw_text: str,
    indexed_text: str,
) -> dict[str, Any]:
    """Return the initial summary for a document that is about to be ingested.

    Args:
        document_id: Identifier stored verbatim under ``"document_id"``.
        entry: Metadata mapping; the keys ``original_name``, ``folder``,
            ``extension`` and ``mime_type`` are read, stringified and stripped.
        raw_text: Original text; only its length is recorded.
        indexed_text: Text used for indexing; its length is recorded and it is
            scanned for section headings.

    Returns:
        A dict with the document metadata, text lengths and detected sections.
        Chunk, entity and relation fields are initialised to zero / empty.
    """
    raw_text_value = str(raw_text or "")
    indexed_text_value = str(indexed_text or "")
    sections = _extract_ingest_sections(indexed_text_value)
    return {
        "document_id": document_id,
        "name": str(entry.get("original_name") or "").strip(),
        "folder": str(entry.get("folder") or "").strip(),
        "extension": str(entry.get("extension") or "").strip(),
        "mime_type": str(entry.get("mime_type") or "").strip(),
        "text_chars": len(raw_text_value),
        "indexed_text_chars": len(indexed_text_value),
        "section_count": len(sections),
        "sections": sections,
        "chunk_count": 0,
        "chunk_ids": [],
        "chunks": [],
        "entity_count": 0,
        "relation_count": 0,
        "entities": [],
        "relations": [],
    }


def build_ingest_status_summary(
    *,
    status_payload: dict[str, Any],
    graph_summary: dict[str, Any],
) -> dict[str, Any]:
    """Combine a status payload with a graph summary into one flat dict.

    Args:
        status_payload: Mapping from which ``status``, ``query_ready``,
            ``chunks_count`` and ``chunks_list`` are read.
        graph_summary: Mapping merged into the result last, so any key it
            shares with the status fields takes precedence.

    Returns:
        A dict with the status fields, the chunk count, at most
        ``MAX_INGEST_LOG_CHUNKS`` chunk ids, and every ``graph_summary`` key.
    """
    chunk_ids = _normalize_chunk_ids(status_payload)
    chunk_count = _resolve_chunk_count(status_payload, chunk_ids)
    return {
        "lightrag_status": str(status_payload.get("status") or "").strip(),
        "query_ready": bool(status_payload.get("query_ready")),
        "chunk_count": chunk_count,
        "chunk_ids": chunk_ids[:MAX_INGEST_LOG_CHUNKS],
        **graph_summary,
    }


def build_document_graph_summary(
    storage_root: Path,
    *,
    workspace: str,
    document_id: str,
) -> dict[str, Any]:
    """Summarise the entities, relations and chunks stored for one document.

    Reads three JSON files from
    ``<storage_root>/knowledge/.lightrag/<workspace>/``. A missing, unreadable
    or malformed file is treated as empty.

    Args:
        storage_root: Root directory of the storage tree.
        workspace: Workspace directory name (stringified and stripped).
        document_id: Document whose records are selected from each file.

    Returns:
        A dict with the total entity and relation counts plus the entity,
        relation and chunk lists, each capped by its ``MAX_INGEST_LOG_*`` limit.
    """
    workspace_dir = (
        Path(storage_root) / "knowledge" / ".lightrag" / str(workspace).strip()
    ).resolve()
    entities_payload = _load_json_file(workspace_dir / "kv_store_full_entities.json")
    relations_payload = _load_json_file(workspace_dir / "kv_store_full_relations.json")
    chunks_payload = _load_json_file(workspace_dir / "kv_store_text_chunks.json")

    entities = _normalize_document_entities(entities_payload, document_id)
    relations = _normalize_document_relations(relations_payload, document_id)
    chunks = _normalize_document_chunks(chunks_payload, document_id)
    return {
        # Counts describe the full lists; the lists themselves are capped.
        "entity_count": len(entities),
        "relation_count": len(relations),
        "entities": entities[:MAX_INGEST_LOG_ENTITIES],
        "relations": relations[:MAX_INGEST_LOG_RELATIONS],
        "chunks": chunks[:MAX_INGEST_LOG_CHUNKS],
    }


def _extract_ingest_sections(text: str) -> list[dict[str, str]]:
    """Return up to ``MAX_INGEST_LOG_SECTIONS`` headings with a short excerpt.

    Lines longer than 90 characters are never treated as headings here.
    """
    sections: list[dict[str, str]] = []
    lines = [line.strip() for line in str(text or "").splitlines()]
    for index, line in enumerate(lines):
        if len(sections) >= MAX_INGEST_LOG_SECTIONS:
            break
        if not line or len(line) > 90:
            continue
        if not INGEST_SECTION_HEADING_PATTERN.match(line):
            continue
        sections.append(
            {
                "title": line.lstrip("#").strip(),
                "excerpt": _find_following_excerpt(lines[index + 1 :]),
            }
        )
    return sections


def _find_following_excerpt(lines: list[str]) -> str:
    """Collect the text that follows a heading, up to the preview limit.

    Blank lines are skipped. Collection stops at the next line matching the
    heading pattern, or once the space-joined text reaches
    ``MAX_INGEST_LOG_TEXT_PREVIEW`` characters.
    """
    collected: list[str] = []
    joined_length = 0  # length of " ".join(collected), maintained incrementally
    for line in lines:
        if not line:
            continue
        if INGEST_SECTION_HEADING_PATTERN.match(line):
            break
        joined_length += len(line) + (1 if collected else 0)
        collected.append(line)
        if joined_length >= MAX_INGEST_LOG_TEXT_PREVIEW:
            break
    return _truncate_text(" ".join(collected), max_length=MAX_INGEST_LOG_TEXT_PREVIEW)


def _normalize_chunk_ids(status_payload: dict[str, Any]) -> list[str]:
    """Return the non-empty, stripped string ids found in ``chunks_list``."""
    chunks_list = status_payload.get("chunks_list")
    if not isinstance(chunks_list, list):
        return []
    return [str(item).strip() for item in chunks_list if str(item or "").strip()]


def _resolve_chunk_count(status_payload: dict[str, Any], chunk_ids: list[str]) -> int:
    """Return ``chunks_count`` as an int, falling back to ``len(chunk_ids)``.

    The fallback is used when the value is missing, falsy or not convertible.
    """
    try:
        return int(status_payload.get("chunks_count") or len(chunk_ids))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")); JSON parsers may yield Infinity.
        return len(chunk_ids)


def _load_json_file(path: Path) -> dict[str, Any]:
    """Load a JSON object from ``path``; return ``{}`` on any failure.

    Missing files, I/O errors, invalid JSON and non-object top-level values
    all produce an empty dict.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError, OSError):
        return {}
    return payload if isinstance(payload, dict) else {}


def _normalize_document_entities(payload: dict[str, Any], document_id: str) -> list[str]:
    """Return the de-duplicated ``entity_names`` recorded for ``document_id``."""
    document_payload = payload.get(document_id) if isinstance(payload, dict) else {}
    entity_names = (
        document_payload.get("entity_names") if isinstance(document_payload, dict) else []
    )
    if not isinstance(entity_names, list):
        return []
    return _dedupe_text_items(entity_names)


def _normalize_document_relations(
    payload: dict[str, Any], document_id: str
) -> list[dict[str, str]]:
    """Return unique ``(source, target)`` relations recorded for ``document_id``.

    Entries of ``relation_pairs`` that are not sequences of at least two items,
    or whose endpoints are empty after stripping, are ignored.
    """
    document_payload = payload.get(document_id) if isinstance(payload, dict) else {}
    relation_pairs = (
        document_payload.get("relation_pairs") if isinstance(document_payload, dict) else []
    )
    if not isinstance(relation_pairs, list):
        return []

    relations: list[dict[str, str]] = []
    seen: set[tuple[str, str]] = set()
    for pair in relation_pairs:
        if not isinstance(pair, (list, tuple)) or len(pair) < 2:
            continue
        source = str(pair[0] or "").strip()
        target = str(pair[1] or "").strip()
        if not source or not target or (source, target) in seen:
            continue
        seen.add((source, target))
        # Every relation gets the same fixed label ("关联", i.e. "association").
        relations.append({"source": source, "target": target, "type": "关联"})
    return relations


def _normalize_document_chunks(payload: dict[str, Any], document_id: str) -> list[dict[str, Any]]:
    """Return summaries of the chunks whose ``full_doc_id`` is ``document_id``.

    The result is sorted by ``(order, id)``.
    """
    # Guard like the sibling normalizers so a non-dict payload yields no chunks.
    if not isinstance(payload, dict):
        return []

    chunks: list[dict[str, Any]] = []
    for chunk_id, raw_chunk in payload.items():
        if not isinstance(raw_chunk, dict):
            continue
        if str(raw_chunk.get("full_doc_id") or "").strip() != document_id:
            continue
        content = str(raw_chunk.get("content") or "").strip()
        chunks.append(
            {
                # Prefer the record's own "_id"; fall back to the mapping key.
                "id": str(raw_chunk.get("_id") or chunk_id).strip(),
                "order": _to_int(raw_chunk.get("chunk_order_index")),
                "tokens": _to_int(raw_chunk.get("tokens")),
                "summary": _build_chunk_summary(content),
            }
        )
    return sorted(chunks, key=lambda item: (item["order"], item["id"]))


def _build_chunk_summary(content: str) -> str:
    """Return a one-line preview of ``content``.

    Uses the first non-blank line of at least 12 characters, otherwise the
    first non-blank line, otherwise an empty string.
    """
    lines = [line.strip() for line in str(content or "").splitlines() if line.strip()]
    text = next((line for line in lines if len(line) >= 12), lines[0] if lines else "")
    return _truncate_text(text, max_length=MAX_INGEST_LOG_TEXT_PREVIEW)


def _dedupe_text_items(items: list[Any]) -> list[str]:
    """Return stripped, non-empty strings from ``items`` in first-seen order."""
    cleaned = (str(item or "").strip() for item in items)
    # dict.fromkeys keeps insertion order, giving order-preserving de-duplication.
    return list(dict.fromkeys(text for text in cleaned if text))


def _to_int(value: Any) -> int:
    """Convert ``value`` to ``int``; return 0 when it is falsy or not convertible."""
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")); JSON parsers may yield Infinity.
        return 0


def _truncate_text(text: str, *, max_length: int) -> str:
    """Collapse whitespace in ``text`` and cap it at ``max_length`` characters.

    Text over the limit is cut and suffixed with ``"..."`` so that the result
    is at most ``max_length`` long. When ``max_length`` leaves no room for the
    ellipsis (3 or fewer), the text is hard-cut instead.
    """
    normalized = " ".join(str(text or "").split()).strip()
    if len(normalized) <= max_length:
        return normalized
    if max_length <= 3:
        # A negative slice bound would drop characters from the wrong end.
        return normalized[: max(max_length, 0)]
    return f"{normalized[: max_length - 3].rstrip()}..."