# X-Financial/server/src/app/services/knowledge.py

from __future__ import annotations

import hashlib
import json
import mimetypes
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from urllib.request import Request, urlopen
from uuid import uuid4

from sqlalchemy.orm import Session

from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentRunStatus
from app.core.config import get_settings
from app.core.logging import get_logger
from app.schemas.knowledge import (
    KnowledgeDocumentDetailRead,
    KnowledgeDocumentRead,
    KnowledgeFolderRead,
    KnowledgeLibraryRead,
    KnowledgeOnlyOfficeConfigRead,
    KnowledgePreviewPageRead,
)
from app.services.knowledge_rag import KnowledgeRagService

logger = get_logger("app.services.knowledge")

from app.services.knowledge_constants import (
    FIXED_KNOWLEDGE_FOLDERS,
    ICON_BY_TYPE,
    KNOWLEDGE_INGEST_STATUS_FAILED,
    KNOWLEDGE_INGEST_STATUS_INGESTED,
    KNOWLEDGE_INGEST_STATUS_META,
    KNOWLEDGE_INGEST_STATUS_PUBLISHED,
    KNOWLEDGE_INGEST_STATUS_SYNCING,
    KNOWLEDGE_SEARCH_RESULT_LIMIT,
)
from app.services.knowledge_document_extractors import (
    _extract_docx_text,
    _extract_document_text_from_path,
    _extract_pdf_text,
    _extract_pptx_slides,
    _extract_text_with_ocr,
    _extract_xlsx_sheets,
    _normalize_extracted_text,
    _read_text_preview,
)
from app.services.knowledge_file_utils import (
    can_preview,
    extract_extension,
    format_size,
    format_time,
    normalize_filename,
    normalize_folder,
    parse_stored_name,
    resolve_file_type,
    resolve_file_type_label,
)
from app.services.knowledge_onlyoffice import (
    OnlyOfficeCallbackPayload,
    build_onlyoffice_config as build_onlyoffice_config_payload,
    build_onlyoffice_access_token,
    build_onlyoffice_document_key,
    parse_onlyoffice_callback,
    resolve_onlyoffice_document_type,
    validate_onlyoffice_access_token,
)
from app.services.knowledge_ingest_status import (
    is_syncing_status_stale,
    normalize_ingest_status_code,
    resolve_linked_ingest_run_status,
    should_preserve_syncing_status,
)
from app.services.knowledge_preview import build_preview


def prepare_knowledge_library() -> None:
    """Prepare the on-disk knowledge library using the default settings.

    Builds a ``KnowledgeService`` without arguments and delegates to its
    ``ensure_library_ready`` method.
    """
    service = KnowledgeService()
    service.ensure_library_ready()


class KnowledgeService:
    def __init__(self, storage_root: Path | None = None, db: Session | None = None) -> None:
        settings = get_settings()
        self.db = db
        self.storage_root = Path(storage_root or settings.resolved_storage_root_dir)
        self.library_root = self.storage_root / "knowledge"
        self.index_path = self.library_root / ".index.json"

    def ensure_library_ready(self) -> None:
        self.library_root.mkdir(parents=True, exist_ok=True)
        for folder_name in FIXED_KNOWLEDGE_FOLDERS:
            (self.library_root / folder_name).mkdir(parents=True, exist_ok=True)

        if not self.index_path.exists():
            self._save_index({"version": 1, "documents": []})

        index = self._load_index()
        if self._reconcile_index(index):
            self._save_index(index)

    def list_library(self) -> KnowledgeLibraryRead:
        documents = self._load_documents()
        folders = [
            KnowledgeFolderRead(
                name=folder_name,
                count=sum(1 for item in documents if item.folder == folder_name),
                icon="mdi mdi-folder",
            )
            for folder_name in FIXED_KNOWLEDGE_FOLDERS
        ]
        return KnowledgeLibraryRead(folders=folders, documents=documents)

    def get_document_detail(self, document_id: str) -> KnowledgeDocumentDetailRead:
        self.ensure_library_ready()
        index = self._load_index()
        if self._reconcile_document_ingest_statuses(index, document_ids=[document_id]):
            self._save_index(index)
        entry = self._require_entry(index, document_id)
        preview_kind, preview_pages = self._build_preview(entry)
        document = self._serialize_document(entry)
        return KnowledgeDocumentDetailRead(
            **document.model_dump(),
            previewKind=preview_kind,
            previewPages=preview_pages,
        )

    def upload_document(
        self,
        folder: str,
        filename: str,
        content: bytes,
        current_user: CurrentUserContext,
    ) -> KnowledgeDocumentDetailRead:
        self.ensure_library_ready()
        normalized_folder = self._normalize_folder(folder)
        normalized_name = self._normalize_filename(filename)

        if not content:
            raise ValueError("上传文件不能为空。")

        rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
        index = self._load_index()
        existing_entry = next(
            (
                item
                for item in index["documents"]
                if item["folder"] == normalized_folder
                and item["original_name"].lower() == normalized_name.lower()
            ),
            None,
        )

        document_id = existing_entry["id"] if existing_entry else uuid4().hex
        stored_name = f"{document_id}__{normalized_name}"
        target_path = self.library_root / normalized_folder / stored_name

        if existing_entry is not None:
            rag_service.delete_document(document_id)
            if existing_entry["stored_name"] != stored_name:
                old_path = (
                    self.library_root / existing_entry["folder"] / existing_entry["stored_name"]
                )
                if old_path.exists():
                    old_path.unlink()

        target_path.write_bytes(content)

        now = datetime.now(UTC).isoformat()
        mime_type = mimetypes.guess_type(normalized_name)[0] or "application/octet-stream"
        checksum = hashlib.sha256(content).hexdigest()
        extension = self._extract_extension(normalized_name)

        if existing_entry is None:
            entry = {
                "id": document_id,
                "folder": normalized_folder,
                "original_name": normalized_name,
                "stored_name": stored_name,
                "mime_type": mime_type,
                "extension": extension,
                "size_bytes": len(content),
                "sha256": checksum,
                "created_at": now,
                "updated_at": now,
                "uploaded_by": current_user.name,
                "version_number": 1,
                "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
                "ingest_status_updated_at": now,
                "ingest_completed_at": "",
                "ingest_document_name": "",
                "ingest_document_updated_at": "",
                "ingest_document_sha256": "",
                "ingest_agent_run_id": "",
            }
            index["documents"].append(entry)
            logger.info(
                "Knowledge document uploaded id=%s folder=%s filename=%s by=%s",
                document_id,
                normalized_folder,
                normalized_name,
                current_user.name,
            )
        else:
            existing_entry.update(
                {
                    "stored_name": stored_name,
                    "mime_type": mime_type,
                    "extension": extension,
                    "size_bytes": len(content),
                    "sha256": checksum,
                    "updated_at": now,
                    "uploaded_by": current_user.name,
                    "version_number": int(existing_entry.get("version_number", 1)) + 1,
                    "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
                    "ingest_status_updated_at": now,
                    "ingest_completed_at": "",
                    "ingest_document_name": "",
                    "ingest_document_updated_at": "",
                    "ingest_document_sha256": "",
                    "ingest_agent_run_id": "",
                }
            )
            entry = existing_entry
            logger.info(
                "Knowledge document updated id=%s folder=%s filename=%s by=%s",
                document_id,
                normalized_folder,
                normalized_name,
                current_user.name,
            )

        self._save_index(index)
        return self.get_document_detail(document_id)

    def delete_document(self, document_id: str) -> None:
        self.ensure_library_ready()
        index = self._load_index()
        entry = self._require_entry(index, document_id)
        file_path = self._resolve_document_path(entry)
        if file_path.exists():
            file_path.unlink()

        index["documents"] = [item for item in index["documents"] if item["id"] != document_id]
        self._save_index(index)
        KnowledgeRagService(db=self.db, storage_root=self.storage_root).delete_document(document_id)
        logger.info(
            "Knowledge document deleted id=%s filename=%s", document_id, entry["original_name"]
        )

    def get_document_content(self, document_id: str) -> tuple[Path, str, str]:
        self.ensure_library_ready()
        index = self._load_index()
        entry = self._require_entry(index, document_id)
        file_path = self._resolve_document_path(entry)

        if not file_path.exists():
            raise FileNotFoundError(entry["original_name"])

        return file_path, entry["mime_type"], entry["original_name"]

    def list_folder_documents(self, folder: str | None = None) -> list[dict[str, Any]]:
        self.ensure_library_ready()
        index = self._load_index()
        if self._reconcile_document_ingest_statuses(index):
            self._save_index(index)
        documents = list(index.get("documents") or [])
        if folder is None:
            return documents
        normalized_folder = self._normalize_folder(folder)
        return [item for item in documents if item.get("folder") == normalized_folder]

    def list_documents_for_ingest(
        self,
        *,
        folder: str | None = None,
        document_ids: list[str] | None = None,
        changed_only: bool = False,
    ) -> list[dict[str, Any]]:
        documents = self.list_folder_documents(folder=folder)
        requested_ids = {str(item).strip() for item in document_ids or [] if str(item).strip()}
        if requested_ids:
            documents = [
                item for item in documents if str(item.get("id") or "").strip() in requested_ids
            ]
        if changed_only:
            documents = [item for item in documents if self._should_index_document(item)]
        return documents

    def get_document_entry(self, document_id: str) -> dict[str, Any]:
        self.ensure_library_ready()
        index = self._load_index()
        if self._reconcile_document_ingest_statuses(index, document_ids=[document_id]):
            self._save_index(index)
        return dict(self._require_entry(index, document_id))

    def set_document_ingest_statuses(
        self,
        document_ids: list[str],
        status_code: int,
        *,
        agent_run_id: str | None = None,
    ) -> None:
        self.ensure_library_ready()
        normalized_ids = {str(item).strip() for item in document_ids if str(item).strip()}
        if not normalized_ids:
            return

        index = self._load_index()
        changed = False
        updated_at = datetime.now(UTC).isoformat()
        for entry in index.get("documents", []):
            if str(entry.get("id") or "").strip() not in normalized_ids:
                continue
            changed = (
                self._apply_ingest_status_to_entry(
                    entry,
                    status_code=status_code,
                    updated_at=updated_at,
                    agent_run_id=agent_run_id,
                )
                or changed
            )

        if changed:
            self._save_index(index)

    def refresh_document_ingest_statuses(
        self,
        document_ids: list[str] | None = None,
        *,
        preserve_syncing: bool = True,
    ) -> None:
        self.ensure_library_ready()
        index = self._load_index()
        if self._reconcile_document_ingest_statuses(
            index,
            document_ids=document_ids,
            preserve_syncing=preserve_syncing,
        ):
            self._save_index(index)

    def search_knowledge(
        self,
        query: str,
        *,
        conversation_history: list[dict[str, str]] | None = None,
        limit: int = KNOWLEDGE_SEARCH_RESULT_LIMIT,
    ) -> dict[str, Any]:
        self.ensure_library_ready()
        return KnowledgeRagService(db=self.db, storage_root=self.storage_root).query_knowledge(
            query,
            conversation_history=conversation_history,
            limit=limit,
        )

    def extract_document_text(self, document_id: str) -> str:
        self.ensure_library_ready()
        entry = self.get_document_entry(document_id)
        file_path = self._resolve_document_path(entry)
        if not file_path.exists():
            raise FileNotFoundError(entry["original_name"])
        return self._extract_document_text_from_path(
            file_path=file_path,
            original_name=str(entry.get("original_name") or file_path.name),
            mime_type=str(entry.get("mime_type") or "application/octet-stream"),
        )

    def build_onlyoffice_config(
        self,
        document_id: str,
        current_user: CurrentUserContext,
    ) -> KnowledgeOnlyOfficeConfigRead:
        self.ensure_library_ready()
        index = self._load_index()
        entry = self._require_entry(index, document_id)
        return build_onlyoffice_config_payload(
            document_id=document_id,
            entry=entry,
            current_user=current_user,
        )

    def validate_onlyoffice_access_token(self, document_id: str, access_token: str) -> None:
        validate_onlyoffice_access_token(document_id, access_token)

    def handle_onlyoffice_callback(self, document_id: str, payload: dict[str, Any]) -> None:
        self.ensure_library_ready()
        callback = self._parse_onlyoffice_callback(payload)
        if callback.status not in {2, 6} or not callback.download_url:
            return

        logger.info(
            "ONLYOFFICE callback received id=%s status=%s users=%s",
            document_id,
            callback.status,
            ",".join(callback.users) if callback.users else "-",
        )

        request = Request(callback.download_url, headers={"User-Agent": "x-financial-onlyoffice"})
        with urlopen(request, timeout=30) as response:  # noqa: S310
            content = response.read()

        actor_name = callback.users[0] if callback.users else "ONLYOFFICE"
        self._replace_document_content(document_id, content, actor_name=actor_name)

    def _load_documents(self) -> list[KnowledgeDocumentRead]:
        self.ensure_library_ready()
        index = self._load_index()
        changed = self._reconcile_index(index)
        changed = self._reconcile_document_ingest_statuses(index) or changed
        if changed:
            self._save_index(index)

        documents = [self._serialize_document(entry) for entry in index["documents"]]
        return sorted(documents, key=lambda item: item.time, reverse=True)

    def _serialize_document(
        self,
        entry: dict[str, Any],
    ) -> KnowledgeDocumentRead:
        extension = entry.get("extension") or self._extract_extension(entry["original_name"])
        file_type = self._resolve_file_type(extension)
        size_bytes = int(entry.get("size_bytes") or 0)
        updated_at = self._format_time(entry.get("updated_at") or entry.get("created_at"))
        ingest_time = self._format_time(entry.get("ingest_completed_at"))
        state_code = normalize_ingest_status_code(entry.get("ingest_status"))
        state_label, state_tone = KNOWLEDGE_INGEST_STATUS_META.get(
            state_code,
            KNOWLEDGE_INGEST_STATUS_META[KNOWLEDGE_INGEST_STATUS_PUBLISHED],
        )

        return KnowledgeDocumentRead(
            id=entry["id"],
            name=entry["original_name"],
            folder=entry["folder"],
            tag=f"{entry['folder']} / {extension.upper() or 'FILE'}",
            time=updated_at,
            ingestTime=ingest_time if state_code == KNOWLEDGE_INGEST_STATUS_INGESTED else "",
            version=f"v{int(entry.get('version_number', 1))}.0",
            stateCode=state_code,
            state=state_label,
            stateTone=state_tone,
            owner=entry.get("uploaded_by") or "????",
            icon=ICON_BY_TYPE.get(file_type, ICON_BY_TYPE["binary"]),
            fileType=file_type,
            fileTypeLabel=self._resolve_file_type_label(file_type),
            summary=f"{entry['folder']} ? {extension.upper() or 'FILE'} ? {self._format_size(size_bytes)}",
            mimeType=entry.get("mime_type") or "application/octet-stream",
            extension=extension,
            sizeBytes=size_bytes,
            canPreview=self._can_preview(extension),
            llmWikiAvailable=False,
            llmWikiQualityStatus="",
            llmWikiQualityNote="",
        )

    def _build_preview(self, entry: dict[str, Any]) -> tuple[str, list[KnowledgePreviewPageRead]]:
        return build_preview(entry, resolve_document_path=self._resolve_document_path)

    def _load_index(self) -> dict[str, Any]:
        try:
            payload = json.loads(self.index_path.read_text(encoding="utf-8"))
        except (FileNotFoundError, json.JSONDecodeError):
            payload = {"version": 1, "documents": []}
        payload.setdefault("documents", [])
        return payload

    def _save_index(self, index: dict[str, Any]) -> None:
        self.index_path.write_text(
            json.dumps(index, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    def _reconcile_index(self, index: dict[str, Any]) -> bool:
        """Bring the index in line with the files actually present on disk.

        Entries whose stored file no longer exists are dropped, surviving
        entries get their size, extension and MIME type refreshed and any
        missing ingest bookkeeping keys backfilled, and files found in the
        fixed folders that no entry refers to are added as new entries.

        Args:
            index: The loaded index; mutated in place.

        Returns:
            True when the caller should persist the index, False otherwise.
        """
        changed = False
        documents = index.setdefault("documents", [])
        # Lookup of already-indexed files, keyed by (folder, stored file name).
        known_by_stored = {
            (item["folder"], item["stored_name"]): item
            for item in documents
            if item.get("folder") and item.get("stored_name")
        }

        existing_items: list[dict[str, Any]] = []
        for item in documents:
            file_path = self._resolve_document_path(item)
            if file_path.exists():
                # These refreshes do not set ``changed``; on their own they are
                # not persisted by this call.
                item["size_bytes"] = file_path.stat().st_size
                item["extension"] = self._extract_extension(item["original_name"])
                item["mime_type"] = item.get("mime_type") or (
                    mimetypes.guess_type(item["original_name"])[0] or "application/octet-stream"
                )
                normalized_status = normalize_ingest_status_code(item.get("ingest_status"))
                if item.get("ingest_status") != normalized_status:
                    item["ingest_status"] = normalized_status
                    changed = True
                # Backfill ingest bookkeeping keys that the entry does not have yet.
                if "ingest_agent_run_id" not in item:
                    item["ingest_agent_run_id"] = ""
                    changed = True
                if "ingest_status_updated_at" not in item:
                    item["ingest_status_updated_at"] = (
                        item.get("updated_at") or item.get("created_at") or ""
                    )
                    changed = True
                if "ingest_completed_at" not in item:
                    item["ingest_completed_at"] = ""
                    changed = True
                if "ingest_document_name" not in item:
                    item["ingest_document_name"] = ""
                    changed = True
                if "ingest_document_updated_at" not in item:
                    item["ingest_document_updated_at"] = ""
                    changed = True
                if "ingest_document_sha256" not in item:
                    item["ingest_document_sha256"] = ""
                    changed = True
                existing_items.append(item)
            else:
                # The file is gone: the entry is dropped by not carrying it over.
                changed = True

        # Pick up files that exist in a fixed folder but are not indexed.
        for folder_name in FIXED_KNOWLEDGE_FOLDERS:
            folder_path = self.library_root / folder_name
            for file_path in folder_path.iterdir():
                # Sub-directories and dot-files are never treated as documents.
                if not file_path.is_file() or file_path.name.startswith("."):
                    continue

                key = (folder_name, file_path.name)
                if key in known_by_stored:
                    continue

                document_id, original_name = self._parse_stored_name(file_path.name)
                stat = file_path.stat()
                # No checksum is computed for discovered files; ``sha256`` stays empty.
                existing_items.append(
                    {
                        "id": document_id,
                        "folder": folder_name,
                        "original_name": original_name,
                        "stored_name": file_path.name,
                        "mime_type": mimetypes.guess_type(original_name)[0]
                        or "application/octet-stream",
                        "extension": self._extract_extension(original_name),
                        "size_bytes": stat.st_size,
                        "sha256": "",
                        "created_at": datetime.fromtimestamp(stat.st_ctime, tz=UTC).isoformat(),
                        "updated_at": datetime.fromtimestamp(stat.st_mtime, tz=UTC).isoformat(),
                        "uploaded_by": "系统导入",
                        "version_number": 1,
                        "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
                        "ingest_status_updated_at": datetime.now(UTC).isoformat(),
                        "ingest_completed_at": "",
                        "ingest_document_name": "",
                        "ingest_document_updated_at": "",
                        "ingest_document_sha256": "",
                        "ingest_agent_run_id": "",
                    }
                )
                changed = True

        # The document list is only swapped in when something actually changed.
        if changed or len(existing_items) != len(documents):
            index["documents"] = existing_items
            return True
        return False

    def _reconcile_document_ingest_statuses(
        self,
        index: dict[str, Any],
        *,
        document_ids: list[str] | None = None,
        preserve_syncing: bool = True,
    ) -> bool:
        """Align stored ingest statuses with the RAG service and linked agent runs.

        For every targeted entry the desired status is derived from the RAG
        status payload (``status`` and ``query_ready``) and from the status of
        the agent run linked to the entry, then written back if it differs.

        Args:
            index: The loaded index; entries are mutated in place.
            document_ids: Ids to reconcile; ``None`` or empty targets every
                entry in the index.
            preserve_syncing: When true, an entry that is currently syncing is
                left untouched as long as ``should_preserve_syncing_status``
                says so.

        Returns:
            True when at least one entry was modified.
        """
        changed = False
        target_ids = {str(item).strip() for item in document_ids or [] if str(item).strip()}
        # One status lookup for all relevant ids: the targets, or every id in the index.
        status_map = KnowledgeRagService(
            db=self.db, storage_root=self.storage_root
        ).get_document_status_map(
            list(target_ids)
            if target_ids
            else [
                str(item.get("id") or "").strip()
                for item in index.get("documents", [])
                if str(item.get("id") or "").strip()
            ]
        )

        for entry in index.get("documents", []):
            document_id = str(entry.get("id") or "").strip()
            if target_ids and document_id not in target_ids:
                continue

            # Store the normalized form of the status before comparing against it.
            current_status = normalize_ingest_status_code(entry.get("ingest_status"))
            if entry.get("ingest_status") != current_status:
                entry["ingest_status"] = current_status
                changed = True

            if (
                current_status == KNOWLEDGE_INGEST_STATUS_SYNCING
                and preserve_syncing
                and should_preserve_syncing_status(entry, db=self.db)
            ):
                continue

            status_payload = status_map.get(document_id) or {}
            rag_status = str(status_payload.get("status") or "").strip().lower()
            linked_run_status = resolve_linked_ingest_run_status(entry, db=self.db)
            if not status_payload:
                # Without RAG data the only transition made is syncing -> failed,
                # and only when the linked run itself failed.
                if (
                    current_status == KNOWLEDGE_INGEST_STATUS_SYNCING
                    and linked_run_status == AgentRunStatus.FAILED.value
                ):
                    desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
                else:
                    continue
            elif linked_run_status == AgentRunStatus.FAILED.value and rag_status in {
                "pending",
                "processing",
                "preprocessed",
            }:
                # The run failed while the RAG side still reports work in progress.
                desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
            elif bool(status_payload.get("query_ready")):
                desired_status = KNOWLEDGE_INGEST_STATUS_INGESTED
            elif rag_status in {"pending", "processing", "preprocessed"}:
                desired_status = KNOWLEDGE_INGEST_STATUS_SYNCING
            elif rag_status == "failed":
                desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
            else:
                desired_status = KNOWLEDGE_INGEST_STATUS_PUBLISHED

            # A recorded failure is never silently turned back into "published".
            if (
                current_status == KNOWLEDGE_INGEST_STATUS_FAILED
                and desired_status == KNOWLEDGE_INGEST_STATUS_PUBLISHED
            ):
                continue
            if current_status != desired_status:
                entry["ingest_status"] = desired_status
                # Prefer the timestamp reported in the payload; fall back to now.
                entry["ingest_status_updated_at"] = (
                    str(status_payload.get("updated_at") or "").strip()
                    or datetime.now(UTC).isoformat()
                )
                if desired_status == KNOWLEDGE_INGEST_STATUS_INGESTED:
                    self._mark_entry_ingested(
                        entry,
                        completed_at=entry.get("ingest_status_updated_at")
                        or datetime.now(UTC).isoformat(),
                    )
                changed = True
            elif desired_status == KNOWLEDGE_INGEST_STATUS_INGESTED:
                # Already ingested: still make sure the ingest snapshot fields are filled.
                changed = self._mark_entry_ingested(entry) or changed

        return changed

    def _apply_ingest_status_to_entry(
        self,
        entry: dict[str, Any],
        *,
        status_code: int,
        updated_at: str,
        agent_run_id: str | None,
    ) -> bool:
        changed = False
        current_status = normalize_ingest_status_code(entry.get("ingest_status"))
        if current_status != status_code:
            entry["ingest_status"] = status_code
            changed = True

        if str(entry.get("ingest_status_updated_at") or "").strip() != updated_at:
            entry["ingest_status_updated_at"] = updated_at
            changed = True

        if agent_run_id is not None and entry.get("ingest_agent_run_id") != agent_run_id:
            entry["ingest_agent_run_id"] = agent_run_id
            changed = True

        if status_code == KNOWLEDGE_INGEST_STATUS_INGESTED:
            changed = self._mark_entry_ingested(entry, completed_at=updated_at) or changed

        return changed

    def _mark_entry_ingested(
        self,
        entry: dict[str, Any],
        *,
        completed_at: str | None = None,
    ) -> bool:
        completed_value = (
            str(completed_at or entry.get("ingest_completed_at") or "").strip()
            or datetime.now(UTC).isoformat()
        )
        expected_values = {
            "ingest_completed_at": completed_value,
            "ingest_document_name": str(entry.get("original_name") or "").strip(),
            "ingest_document_updated_at": str(entry.get("updated_at") or "").strip(),
            "ingest_document_sha256": str(entry.get("sha256") or "").strip(),
        }
        changed = False
        for key, value in expected_values.items():
            if str(entry.get(key) or "").strip() != value:
                entry[key] = value
                changed = True
        return changed

    def _should_index_document(self, entry: dict[str, Any]) -> bool:
        status_code = normalize_ingest_status_code(entry.get("ingest_status"))
        if status_code in {
            KNOWLEDGE_INGEST_STATUS_PUBLISHED,
            KNOWLEDGE_INGEST_STATUS_FAILED,
        }:
            return True
        if status_code == KNOWLEDGE_INGEST_STATUS_SYNCING:
            return is_syncing_status_stale(entry)

        return any(
            [
                not str(entry.get("ingest_completed_at") or "").strip(),
                str(entry.get("ingest_document_name") or "").strip()
                != str(entry.get("original_name") or "").strip(),
                str(entry.get("ingest_document_updated_at") or "").strip()
                != str(entry.get("updated_at") or "").strip(),
                str(entry.get("ingest_document_sha256") or "").strip()
                != str(entry.get("sha256") or "").strip(),
            ]
        )

    @staticmethod
    def _load_json_file(path: Path, *, default: Any) -> Any:
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except (FileNotFoundError, json.JSONDecodeError):
            return default

    @staticmethod
    def _load_text_file(path: Path) -> str:
        try:
            return path.read_text(encoding="utf-8").strip()
        except FileNotFoundError:
            return ""

    def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
        for entry in index["documents"]:
            if entry["id"] == document_id:
                return entry
        raise FileNotFoundError(document_id)

    def _resolve_document_path(self, entry: dict[str, Any]) -> Path:
        return self.library_root / entry["folder"] / entry["stored_name"]

    def _replace_document_content(
        self, document_id: str, content: bytes, actor_name: str
    ) -> KnowledgeDocumentDetailRead:
        index = self._load_index()
        entry = self._require_entry(index, document_id)
        current_user = CurrentUserContext(
            username="onlyoffice",
            name=actor_name or "ONLYOFFICE",
            role_codes=["manager"],
            is_admin=True,
        )
        return self.upload_document(
            folder=entry["folder"],
            filename=entry["original_name"],
            content=content,
            current_user=current_user,
        )

    @staticmethod
    def _parse_onlyoffice_callback(payload: dict[str, Any]) -> OnlyOfficeCallbackPayload:
        """Turn a raw callback body into an ``OnlyOfficeCallbackPayload``.

        Delegates to the module-level ``parse_onlyoffice_callback`` helper.
        """
        parsed = parse_onlyoffice_callback(payload)
        return parsed

    # Module-level helpers re-exposed as static methods under underscore-prefixed
    # names, so they can be called as ``self._name(...)`` or
    # ``KnowledgeService._name(...)``.

    # ONLYOFFICE helpers.
    _build_onlyoffice_document_key = staticmethod(build_onlyoffice_document_key)
    _build_onlyoffice_access_token = staticmethod(build_onlyoffice_access_token)
    _resolve_onlyoffice_document_type = staticmethod(resolve_onlyoffice_document_type)

    # File name, folder and display-format helpers.
    _normalize_filename = staticmethod(normalize_filename)
    _normalize_folder = staticmethod(normalize_folder)
    _extract_extension = staticmethod(extract_extension)
    _parse_stored_name = staticmethod(parse_stored_name)
    _format_time = staticmethod(format_time)
    _format_size = staticmethod(format_size)
    _resolve_file_type = staticmethod(resolve_file_type)
    _resolve_file_type_label = staticmethod(resolve_file_type_label)
    _can_preview = staticmethod(can_preview)
    # Text extraction helpers. On each line below the name on the right-hand
    # side is the imported module-level function: the class attribute of the
    # same name does not exist yet when the right-hand side is evaluated.
    _read_text_preview = staticmethod(_read_text_preview)
    _extract_docx_text = staticmethod(_extract_docx_text)
    _normalize_extracted_text = staticmethod(_normalize_extracted_text)
    _extract_pdf_text = staticmethod(_extract_pdf_text)
    _extract_text_with_ocr = staticmethod(_extract_text_with_ocr)
    _extract_xlsx_sheets = staticmethod(_extract_xlsx_sheets)
    _extract_pptx_slides = staticmethod(_extract_pptx_slides)

    def _extract_document_text_from_path(
        self,
        *,
        file_path: Path,
        original_name: str,
        mime_type: str,
    ) -> str:
        return _extract_document_text_from_path(
            file_path=file_path,
            original_name=original_name,
            mime_type=mime_type,
        )