from __future__ import annotations

import hashlib
import json
import mimetypes
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from urllib.parse import urlsplit
from urllib.request import Request, urlopen
from uuid import uuid4

from sqlalchemy.orm import Session

from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentRunStatus
from app.core.config import get_settings
from app.core.logging import get_logger
from app.schemas.knowledge import (
    KnowledgeDocumentDetailRead,
    KnowledgeDocumentRead,
    KnowledgeFolderRead,
    KnowledgeLibraryRead,
    KnowledgeOnlyOfficeConfigRead,
    KnowledgePreviewPageRead,
)
from app.services.knowledge_rag import KnowledgeRagService
from app.services.knowledge_constants import (
    FIXED_KNOWLEDGE_FOLDERS,
    ICON_BY_TYPE,
    KNOWLEDGE_INGEST_STATUS_FAILED,
    KNOWLEDGE_INGEST_STATUS_INGESTED,
    KNOWLEDGE_INGEST_STATUS_META,
    KNOWLEDGE_INGEST_STATUS_PUBLISHED,
    KNOWLEDGE_INGEST_STATUS_SYNCING,
    KNOWLEDGE_SEARCH_RESULT_LIMIT,
)
from app.services.knowledge_document_extractors import (
    _extract_docx_text,
    _extract_document_text_from_path,
    _extract_pdf_text,
    _extract_pptx_slides,
    _extract_text_with_ocr,
    _extract_xlsx_sheets,
    _normalize_extracted_text,
    _read_text_preview,
)
from app.services.knowledge_file_utils import (
    can_preview,
    extract_extension,
    format_size,
    format_time,
    normalize_filename,
    normalize_folder,
    parse_stored_name,
    resolve_file_type,
    resolve_file_type_label,
)
from app.services.knowledge_onlyoffice import (
    OnlyOfficeCallbackPayload,
    build_onlyoffice_config as build_onlyoffice_config_payload,
    build_onlyoffice_access_token,
    build_onlyoffice_document_key,
    parse_onlyoffice_callback,
    resolve_onlyoffice_document_type,
    validate_onlyoffice_access_token,
)
from app.services.knowledge_ingest_status import (
    is_syncing_status_stale,
    normalize_ingest_status_code,
    resolve_linked_ingest_run_status,
    should_preserve_syncing_status,
)
from app.services.knowledge_preview import build_preview

logger = get_logger("app.services.knowledge")

# URL schemes the ONLYOFFICE callback handler is willing to download from.
# ``urlopen`` would otherwise also serve ``file:``, ``ftp:`` and ``data:`` URLs.
_ALLOWED_DOWNLOAD_SCHEMES = frozenset({"http", "https"})

# RAG statuses that mean the document is still being processed.
_RAG_IN_PROGRESS_STATUSES = frozenset({"pending", "processing", "preprocessed"})


def prepare_knowledge_library() -> None:
    """Create the library folders and index using the default settings."""
    KnowledgeService().ensure_library_ready()


class KnowledgeService:
    """File-backed knowledge library with a JSON index and RAG status sync.

    Documents are stored as ``<storage_root>/knowledge/<folder>/<stored_name>``
    and described by entries in ``<storage_root>/knowledge/.index.json``.
    """

    def __init__(self, storage_root: Path | None = None, db: Session | None = None) -> None:
        """Resolve the storage paths; no file system access happens here.

        Args:
            storage_root: Root directory override. Falls back to
                ``settings.resolved_storage_root_dir`` when falsy.
            db: Optional SQLAlchemy session handed to the RAG and ingest helpers.
        """
        settings = get_settings()
        self.db = db
        self.storage_root = Path(storage_root or settings.resolved_storage_root_dir)
        self.library_root = self.storage_root / "knowledge"
        self.index_path = self.library_root / ".index.json"

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def ensure_library_ready(self) -> None:
        """Create the library folders and index, then reconcile index and disk."""
        self.library_root.mkdir(parents=True, exist_ok=True)
        for folder_name in FIXED_KNOWLEDGE_FOLDERS:
            (self.library_root / folder_name).mkdir(parents=True, exist_ok=True)
        if not self.index_path.exists():
            self._save_index({"version": 1, "documents": []})
        index = self._load_index()
        if self._reconcile_index(index):
            self._save_index(index)

    def list_library(self) -> KnowledgeLibraryRead:
        """Return every fixed folder with its document count plus all documents."""
        documents = self._load_documents()
        folders = [
            KnowledgeFolderRead(
                name=folder_name,
                count=sum(1 for item in documents if item.folder == folder_name),
                icon="mdi mdi-folder",
            )
            for folder_name in FIXED_KNOWLEDGE_FOLDERS
        ]
        return KnowledgeLibraryRead(folders=folders, documents=documents)

    def get_document_detail(self, document_id: str) -> KnowledgeDocumentDetailRead:
        """Return the serialized document together with its preview pages.

        Raises:
            FileNotFoundError: If no index entry has ``document_id``.
        """
        self.ensure_library_ready()
        index = self._load_index()
        if self._reconcile_document_ingest_statuses(index, document_ids=[document_id]):
            self._save_index(index)
        entry = self._require_entry(index, document_id)
        preview_kind, preview_pages = self._build_preview(entry)
        document = self._serialize_document(entry)
        return KnowledgeDocumentDetailRead(
            **document.model_dump(),
            previewKind=preview_kind,
            previewPages=preview_pages,
        )

    def upload_document(
        self,
        folder: str,
        filename: str,
        content: bytes,
        current_user: CurrentUserContext,
    ) -> KnowledgeDocumentDetailRead:
        """Store ``content`` as a new document or as a new version of an existing one.

        A document in the same folder whose original name matches
        case-insensitively is overwritten: its id is kept, its version number is
        incremented and its RAG data is deleted. Either way the ingest fields
        are reset to the "published" state.

        Raises:
            ValueError: If ``content`` is empty.
        """
        self.ensure_library_ready()
        normalized_folder = self._normalize_folder(folder)
        normalized_name = self._normalize_filename(filename)
        if not content:
            raise ValueError("上传文件不能为空。")

        rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
        index = self._load_index()
        lowered_name = normalized_name.lower()
        existing_entry = next(
            (
                item
                for item in index["documents"]
                if item["folder"] == normalized_folder
                and item["original_name"].lower() == lowered_name
            ),
            None,
        )
        document_id = existing_entry["id"] if existing_entry else uuid4().hex
        stored_name = f"{document_id}__{normalized_name}"
        target_path = self.library_root / normalized_folder / stored_name

        if existing_entry is not None:
            rag_service.delete_document(document_id)
            # The stored name can differ only by letter case (the id is reused);
            # drop the old file so it is not re-imported as an orphan later.
            if existing_entry["stored_name"] != stored_name:
                old_path = (
                    self.library_root / existing_entry["folder"] / existing_entry["stored_name"]
                )
                if old_path.exists():
                    old_path.unlink()
        target_path.write_bytes(content)

        now = datetime.now(UTC).isoformat()
        mime_type = mimetypes.guess_type(normalized_name)[0] or "application/octet-stream"
        checksum = hashlib.sha256(content).hexdigest()
        extension = self._extract_extension(normalized_name)

        if existing_entry is None:
            index["documents"].append(
                {
                    "id": document_id,
                    "folder": normalized_folder,
                    "original_name": normalized_name,
                    "stored_name": stored_name,
                    "mime_type": mime_type,
                    "extension": extension,
                    "size_bytes": len(content),
                    "sha256": checksum,
                    "created_at": now,
                    "updated_at": now,
                    "uploaded_by": current_user.name,
                    "version_number": 1,
                    **self._fresh_ingest_fields(now),
                }
            )
            logger.info(
                "Knowledge document uploaded id=%s folder=%s filename=%s by=%s",
                document_id,
                normalized_folder,
                normalized_name,
                current_user.name,
            )
        else:
            existing_entry.update(
                {
                    "stored_name": stored_name,
                    "mime_type": mime_type,
                    "extension": extension,
                    "size_bytes": len(content),
                    "sha256": checksum,
                    "updated_at": now,
                    "uploaded_by": current_user.name,
                    "version_number": int(existing_entry.get("version_number", 1)) + 1,
                    **self._fresh_ingest_fields(now),
                }
            )
            logger.info(
                "Knowledge document updated id=%s folder=%s filename=%s by=%s",
                document_id,
                normalized_folder,
                normalized_name,
                current_user.name,
            )

        self._save_index(index)
        return self.get_document_detail(document_id)

    def delete_document(self, document_id: str) -> None:
        """Remove the document's file, index entry and RAG data.

        Raises:
            FileNotFoundError: If no index entry has ``document_id``.
        """
        self.ensure_library_ready()
        index = self._load_index()
        entry = self._require_entry(index, document_id)
        file_path = self._resolve_document_path(entry)
        if file_path.exists():
            file_path.unlink()
        index["documents"] = [item for item in index["documents"] if item["id"] != document_id]
        self._save_index(index)
        KnowledgeRagService(db=self.db, storage_root=self.storage_root).delete_document(document_id)
        logger.info(
            "Knowledge document deleted id=%s filename=%s", document_id, entry["original_name"]
        )

    def get_document_content(self, document_id: str) -> tuple[Path, str, str]:
        """Return ``(file_path, mime_type, original_name)`` for a document.

        Raises:
            FileNotFoundError: If the entry or its file on disk is missing.
        """
        self.ensure_library_ready()
        index = self._load_index()
        entry = self._require_entry(index, document_id)
        file_path = self._resolve_document_path(entry)
        if not file_path.exists():
            raise FileNotFoundError(entry["original_name"])
        return file_path, entry["mime_type"], entry["original_name"]

    def list_folder_documents(self, folder: str | None = None) -> list[dict[str, Any]]:
        """Return raw index entries, optionally limited to one folder."""
        self.ensure_library_ready()
        index = self._load_index()
        if self._reconcile_document_ingest_statuses(index):
            self._save_index(index)
        documents = list(index.get("documents") or [])
        if folder is None:
            return documents
        normalized_folder = self._normalize_folder(folder)
        return [item for item in documents if item.get("folder") == normalized_folder]

    def list_documents_for_ingest(
        self,
        *,
        folder: str | None = None,
        document_ids: list[str] | None = None,
        changed_only: bool = False,
    ) -> list[dict[str, Any]]:
        """Return index entries selected for ingestion.

        Args:
            folder: Restrict to this folder when given.
            document_ids: Restrict to these ids; blank ids are ignored and an
                empty result of that filtering means "no id restriction".
            changed_only: Keep only entries that ``_should_index_document``
                reports as needing (re-)indexing.
        """
        documents = self.list_folder_documents(folder=folder)
        requested_ids = {str(item).strip() for item in document_ids or [] if str(item).strip()}
        if requested_ids:
            documents = [
                item for item in documents if str(item.get("id") or "").strip() in requested_ids
            ]
        if changed_only:
            documents = [item for item in documents if self._should_index_document(item)]
        return documents

    def get_document_entry(self, document_id: str) -> dict[str, Any]:
        """Return a shallow copy of the index entry for ``document_id``.

        Raises:
            FileNotFoundError: If no index entry has ``document_id``.
        """
        self.ensure_library_ready()
        index = self._load_index()
        if self._reconcile_document_ingest_statuses(index, document_ids=[document_id]):
            self._save_index(index)
        return dict(self._require_entry(index, document_id))

    def set_document_ingest_statuses(
        self,
        document_ids: list[str],
        status_code: int,
        *,
        agent_run_id: str | None = None,
    ) -> None:
        """Set the ingest status (and optionally the agent run id) on documents.

        Blank ids are ignored; the index is saved only when an entry changed.
        """
        self.ensure_library_ready()
        normalized_ids = {str(item).strip() for item in document_ids if str(item).strip()}
        if not normalized_ids:
            return
        index = self._load_index()
        changed = False
        updated_at = datetime.now(UTC).isoformat()
        for entry in index.get("documents", []):
            if str(entry.get("id") or "").strip() not in normalized_ids:
                continue
            changed = (
                self._apply_ingest_status_to_entry(
                    entry,
                    status_code=status_code,
                    updated_at=updated_at,
                    agent_run_id=agent_run_id,
                )
                or changed
            )
        if changed:
            self._save_index(index)

    def refresh_document_ingest_statuses(
        self,
        document_ids: list[str] | None = None,
        *,
        preserve_syncing: bool = True,
    ) -> None:
        """Re-derive ingest statuses from the RAG service and persist any change."""
        self.ensure_library_ready()
        index = self._load_index()
        if self._reconcile_document_ingest_statuses(
            index,
            document_ids=document_ids,
            preserve_syncing=preserve_syncing,
        ):
            self._save_index(index)

    def search_knowledge(
        self,
        query: str,
        *,
        conversation_history: list[dict[str, str]] | None = None,
        limit: int = KNOWLEDGE_SEARCH_RESULT_LIMIT,
    ) -> dict[str, Any]:
        """Delegate the query to ``KnowledgeRagService.query_knowledge``."""
        self.ensure_library_ready()
        return KnowledgeRagService(db=self.db, storage_root=self.storage_root).query_knowledge(
            query,
            conversation_history=conversation_history,
            limit=limit,
        )

    def extract_document_text(self, document_id: str) -> str:
        """Extract the text of a stored document.

        Raises:
            FileNotFoundError: If the entry or its file on disk is missing.
        """
        self.ensure_library_ready()
        entry = self.get_document_entry(document_id)
        file_path = self._resolve_document_path(entry)
        if not file_path.exists():
            raise FileNotFoundError(entry["original_name"])
        return self._extract_document_text_from_path(
            file_path=file_path,
            original_name=str(entry.get("original_name") or file_path.name),
            mime_type=str(entry.get("mime_type") or "application/octet-stream"),
        )

    def build_onlyoffice_config(
        self,
        document_id: str,
        current_user: CurrentUserContext,
    ) -> KnowledgeOnlyOfficeConfigRead:
        """Build the ONLYOFFICE editor configuration for a document.

        Raises:
            FileNotFoundError: If no index entry has ``document_id``.
        """
        self.ensure_library_ready()
        index = self._load_index()
        entry = self._require_entry(index, document_id)
        return build_onlyoffice_config_payload(
            document_id=document_id,
            entry=entry,
            current_user=current_user,
        )

    def validate_onlyoffice_access_token(self, document_id: str, access_token: str) -> None:
        """Delegate to the module-level ``validate_onlyoffice_access_token``."""
        validate_onlyoffice_access_token(document_id, access_token)

    def handle_onlyoffice_callback(self, document_id: str, payload: dict[str, Any]) -> None:
        """Download the edited file named by an ONLYOFFICE callback and store it.

        Callbacks whose status is not 2 or 6, or that carry no download URL,
        are ignored.

        Raises:
            ValueError: If the download URL does not use http or https.
        """
        self.ensure_library_ready()
        callback = self._parse_onlyoffice_callback(payload)
        if callback.status not in {2, 6} or not callback.download_url:
            return

        # The URL comes from the request payload. Refuse anything that is not
        # plain HTTP(S) so a crafted callback cannot make the server read
        # local files or other non-web resources through urlopen.
        scheme = urlsplit(callback.download_url).scheme.lower()
        if scheme not in _ALLOWED_DOWNLOAD_SCHEMES:
            raise ValueError(
                f"Unsupported ONLYOFFICE download URL scheme: {scheme or '<none>'}"
            )

        logger.info(
            "ONLYOFFICE callback received id=%s status=%s users=%s",
            document_id,
            callback.status,
            ",".join(callback.users) if callback.users else "-",
        )
        request = Request(callback.download_url, headers={"User-Agent": "x-financial-onlyoffice"})
        with urlopen(request, timeout=30) as response:  # noqa: S310 - scheme checked above
            content = response.read()
        actor_name = callback.users[0] if callback.users else "ONLYOFFICE"
        self._replace_document_content(document_id, content, actor_name=actor_name)

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------

    def _load_documents(self) -> list[KnowledgeDocumentRead]:
        """Return all documents serialized and sorted by ``time``, newest first."""
        self.ensure_library_ready()
        index = self._load_index()
        changed = self._reconcile_index(index)
        changed = self._reconcile_document_ingest_statuses(index) or changed
        if changed:
            self._save_index(index)
        documents = [self._serialize_document(entry) for entry in index["documents"]]
        return sorted(documents, key=lambda item: item.time, reverse=True)

    def _serialize_document(
        self,
        entry: dict[str, Any],
    ) -> KnowledgeDocumentRead:
        """Convert a raw index entry into the API read model."""
        extension = entry.get("extension") or self._extract_extension(entry["original_name"])
        file_type = self._resolve_file_type(extension)
        size_bytes = int(entry.get("size_bytes") or 0)
        updated_at = self._format_time(entry.get("updated_at") or entry.get("created_at"))
        ingest_time = self._format_time(entry.get("ingest_completed_at"))
        state_code = normalize_ingest_status_code(entry.get("ingest_status"))
        # Unknown status codes are displayed like "published".
        state_label, state_tone = KNOWLEDGE_INGEST_STATUS_META.get(
            state_code,
            KNOWLEDGE_INGEST_STATUS_META[KNOWLEDGE_INGEST_STATUS_PUBLISHED],
        )
        # NOTE(review): the "????" owner fallback and the " ? " separators in
        # ``summary`` look like mis-encoded text; they are kept byte-for-byte
        # because the intended characters cannot be recovered here. Confirm
        # against the original source encoding.
        return KnowledgeDocumentRead(
            id=entry["id"],
            name=entry["original_name"],
            folder=entry["folder"],
            tag=f"{entry['folder']} / {extension.upper() or 'FILE'}",
            time=updated_at,
            ingestTime=ingest_time if state_code == KNOWLEDGE_INGEST_STATUS_INGESTED else "",
            version=f"v{int(entry.get('version_number', 1))}.0",
            stateCode=state_code,
            state=state_label,
            stateTone=state_tone,
            owner=entry.get("uploaded_by") or "????",
            icon=ICON_BY_TYPE.get(file_type, ICON_BY_TYPE["binary"]),
            fileType=file_type,
            fileTypeLabel=self._resolve_file_type_label(file_type),
            summary=f"{entry['folder']} ? {extension.upper() or 'FILE'} ? {self._format_size(size_bytes)}",
            mimeType=entry.get("mime_type") or "application/octet-stream",
            extension=extension,
            sizeBytes=size_bytes,
            canPreview=self._can_preview(extension),
            llmWikiAvailable=False,
            llmWikiQualityStatus="",
            llmWikiQualityNote="",
        )

    def _build_preview(self, entry: dict[str, Any]) -> tuple[str, list[KnowledgePreviewPageRead]]:
        """Delegate preview generation to ``build_preview``."""
        return build_preview(entry, resolve_document_path=self._resolve_document_path)

    # ------------------------------------------------------------------
    # Index persistence
    # ------------------------------------------------------------------

    def _load_index(self) -> dict[str, Any]:
        """Read the index file, falling back to an empty index when unusable.

        A missing file is expected and handled silently. An undecodable file,
        invalid JSON or a non-object JSON root is logged, because it means the
        stored metadata could not be read. ``documents`` is always a list in
        the returned mapping.
        """
        try:
            payload = json.loads(self.index_path.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return {"version": 1, "documents": []}
        except (json.JSONDecodeError, UnicodeDecodeError):
            logger.warning(
                "Knowledge index %s is not valid JSON; using an empty index", self.index_path
            )
            return {"version": 1, "documents": []}
        if not isinstance(payload, dict):
            logger.warning(
                "Knowledge index %s has an unexpected root type; using an empty index",
                self.index_path,
            )
            return {"version": 1, "documents": []}
        # Callers iterate ``index["documents"]`` directly, so guard against
        # a missing key as well as ``null`` or other non-list values.
        if not isinstance(payload.get("documents"), list):
            payload["documents"] = []
        return payload

    def _save_index(self, index: dict[str, Any]) -> None:
        """Write the index to disk without ever exposing a half-written file.

        The JSON is written to a sibling temporary file that then replaces the
        real index, so an interrupted write cannot leave truncated JSON behind
        (which ``_load_index`` would treat as an empty library).
        """
        serialized = json.dumps(index, ensure_ascii=False, indent=2)
        temp_path = self.index_path.with_name(f"{self.index_path.name}.{uuid4().hex}.tmp")
        try:
            temp_path.write_text(serialized, encoding="utf-8")
            temp_path.replace(self.index_path)
        finally:
            # No-op after a successful replace; cleans up after a failure.
            temp_path.unlink(missing_ok=True)

    @staticmethod
    def _fresh_ingest_fields(status_updated_at: str) -> dict[str, Any]:
        """Return the ingest fields of a document that has not been ingested."""
        return {
            "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
            "ingest_status_updated_at": status_updated_at,
            "ingest_completed_at": "",
            "ingest_document_name": "",
            "ingest_document_updated_at": "",
            "ingest_document_sha256": "",
            "ingest_agent_run_id": "",
        }

    def _reconcile_index(self, index: dict[str, Any]) -> bool:
        """Align the index with the files on disk.

        Entries whose file is gone are dropped, surviving entries get their
        derived and missing fields filled in, and files that are on disk but
        not indexed are imported.

        Returns:
            True when ``index`` was modified and should be saved.
        """
        changed = False
        documents = index.setdefault("documents", [])
        known_stored_keys = {
            (item["folder"], item["stored_name"])
            for item in documents
            if item.get("folder") and item.get("stored_name")
        }

        existing_items: list[dict[str, Any]] = []
        for item in documents:
            file_path = self._resolve_document_path(item)
            if not file_path.exists():
                # Entry without a file: drop it.
                changed = True
                continue

            # Refresh derived fields, flagging real differences so they are
            # persisted rather than recomputed in memory on every call.
            refreshed = {
                "size_bytes": file_path.stat().st_size,
                "extension": self._extract_extension(item["original_name"]),
                "mime_type": item.get("mime_type")
                or (mimetypes.guess_type(item["original_name"])[0] or "application/octet-stream"),
            }
            for key, value in refreshed.items():
                if key not in item or item[key] != value:
                    item[key] = value
                    changed = True

            normalized_status = normalize_ingest_status_code(item.get("ingest_status"))
            if item.get("ingest_status") != normalized_status:
                item["ingest_status"] = normalized_status
                changed = True

            # Backfill ingest fields that are absent from the entry.
            backfill_defaults = (
                ("ingest_agent_run_id", ""),
                (
                    "ingest_status_updated_at",
                    item.get("updated_at") or item.get("created_at") or "",
                ),
                ("ingest_completed_at", ""),
                ("ingest_document_name", ""),
                ("ingest_document_updated_at", ""),
                ("ingest_document_sha256", ""),
            )
            for key, default in backfill_defaults:
                if key not in item:
                    item[key] = default
                    changed = True
            existing_items.append(item)

        # Import files that are in a library folder but not in the index.
        for folder_name in FIXED_KNOWLEDGE_FOLDERS:
            folder_path = self.library_root / folder_name
            if not folder_path.is_dir():
                continue
            for file_path in folder_path.iterdir():
                if not file_path.is_file() or file_path.name.startswith("."):
                    continue
                if (folder_name, file_path.name) in known_stored_keys:
                    continue
                document_id, original_name = self._parse_stored_name(file_path.name)
                stat = file_path.stat()
                existing_items.append(
                    {
                        "id": document_id,
                        "folder": folder_name,
                        "original_name": original_name,
                        "stored_name": file_path.name,
                        "mime_type": mimetypes.guess_type(original_name)[0]
                        or "application/octet-stream",
                        "extension": self._extract_extension(original_name),
                        "size_bytes": stat.st_size,
                        "sha256": "",
                        "created_at": datetime.fromtimestamp(stat.st_ctime, tz=UTC).isoformat(),
                        "updated_at": datetime.fromtimestamp(stat.st_mtime, tz=UTC).isoformat(),
                        "uploaded_by": "系统导入",
                        "version_number": 1,
                        **self._fresh_ingest_fields(datetime.now(UTC).isoformat()),
                    }
                )
                changed = True

        if changed or len(existing_items) != len(documents):
            index["documents"] = existing_items
            return True
        return False

    # ------------------------------------------------------------------
    # Ingest status handling
    # ------------------------------------------------------------------

    def _reconcile_document_ingest_statuses(
        self,
        index: dict[str, Any],
        *,
        document_ids: list[str] | None = None,
        preserve_syncing: bool = True,
    ) -> bool:
        """Update entry ingest statuses from the RAG status map and linked runs.

        Args:
            index: Index mapping; its entries are mutated in place.
            document_ids: Only these ids are considered when any non-blank id
                is given; otherwise every entry is considered.
            preserve_syncing: When true, a "syncing" entry is left untouched if
                ``should_preserve_syncing_status`` says so.

        Returns:
            True when any entry was modified.
        """
        changed = False
        target_ids = {str(item).strip() for item in document_ids or [] if str(item).strip()}
        status_map = KnowledgeRagService(
            db=self.db, storage_root=self.storage_root
        ).get_document_status_map(
            list(target_ids)
            if target_ids
            else [
                str(item.get("id") or "").strip()
                for item in index.get("documents", [])
                if str(item.get("id") or "").strip()
            ]
        )

        for entry in index.get("documents", []):
            document_id = str(entry.get("id") or "").strip()
            if target_ids and document_id not in target_ids:
                continue

            current_status = normalize_ingest_status_code(entry.get("ingest_status"))
            if entry.get("ingest_status") != current_status:
                entry["ingest_status"] = current_status
                changed = True

            if (
                current_status == KNOWLEDGE_INGEST_STATUS_SYNCING
                and preserve_syncing
                and should_preserve_syncing_status(entry, db=self.db)
            ):
                continue

            status_payload = status_map.get(document_id) or {}
            rag_status = str(status_payload.get("status") or "").strip().lower()
            linked_run_status = resolve_linked_ingest_run_status(entry, db=self.db)

            if not status_payload:
                # No RAG record: only a failed linked run moves a syncing
                # entry; anything else keeps its current status.
                if (
                    current_status == KNOWLEDGE_INGEST_STATUS_SYNCING
                    and linked_run_status == AgentRunStatus.FAILED.value
                ):
                    desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
                else:
                    continue
            elif (
                linked_run_status == AgentRunStatus.FAILED.value
                and rag_status in _RAG_IN_PROGRESS_STATUSES
            ):
                desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
            elif bool(status_payload.get("query_ready")):
                desired_status = KNOWLEDGE_INGEST_STATUS_INGESTED
            elif rag_status in _RAG_IN_PROGRESS_STATUSES:
                desired_status = KNOWLEDGE_INGEST_STATUS_SYNCING
            elif rag_status == "failed":
                desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
            else:
                desired_status = KNOWLEDGE_INGEST_STATUS_PUBLISHED

            # A failed entry is never downgraded back to "published" here.
            if (
                current_status == KNOWLEDGE_INGEST_STATUS_FAILED
                and desired_status == KNOWLEDGE_INGEST_STATUS_PUBLISHED
            ):
                continue

            if current_status != desired_status:
                entry["ingest_status"] = desired_status
                entry["ingest_status_updated_at"] = (
                    str(status_payload.get("updated_at") or "").strip()
                    or datetime.now(UTC).isoformat()
                )
                if desired_status == KNOWLEDGE_INGEST_STATUS_INGESTED:
                    self._mark_entry_ingested(
                        entry,
                        completed_at=entry.get("ingest_status_updated_at")
                        or datetime.now(UTC).isoformat(),
                    )
                changed = True
            elif desired_status == KNOWLEDGE_INGEST_STATUS_INGESTED:
                # Already ingested: make sure the ingest snapshot is complete.
                changed = self._mark_entry_ingested(entry) or changed
        return changed

    def _apply_ingest_status_to_entry(
        self,
        entry: dict[str, Any],
        *,
        status_code: int,
        updated_at: str,
        agent_run_id: str | None,
    ) -> bool:
        """Write status, timestamp and optional run id onto ``entry``.

        Returns:
            True when any field of ``entry`` changed.
        """
        changed = False
        current_status = normalize_ingest_status_code(entry.get("ingest_status"))
        if current_status != status_code:
            entry["ingest_status"] = status_code
            changed = True
        if str(entry.get("ingest_status_updated_at") or "").strip() != updated_at:
            entry["ingest_status_updated_at"] = updated_at
            changed = True
        # ``None`` means "leave the stored run id alone".
        if agent_run_id is not None and entry.get("ingest_agent_run_id") != agent_run_id:
            entry["ingest_agent_run_id"] = agent_run_id
            changed = True
        if status_code == KNOWLEDGE_INGEST_STATUS_INGESTED:
            changed = self._mark_entry_ingested(entry, completed_at=updated_at) or changed
        return changed

    def _mark_entry_ingested(
        self,
        entry: dict[str, Any],
        *,
        completed_at: str | None = None,
    ) -> bool:
        """Record the name, update time and hash the document had when ingested.

        The completion time is ``completed_at``, else the value already stored
        on the entry, else the current UTC time.

        Returns:
            True when any field of ``entry`` changed.
        """
        completed_value = (
            str(completed_at or entry.get("ingest_completed_at") or "").strip()
            or datetime.now(UTC).isoformat()
        )
        expected_values = {
            "ingest_completed_at": completed_value,
            "ingest_document_name": str(entry.get("original_name") or "").strip(),
            "ingest_document_updated_at": str(entry.get("updated_at") or "").strip(),
            "ingest_document_sha256": str(entry.get("sha256") or "").strip(),
        }
        changed = False
        for key, value in expected_values.items():
            if str(entry.get(key) or "").strip() != value:
                entry[key] = value
                changed = True
        return changed

    def _should_index_document(self, entry: dict[str, Any]) -> bool:
        """Return True when ``entry`` needs to be (re-)indexed.

        Published and failed entries always qualify, syncing entries qualify
        when ``is_syncing_status_stale`` says so, and any other entry qualifies
        when its ingest snapshot is missing or differs from its current name,
        update time or hash.
        """
        status_code = normalize_ingest_status_code(entry.get("ingest_status"))
        if status_code in {
            KNOWLEDGE_INGEST_STATUS_PUBLISHED,
            KNOWLEDGE_INGEST_STATUS_FAILED,
        }:
            return True
        if status_code == KNOWLEDGE_INGEST_STATUS_SYNCING:
            return is_syncing_status_stale(entry)
        return any(
            [
                not str(entry.get("ingest_completed_at") or "").strip(),
                str(entry.get("ingest_document_name") or "").strip()
                != str(entry.get("original_name") or "").strip(),
                str(entry.get("ingest_document_updated_at") or "").strip()
                != str(entry.get("updated_at") or "").strip(),
                str(entry.get("ingest_document_sha256") or "").strip()
                != str(entry.get("sha256") or "").strip(),
            ]
        )

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _load_json_file(path: Path, *, default: Any) -> Any:
        """Parse ``path`` as JSON; return ``default`` if missing or invalid."""
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except (FileNotFoundError, json.JSONDecodeError):
            return default

    @staticmethod
    def _load_text_file(path: Path) -> str:
        """Return the stripped text of ``path``, or ``""`` if it is missing."""
        try:
            return path.read_text(encoding="utf-8").strip()
        except FileNotFoundError:
            return ""

    def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
        """Return the entry with ``document_id``.

        Raises:
            FileNotFoundError: If no entry has that id.
        """
        for entry in index["documents"]:
            if entry["id"] == document_id:
                return entry
        raise FileNotFoundError(document_id)

    def _resolve_document_path(self, entry: dict[str, Any]) -> Path:
        """Return the on-disk path of the file described by ``entry``."""
        return self.library_root / entry["folder"] / entry["stored_name"]

    def _replace_document_content(
        self, document_id: str, content: bytes, actor_name: str
    ) -> KnowledgeDocumentDetailRead:
        """Re-upload ``content`` over an existing document on behalf of ONLYOFFICE.

        Raises:
            FileNotFoundError: If no index entry has ``document_id``.
        """
        index = self._load_index()
        entry = self._require_entry(index, document_id)
        # Synthetic user context; ``upload_document`` records its ``name``
        # as the uploader.
        current_user = CurrentUserContext(
            username="onlyoffice",
            name=actor_name or "ONLYOFFICE",
            role_codes=["manager"],
            is_admin=True,
        )
        return self.upload_document(
            folder=entry["folder"],
            filename=entry["original_name"],
            content=content,
            current_user=current_user,
        )

    @staticmethod
    def _parse_onlyoffice_callback(payload: dict[str, Any]) -> OnlyOfficeCallbackPayload:
        """Delegate to ``parse_onlyoffice_callback``."""
        return parse_onlyoffice_callback(payload)

    # Module-level helpers re-exposed as static methods under their
    # underscore-prefixed names.
    _build_onlyoffice_document_key = staticmethod(build_onlyoffice_document_key)
    _build_onlyoffice_access_token = staticmethod(build_onlyoffice_access_token)
    _resolve_onlyoffice_document_type = staticmethod(resolve_onlyoffice_document_type)
    _normalize_filename = staticmethod(normalize_filename)
    _normalize_folder = staticmethod(normalize_folder)
    _extract_extension = staticmethod(extract_extension)
    _parse_stored_name = staticmethod(parse_stored_name)
    _format_time = staticmethod(format_time)
    _format_size = staticmethod(format_size)
    _resolve_file_type = staticmethod(resolve_file_type)
    _resolve_file_type_label = staticmethod(resolve_file_type_label)
    _can_preview = staticmethod(can_preview)
    _read_text_preview = staticmethod(_read_text_preview)
    _extract_docx_text = staticmethod(_extract_docx_text)
    _normalize_extracted_text = staticmethod(_normalize_extracted_text)
    _extract_pdf_text = staticmethod(_extract_pdf_text)
    _extract_text_with_ocr = staticmethod(_extract_text_with_ocr)
    _extract_xlsx_sheets = staticmethod(_extract_xlsx_sheets)
    _extract_pptx_slides = staticmethod(_extract_pptx_slides)

    def _extract_document_text_from_path(
        self,
        *,
        file_path: Path,
        original_name: str,
        mime_type: str,
    ) -> str:
        """Delegate to the module-level ``_extract_document_text_from_path``."""
        return _extract_document_text_from_path(
            file_path=file_path,
            original_name=original_name,
            mime_type=mime_type,
        )