后端新增预算费控服务和报销单审批流模块,引入申请人费用画像算法,优化知识库 RAG 运行时和同步逻辑,完善报销单工作流常量和明细同步,更新差旅报销规则电子表格,前端新增预算分析组件和数字员工模型,完善审批对话框和洞察面板交互,优化侧边栏和顶栏样式,补充单元测试。
801 lines
31 KiB
Python
801 lines
31 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import mimetypes
|
|
from datetime import UTC, datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from urllib.request import Request, urlopen
|
|
from uuid import uuid4
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.api.deps import CurrentUserContext
|
|
from app.core.agent_enums import AgentRunStatus
|
|
from app.core.config import get_settings
|
|
from app.core.logging import get_logger
|
|
from app.schemas.knowledge import (
|
|
KnowledgeDocumentDetailRead,
|
|
KnowledgeDocumentRead,
|
|
KnowledgeFolderRead,
|
|
KnowledgeLibraryRead,
|
|
KnowledgeOnlyOfficeConfigRead,
|
|
KnowledgePreviewPageRead,
|
|
)
|
|
from app.services.knowledge_rag import KnowledgeRagService
|
|
|
|
# Module-level logger for this service, created through the project's
# ``get_logger`` helper under the name "app.services.knowledge".
logger = get_logger("app.services.knowledge")
|
|
|
|
from app.services.knowledge_constants import (
|
|
FIXED_KNOWLEDGE_FOLDERS,
|
|
ICON_BY_TYPE,
|
|
KNOWLEDGE_INGEST_STATUS_FAILED,
|
|
KNOWLEDGE_INGEST_STATUS_INGESTED,
|
|
KNOWLEDGE_INGEST_STATUS_META,
|
|
KNOWLEDGE_INGEST_STATUS_PUBLISHED,
|
|
KNOWLEDGE_INGEST_STATUS_SYNCING,
|
|
KNOWLEDGE_SEARCH_RESULT_LIMIT,
|
|
)
|
|
from app.services.knowledge_document_extractors import (
|
|
_extract_docx_text,
|
|
_extract_document_text_from_path,
|
|
_extract_pdf_text,
|
|
_extract_pptx_slides,
|
|
_extract_text_with_ocr,
|
|
_extract_xlsx_sheets,
|
|
_normalize_extracted_text,
|
|
_read_text_preview,
|
|
)
|
|
from app.services.knowledge_file_utils import (
|
|
can_preview,
|
|
extract_extension,
|
|
format_size,
|
|
format_time,
|
|
normalize_filename,
|
|
normalize_folder,
|
|
parse_stored_name,
|
|
resolve_file_type,
|
|
resolve_file_type_label,
|
|
)
|
|
from app.services.knowledge_onlyoffice import (
|
|
OnlyOfficeCallbackPayload,
|
|
build_onlyoffice_config as build_onlyoffice_config_payload,
|
|
build_onlyoffice_access_token,
|
|
build_onlyoffice_document_key,
|
|
parse_onlyoffice_callback,
|
|
resolve_onlyoffice_document_type,
|
|
validate_onlyoffice_access_token,
|
|
)
|
|
from app.services.knowledge_ingest_status import (
|
|
is_syncing_status_stale,
|
|
normalize_ingest_status_code,
|
|
resolve_linked_ingest_run_status,
|
|
should_preserve_syncing_status,
|
|
)
|
|
from app.services.knowledge_preview import build_preview
|
|
|
|
|
|
def prepare_knowledge_library() -> None:
    """Prepare the knowledge library using a default-configured service.

    Builds a ``KnowledgeService`` without an explicit storage root or
    database session and asks it to make the library ready.
    """
    service = KnowledgeService()
    service.ensure_library_ready()
|
|
|
|
|
|
class KnowledgeService:
|
|
def __init__(self, storage_root: Path | None = None, db: Session | None = None) -> None:
    """Resolve the storage locations used by the service.

    Args:
        storage_root: Optional base directory. When falsy, the
            ``resolved_storage_root_dir`` value from the application
            settings is used instead.
        db: Optional database session, stored as ``self.db`` and handed
            to collaborators that accept one.
    """
    settings = get_settings()
    self.db = db
    # Short-circuit: the settings value is only read when no root is given.
    base_dir = storage_root or settings.resolved_storage_root_dir
    self.storage_root = Path(base_dir)
    # All knowledge files live under "<storage_root>/knowledge"; the JSON
    # index sits next to the folders as a dot-file.
    library_dir = self.storage_root / "knowledge"
    self.library_root = library_dir
    self.index_path = library_dir / ".index.json"
|
|
|
|
def ensure_library_ready(self) -> None:
    """Create the library layout on disk and bring the index up to date.

    Creates the library root and every folder in
    ``FIXED_KNOWLEDGE_FOLDERS``, writes an empty index when no index file
    exists yet, then reconciles the index and saves it again if the
    reconciliation reports a change.
    """
    required_dirs = [self.library_root]
    required_dirs.extend(self.library_root / name for name in FIXED_KNOWLEDGE_FOLDERS)
    for directory in required_dirs:
        directory.mkdir(parents=True, exist_ok=True)

    index_missing = not self.index_path.exists()
    if index_missing:
        self._save_index({"version": 1, "documents": []})

    current_index = self._load_index()
    needs_save = self._reconcile_index(current_index)
    if needs_save:
        self._save_index(current_index)
|
|
|
|
def list_library(self) -> KnowledgeLibraryRead:
    """Return every document plus a per-folder document count.

    Returns:
        A ``KnowledgeLibraryRead`` holding one ``KnowledgeFolderRead`` per
        entry of ``FIXED_KNOWLEDGE_FOLDERS`` and the loaded documents.
    """
    all_documents = self._load_documents()

    folder_cards = []
    for name in FIXED_KNOWLEDGE_FOLDERS:
        members = [doc for doc in all_documents if doc.folder == name]
        folder_cards.append(
            KnowledgeFolderRead(
                name=name,
                count=len(members),
                icon="mdi mdi-folder",
            )
        )

    return KnowledgeLibraryRead(folders=folder_cards, documents=all_documents)
|
|
|
|
def get_document_detail(self, document_id: str) -> KnowledgeDocumentDetailRead:
    """Return one document's serialized fields together with its preview.

    Args:
        document_id: Identifier of the index entry to describe.

    Returns:
        A ``KnowledgeDocumentDetailRead`` built from the serialized
        document plus ``previewKind`` / ``previewPages``.

    Raises:
        FileNotFoundError: Propagated from ``_require_entry`` when the id
            is not in the index.
    """
    self.ensure_library_ready()
    index = self._load_index()

    # Refresh only this document's ingest status before reading it.
    status_changed = self._reconcile_document_ingest_statuses(
        index, document_ids=[document_id]
    )
    if status_changed:
        self._save_index(index)

    entry = self._require_entry(index, document_id)
    preview_kind, preview_pages = self._build_preview(entry)
    base_fields = self._serialize_document(entry).model_dump()
    return KnowledgeDocumentDetailRead(
        **base_fields,
        previewKind=preview_kind,
        previewPages=preview_pages,
    )
|
|
|
|
def upload_document(
    self,
    folder: str,
    filename: str,
    content: bytes,
    current_user: CurrentUserContext,
) -> KnowledgeDocumentDetailRead:
    """Store uploaded bytes and create or update the matching index entry.

    A document already indexed in the same folder under the same name
    (compared case-insensitively) is replaced: its id is reused, its RAG
    data is deleted, its version number is incremented and its ingest
    bookkeeping is reset. Otherwise a new entry with a fresh id is added.

    Args:
        folder: Target folder name; normalized via ``_normalize_folder``.
        filename: Uploaded file name; normalized via ``_normalize_filename``.
        content: Raw file bytes; must not be empty.
        current_user: Uploading user; ``name`` is recorded on the entry.

    Returns:
        The detail view of the stored document.

    Raises:
        ValueError: If ``content`` is empty.
    """
    self.ensure_library_ready()
    normalized_folder = self._normalize_folder(folder)
    normalized_name = self._normalize_filename(filename)

    if not content:
        raise ValueError("上传文件不能为空。")

    rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
    index = self._load_index()

    # Look for an entry with the same folder and (case-insensitive) name.
    wanted_name = normalized_name.lower()
    existing_entry = None
    for candidate in index["documents"]:
        same_folder = candidate["folder"] == normalized_folder
        if same_folder and candidate["original_name"].lower() == wanted_name:
            existing_entry = candidate
            break
    is_new = existing_entry is None

    document_id = uuid4().hex if is_new else existing_entry["id"]
    stored_name = f"{document_id}__{normalized_name}"
    target_path = self.library_root / normalized_folder / stored_name

    if not is_new:
        # Drop the stale RAG data first, then remove the previous file when
        # the stored name changed (for example a different letter case).
        rag_service.delete_document(document_id)
        if existing_entry["stored_name"] != stored_name:
            previous_path = (
                self.library_root / existing_entry["folder"] / existing_entry["stored_name"]
            )
            if previous_path.exists():
                previous_path.unlink()

    target_path.write_bytes(content)

    now = datetime.now(UTC).isoformat()
    guessed_type, _ = mimetypes.guess_type(normalized_name)
    uploader = current_user.name

    # Facts derived from the uploaded bytes; shared by both branches.
    file_facts = {
        "stored_name": stored_name,
        "mime_type": guessed_type or "application/octet-stream",
        "extension": self._extract_extension(normalized_name),
        "size_bytes": len(content),
        "sha256": hashlib.sha256(content).hexdigest(),
    }
    # Any upload resets the ingest bookkeeping to its initial state.
    ingest_reset = {
        "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
        "ingest_status_updated_at": now,
        "ingest_completed_at": "",
        "ingest_document_name": "",
        "ingest_document_updated_at": "",
        "ingest_document_sha256": "",
        "ingest_agent_run_id": "",
    }

    if is_new:
        entry = {
            "id": document_id,
            "folder": normalized_folder,
            "original_name": normalized_name,
            **file_facts,
            "created_at": now,
            "updated_at": now,
            "uploaded_by": uploader,
            "version_number": 1,
            **ingest_reset,
        }
        index["documents"].append(entry)
        log_template = "Knowledge document uploaded id=%s folder=%s filename=%s by=%s"
    else:
        next_version = int(existing_entry.get("version_number", 1)) + 1
        existing_entry.update(
            {
                **file_facts,
                "updated_at": now,
                "uploaded_by": uploader,
                "version_number": next_version,
                **ingest_reset,
            }
        )
        entry = existing_entry
        log_template = "Knowledge document updated id=%s folder=%s filename=%s by=%s"

    logger.info(log_template, document_id, normalized_folder, normalized_name, uploader)

    self._save_index(index)
    return self.get_document_detail(document_id)
|
|
|
|
def delete_document(self, document_id: str) -> None:
    """Remove a document's file, its index entry and its RAG data.

    Args:
        document_id: Identifier of the document to delete.

    Raises:
        FileNotFoundError: Propagated from ``_require_entry`` when the id
            is not in the index.
    """
    self.ensure_library_ready()
    index = self._load_index()
    doomed = self._require_entry(index, document_id)

    stored_file = self._resolve_document_path(doomed)
    if stored_file.exists():
        stored_file.unlink()

    remaining = []
    for item in index["documents"]:
        if item["id"] != document_id:
            remaining.append(item)
    index["documents"] = remaining
    self._save_index(index)

    # The RAG side is cleaned only after the index no longer lists the entry.
    rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
    rag_service.delete_document(document_id)
    logger.info(
        "Knowledge document deleted id=%s filename=%s", document_id, doomed["original_name"]
    )
|
|
|
|
def get_document_content(self, document_id: str) -> tuple[Path, str, str]:
    """Locate a stored document for download.

    Args:
        document_id: Identifier of the document.

    Returns:
        ``(file_path, mime_type, original_name)`` for the stored file.

    Raises:
        FileNotFoundError: If the id is unknown, or the indexed file is
            missing on disk (raised with the original file name).
    """
    self.ensure_library_ready()
    entry = self._require_entry(self._load_index(), document_id)
    stored_file = self._resolve_document_path(entry)

    if stored_file.exists():
        return stored_file, entry["mime_type"], entry["original_name"]
    raise FileNotFoundError(entry["original_name"])
|
|
|
|
def list_folder_documents(self, folder: str | None = None) -> list[dict[str, Any]]:
    """Return raw index entries, optionally restricted to one folder.

    Ingest statuses are reconciled (and the index saved when that changes
    anything) before the entries are read.

    Args:
        folder: Folder to filter by. ``None`` returns every entry.

    Returns:
        A new list of index entry dicts.
    """
    self.ensure_library_ready()
    index = self._load_index()
    if self._reconcile_document_ingest_statuses(index):
        self._save_index(index)

    entries = list(index.get("documents") or [])
    if folder is not None:
        wanted_folder = self._normalize_folder(folder)
        entries = [entry for entry in entries if entry.get("folder") == wanted_folder]
    return entries
|
|
|
|
def list_documents_for_ingest(
    self,
    *,
    folder: str | None = None,
    document_ids: list[str] | None = None,
    changed_only: bool = False,
) -> list[dict[str, Any]]:
    """Select the index entries that an ingest run should consider.

    Args:
        folder: Optional folder filter, forwarded to
            ``list_folder_documents``.
        document_ids: Optional ids to keep. Values are stringified and
            stripped; blank values are ignored. With no usable id, no id
            filtering is applied.
        changed_only: When true, keep only entries for which
            ``_should_index_document`` returns true.

    Returns:
        The filtered list of index entry dicts.
    """
    candidates = self.list_folder_documents(folder=folder)

    wanted_ids = set()
    for raw_id in document_ids or []:
        cleaned = str(raw_id).strip()
        if cleaned:
            wanted_ids.add(cleaned)

    if wanted_ids:
        candidates = [
            entry for entry in candidates if str(entry.get("id") or "").strip() in wanted_ids
        ]
    if changed_only:
        candidates = [entry for entry in candidates if self._should_index_document(entry)]
    return candidates
|
|
|
|
def get_document_entry(self, document_id: str) -> dict[str, Any]:
    """Return a shallow copy of one index entry with a refreshed status.

    Args:
        document_id: Identifier of the entry to fetch.

    Returns:
        A new dict copied from the index entry, so callers cannot mutate
        the loaded index through it.

    Raises:
        FileNotFoundError: Propagated from ``_require_entry`` for an
            unknown id.
    """
    self.ensure_library_ready()
    index = self._load_index()
    status_changed = self._reconcile_document_ingest_statuses(
        index, document_ids=[document_id]
    )
    if status_changed:
        self._save_index(index)
    found = self._require_entry(index, document_id)
    return dict(found)
|
|
|
|
def set_document_ingest_statuses(
|
|
self,
|
|
document_ids: list[str],
|
|
status_code: int,
|
|
*,
|
|
agent_run_id: str | None = None,
|
|
) -> None:
|
|
self.ensure_library_ready()
|
|
normalized_ids = {str(item).strip() for item in document_ids if str(item).strip()}
|
|
if not normalized_ids:
|
|
return
|
|
|
|
index = self._load_index()
|
|
changed = False
|
|
updated_at = datetime.now(UTC).isoformat()
|
|
for entry in index.get("documents", []):
|
|
if str(entry.get("id") or "").strip() not in normalized_ids:
|
|
continue
|
|
changed = (
|
|
self._apply_ingest_status_to_entry(
|
|
entry,
|
|
status_code=status_code,
|
|
updated_at=updated_at,
|
|
agent_run_id=agent_run_id,
|
|
)
|
|
or changed
|
|
)
|
|
|
|
if changed:
|
|
self._save_index(index)
|
|
|
|
def refresh_document_ingest_statuses(
    self,
    document_ids: list[str] | None = None,
    *,
    preserve_syncing: bool = True,
) -> None:
    """Reconcile stored ingest statuses and save the index if they changed.

    Args:
        document_ids: Optional ids to restrict the reconciliation to.
        preserve_syncing: Forwarded unchanged to
            ``_reconcile_document_ingest_statuses``.
    """
    self.ensure_library_ready()
    index = self._load_index()
    dirty = self._reconcile_document_ingest_statuses(
        index,
        document_ids=document_ids,
        preserve_syncing=preserve_syncing,
    )
    if not dirty:
        return
    self._save_index(index)
|
|
|
|
def search_knowledge(
    self,
    query: str,
    *,
    conversation_history: list[dict[str, str]] | None = None,
    limit: int = KNOWLEDGE_SEARCH_RESULT_LIMIT,
) -> dict[str, Any]:
    """Run a knowledge query through the RAG service.

    Args:
        query: Query text, forwarded as-is.
        conversation_history: Optional prior turns, forwarded as-is.
        limit: Result limit, forwarded as-is; defaults to
            ``KNOWLEDGE_SEARCH_RESULT_LIMIT``.

    Returns:
        Whatever ``KnowledgeRagService.query_knowledge`` returns.
    """
    self.ensure_library_ready()
    rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
    result = rag_service.query_knowledge(
        query,
        conversation_history=conversation_history,
        limit=limit,
    )
    return result
|
|
|
|
def extract_document_text(self, document_id: str) -> str:
    """Extract the text of a stored document.

    Args:
        document_id: Identifier of the document.

    Returns:
        The text produced by ``_extract_document_text_from_path``.

    Raises:
        FileNotFoundError: If the id is unknown, or the indexed file is
            missing on disk (raised with the original file name).
    """
    self.ensure_library_ready()
    entry = self.get_document_entry(document_id)
    stored_file = self._resolve_document_path(entry)
    if not stored_file.exists():
        raise FileNotFoundError(entry["original_name"])

    # Fall back to the on-disk name / a generic MIME type when the entry
    # carries no usable value.
    display_name = str(entry.get("original_name") or stored_file.name)
    content_type = str(entry.get("mime_type") or "application/octet-stream")
    return self._extract_document_text_from_path(
        file_path=stored_file,
        original_name=display_name,
        mime_type=content_type,
    )
|
|
|
|
def build_onlyoffice_config(
    self,
    document_id: str,
    current_user: CurrentUserContext,
) -> KnowledgeOnlyOfficeConfigRead:
    """Build the ONLYOFFICE editor configuration for one document.

    Args:
        document_id: Identifier of the document to open.
        current_user: The user the configuration is built for.

    Returns:
        The configuration produced by the module-level
        ``build_onlyoffice_config_payload`` helper.

    Raises:
        FileNotFoundError: Propagated from ``_require_entry`` for an
            unknown id.
    """
    self.ensure_library_ready()
    entry = self._require_entry(self._load_index(), document_id)
    config = build_onlyoffice_config_payload(
        document_id=document_id,
        entry=entry,
        current_user=current_user,
    )
    return config
|
|
|
|
def validate_onlyoffice_access_token(self, document_id: str, access_token: str) -> None:
    """Check an ONLYOFFICE access token for a document.

    Thin wrapper over the module-level helper of the same name; whatever
    that helper raises propagates to the caller.

    Args:
        document_id: Identifier of the document the token was issued for.
        access_token: Token value to validate.
    """
    # Inside the method body this name resolves to the imported
    # module-level function, not to this method.
    validate_onlyoffice_access_token(document_id, access_token)
    return None
|
|
|
|
def handle_onlyoffice_callback(self, document_id: str, payload: dict[str, Any]) -> None:
    """Process an ONLYOFFICE callback and store the edited document.

    The callback is acted on only when its status is 2 or 6 and it carries
    a download URL; every other callback is ignored. The edited file is
    then downloaded and stored as a new version of the document.
    NOTE(review): 2 / 6 presumably map to the ONLYOFFICE "ready for
    saving" / "force-saved" states -- confirm against the callback API.

    Args:
        document_id: Identifier of the document being edited.
        payload: Raw callback body, parsed via ``_parse_onlyoffice_callback``.

    Raises:
        ValueError: If the download URL is not an ``http``/``https`` URL.
    """
    self.ensure_library_ready()
    callback = self._parse_onlyoffice_callback(payload)
    if callback.status not in {2, 6} or not callback.download_url:
        return

    # The URL comes from the request body. ``urlopen`` also understands
    # schemes such as ``file:``, so restrict it to web URLs before fetching;
    # otherwise a crafted callback could pull local files into the library.
    download_url = callback.download_url
    if not str(download_url).lower().startswith(("http://", "https://")):
        raise ValueError("Unsupported ONLYOFFICE download URL scheme.")

    logger.info(
        "ONLYOFFICE callback received id=%s status=%s users=%s",
        document_id,
        callback.status,
        ",".join(callback.users) if callback.users else "-",
    )

    request = Request(download_url, headers={"User-Agent": "x-financial-onlyoffice"})
    with urlopen(request, timeout=30) as response:  # noqa: S310 - scheme checked above
        content = response.read()

    # Attribute the new version to the first reported editor, if any.
    actor_name = callback.users[0] if callback.users else "ONLYOFFICE"
    self._replace_document_content(document_id, content, actor_name=actor_name)
|
|
|
|
def _load_documents(self) -> list[KnowledgeDocumentRead]:
    """Load, reconcile and serialize every indexed document.

    Both reconciliation passes always run; the index is saved once if
    either reports a change.

    Returns:
        Serialized documents sorted by their ``time`` field, descending.
    """
    self.ensure_library_ready()
    index = self._load_index()

    files_changed = self._reconcile_index(index)
    statuses_changed = self._reconcile_document_ingest_statuses(index)
    if statuses_changed or files_changed:
        self._save_index(index)

    serialized = []
    for entry in index["documents"]:
        serialized.append(self._serialize_document(entry))
    serialized.sort(key=lambda doc: doc.time, reverse=True)
    return serialized
|
|
|
|
def _serialize_document(
    self,
    entry: dict[str, Any],
) -> KnowledgeDocumentRead:
    """Convert a raw index entry into its API read model.

    Args:
        entry: Index entry dict; ``id``, ``original_name`` and ``folder``
            are required, the other keys fall back to defaults.

    Returns:
        The ``KnowledgeDocumentRead`` describing the entry.
    """
    folder = entry["folder"]
    extension = entry.get("extension") or self._extract_extension(entry["original_name"])
    extension_label = extension.upper() or "FILE"
    file_type = self._resolve_file_type(extension)
    size_bytes = int(entry.get("size_bytes") or 0)

    modified_label = self._format_time(entry.get("updated_at") or entry.get("created_at"))
    ingest_label = self._format_time(entry.get("ingest_completed_at"))

    state_code = normalize_ingest_status_code(entry.get("ingest_status"))
    # Unknown status codes are shown with the "published" label and tone.
    fallback_meta = KNOWLEDGE_INGEST_STATUS_META[KNOWLEDGE_INGEST_STATUS_PUBLISHED]
    state_label, state_tone = KNOWLEDGE_INGEST_STATUS_META.get(state_code, fallback_meta)
    # The ingest time is only exposed once the document is actually ingested.
    is_ingested = state_code == KNOWLEDGE_INGEST_STATUS_INGESTED

    # NOTE(review): the "????" owner fallback and the " ? " separators in the
    # summary look like a non-ASCII literal that was lost in an encoding
    # round-trip -- confirm the intended text before changing them.
    fields = {
        "id": entry["id"],
        "name": entry["original_name"],
        "folder": folder,
        "tag": f"{folder} / {extension_label}",
        "time": modified_label,
        "ingestTime": ingest_label if is_ingested else "",
        "version": f"v{int(entry.get('version_number', 1))}.0",
        "stateCode": state_code,
        "state": state_label,
        "stateTone": state_tone,
        "owner": entry.get("uploaded_by") or "????",
        "icon": ICON_BY_TYPE.get(file_type, ICON_BY_TYPE["binary"]),
        "fileType": file_type,
        "fileTypeLabel": self._resolve_file_type_label(file_type),
        "summary": f"{folder} ? {extension_label} ? {self._format_size(size_bytes)}",
        "mimeType": entry.get("mime_type") or "application/octet-stream",
        "extension": extension,
        "sizeBytes": size_bytes,
        "canPreview": self._can_preview(extension),
        "llmWikiAvailable": False,
        "llmWikiQualityStatus": "",
        "llmWikiQualityNote": "",
    }
    return KnowledgeDocumentRead(**fields)
|
|
|
|
def _build_preview(self, entry: dict[str, Any]) -> tuple[str, list[KnowledgePreviewPageRead]]:
    """Build the preview for an index entry.

    Delegates to the module-level ``build_preview`` helper, handing it this
    service's path resolver.

    Returns:
        The ``(preview_kind, preview_pages)`` pair from ``build_preview``.
    """
    path_resolver = self._resolve_document_path
    return build_preview(entry, resolve_document_path=path_resolver)
|
|
|
|
def _load_index(self) -> dict[str, Any]:
    """Read the JSON index from disk, falling back to an empty index.

    A missing file, undecodable bytes, invalid JSON, or a JSON document
    that is not an object all yield a fresh empty index rather than an
    error. A ``documents`` value that is missing or is not a list is
    replaced with an empty list, so callers can always iterate it.

    Returns:
        The index dict; ``documents`` is guaranteed to be a list.
    """
    try:
        payload = json.loads(self.index_path.read_text(encoding="utf-8"))
    except (FileNotFoundError, UnicodeDecodeError, json.JSONDecodeError):
        payload = None

    # Valid JSON is not necessarily an object (e.g. "[]" or "null").
    if not isinstance(payload, dict):
        payload = {"version": 1, "documents": []}
    # ``setdefault`` alone would keep an explicit null in place.
    if not isinstance(payload.get("documents"), list):
        payload["documents"] = []
    return payload
|
|
|
|
def _save_index(self, index: dict[str, Any]) -> None:
    """Persist the index as pretty-printed UTF-8 JSON.

    The JSON is written to a sibling temporary file and then moved over
    the real index. An interrupted write therefore cannot leave a
    truncated index behind, which ``_load_index`` would otherwise read
    back as an empty library.

    Args:
        index: The index dict to serialize.
    """
    serialized = json.dumps(index, ensure_ascii=False, indent=2)
    # Same directory as the target so the final move stays on one
    # filesystem; the dot-prefixed name sits beside the index file.
    staging_path = self.index_path.with_name(f"{self.index_path.name}.tmp")
    staging_path.write_text(serialized, encoding="utf-8")
    staging_path.replace(self.index_path)
|
|
|
|
def _reconcile_index(self, index: dict[str, Any]) -> bool:
    """Bring the index in line with the files actually present on disk.

    Entries whose file is gone are dropped. Surviving entries get their
    size, extension and MIME type refreshed and any missing ingest
    bookkeeping keys backfilled. Files found in the fixed folders that no
    entry points to are added as new entries.

    Args:
        index: Index dict; mutated in place.

    Returns:
        ``True`` when the document list was replaced and needs saving,
        ``False`` otherwise.
    """
    documents = index.setdefault("documents", [])
    # (folder, stored_name) pairs the index already knows, taken before any
    # entry is dropped, so known files are not re-imported below.
    indexed_keys = {
        (doc["folder"], doc["stored_name"])
        for doc in documents
        if doc.get("folder") and doc.get("stored_name")
    }

    dirty = False
    kept: list[dict[str, Any]] = []
    for doc in documents:
        stored_file = self._resolve_document_path(doc)
        if not stored_file.exists():
            # The file vanished: drop the entry.
            dirty = True
            continue

        # Derived facts are refreshed on every pass; on their own they do
        # not mark the index as changed.
        doc["size_bytes"] = stored_file.stat().st_size
        doc["extension"] = self._extract_extension(doc["original_name"])
        doc["mime_type"] = doc.get("mime_type") or (
            mimetypes.guess_type(doc["original_name"])[0] or "application/octet-stream"
        )

        normalized_status = normalize_ingest_status_code(doc.get("ingest_status"))
        if doc.get("ingest_status") != normalized_status:
            doc["ingest_status"] = normalized_status
            dirty = True

        # Backfill bookkeeping keys that older entries may lack.
        backfill = (
            ("ingest_agent_run_id", ""),
            (
                "ingest_status_updated_at",
                doc.get("updated_at") or doc.get("created_at") or "",
            ),
            ("ingest_completed_at", ""),
            ("ingest_document_name", ""),
            ("ingest_document_updated_at", ""),
            ("ingest_document_sha256", ""),
        )
        for key, fallback in backfill:
            if key not in doc:
                doc[key] = fallback
                dirty = True
        kept.append(doc)

    # Import files that are on disk but unknown to the index.
    for folder_name in FIXED_KNOWLEDGE_FOLDERS:
        for candidate in (self.library_root / folder_name).iterdir():
            if not candidate.is_file() or candidate.name.startswith("."):
                continue
            if (folder_name, candidate.name) in indexed_keys:
                continue

            document_id, original_name = self._parse_stored_name(candidate.name)
            file_stat = candidate.stat()
            guessed_type = mimetypes.guess_type(original_name)[0]
            kept.append(
                {
                    "id": document_id,
                    "folder": folder_name,
                    "original_name": original_name,
                    "stored_name": candidate.name,
                    "mime_type": guessed_type or "application/octet-stream",
                    "extension": self._extract_extension(original_name),
                    "size_bytes": file_stat.st_size,
                    "sha256": "",
                    "created_at": datetime.fromtimestamp(file_stat.st_ctime, tz=UTC).isoformat(),
                    "updated_at": datetime.fromtimestamp(file_stat.st_mtime, tz=UTC).isoformat(),
                    "uploaded_by": "系统导入",
                    "version_number": 1,
                    "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
                    "ingest_status_updated_at": datetime.now(UTC).isoformat(),
                    "ingest_completed_at": "",
                    "ingest_document_name": "",
                    "ingest_document_updated_at": "",
                    "ingest_document_sha256": "",
                    "ingest_agent_run_id": "",
                }
            )
            dirty = True

    if not dirty and len(kept) == len(documents):
        return False
    index["documents"] = kept
    return True
|
|
|
|
def _reconcile_document_ingest_statuses(
    self,
    index: dict[str, Any],
    *,
    document_ids: list[str] | None = None,
    preserve_syncing: bool = True,
) -> bool:
    """Align stored ingest statuses with what the RAG service reports.

    Args:
        index: Index dict; entries are mutated in place.
        document_ids: Optional ids to restrict the pass to. Values are
            stringified and stripped; blank values are ignored. With no
            usable id every indexed document is considered.
        preserve_syncing: When true, an entry currently in the syncing
            state is left alone if ``should_preserve_syncing_status``
            agrees.

    Returns:
        ``True`` if any entry was modified.
    """
    wanted_ids = {str(raw).strip() for raw in document_ids or [] if str(raw).strip()}
    if wanted_ids:
        lookup_ids = list(wanted_ids)
    else:
        lookup_ids = [
            str(doc.get("id") or "").strip()
            for doc in index.get("documents", [])
            if str(doc.get("id") or "").strip()
        ]
    rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
    status_map = rag_service.get_document_status_map(lookup_ids)

    # RAG-side states that mean "work is still under way".
    in_flight_states = {"pending", "processing", "preprocessed"}

    dirty = False
    for entry in index.get("documents", []):
        entry_id = str(entry.get("id") or "").strip()
        if wanted_ids and entry_id not in wanted_ids:
            continue

        current = normalize_ingest_status_code(entry.get("ingest_status"))
        if entry.get("ingest_status") != current:
            entry["ingest_status"] = current
            dirty = True

        if (
            current == KNOWLEDGE_INGEST_STATUS_SYNCING
            and preserve_syncing
            and should_preserve_syncing_status(entry, db=self.db)
        ):
            continue

        rag_info = status_map.get(entry_id) or {}
        rag_state = str(rag_info.get("status") or "").strip().lower()
        linked_run_status = resolve_linked_ingest_run_status(entry, db=self.db)
        run_failed = linked_run_status == AgentRunStatus.FAILED.value

        if not rag_info:
            # Nothing known on the RAG side: only a syncing entry whose
            # linked run failed is touched.
            if not (current == KNOWLEDGE_INGEST_STATUS_SYNCING and run_failed):
                continue
            target = KNOWLEDGE_INGEST_STATUS_FAILED
        elif run_failed and rag_state in in_flight_states:
            target = KNOWLEDGE_INGEST_STATUS_FAILED
        elif bool(rag_info.get("query_ready")):
            target = KNOWLEDGE_INGEST_STATUS_INGESTED
        elif rag_state in in_flight_states:
            target = KNOWLEDGE_INGEST_STATUS_SYNCING
        elif rag_state == "failed":
            target = KNOWLEDGE_INGEST_STATUS_FAILED
        else:
            target = KNOWLEDGE_INGEST_STATUS_PUBLISHED

        # A recorded failure is never silently downgraded to "published".
        if (
            current == KNOWLEDGE_INGEST_STATUS_FAILED
            and target == KNOWLEDGE_INGEST_STATUS_PUBLISHED
        ):
            continue

        if current != target:
            stamp = str(rag_info.get("updated_at") or "").strip() or datetime.now(UTC).isoformat()
            entry["ingest_status"] = target
            entry["ingest_status_updated_at"] = stamp
            if target == KNOWLEDGE_INGEST_STATUS_INGESTED:
                # ``stamp`` is never empty, so it is the completion time.
                self._mark_entry_ingested(entry, completed_at=stamp)
            dirty = True
        elif target == KNOWLEDGE_INGEST_STATUS_INGESTED:
            # Already ingested: just make sure the snapshot is current.
            if self._mark_entry_ingested(entry):
                dirty = True

    return dirty
|
|
|
|
def _apply_ingest_status_to_entry(
    self,
    entry: dict[str, Any],
    *,
    status_code: int,
    updated_at: str,
    agent_run_id: str | None,
) -> bool:
    """Write an ingest status onto one index entry.

    Args:
        entry: Index entry dict; mutated in place.
        status_code: Status code to store.
        updated_at: Timestamp to store as ``ingest_status_updated_at``.
        agent_run_id: Run id to store; ``None`` leaves the stored id alone.

    Returns:
        ``True`` if any field of the entry was changed.
    """
    touched = False

    if normalize_ingest_status_code(entry.get("ingest_status")) != status_code:
        entry["ingest_status"] = status_code
        touched = True

    stored_stamp = str(entry.get("ingest_status_updated_at") or "").strip()
    if stored_stamp != updated_at:
        entry["ingest_status_updated_at"] = updated_at
        touched = True

    run_id_differs = entry.get("ingest_agent_run_id") != agent_run_id
    if agent_run_id is not None and run_id_differs:
        entry["ingest_agent_run_id"] = agent_run_id
        touched = True

    if status_code == KNOWLEDGE_INGEST_STATUS_INGESTED:
        # Record which file version the completed ingest covers.
        if self._mark_entry_ingested(entry, completed_at=updated_at):
            touched = True

    return touched
|
|
|
|
def _mark_entry_ingested(
|
|
self,
|
|
entry: dict[str, Any],
|
|
*,
|
|
completed_at: str | None = None,
|
|
) -> bool:
|
|
completed_value = (
|
|
str(completed_at or entry.get("ingest_completed_at") or "").strip()
|
|
or datetime.now(UTC).isoformat()
|
|
)
|
|
expected_values = {
|
|
"ingest_completed_at": completed_value,
|
|
"ingest_document_name": str(entry.get("original_name") or "").strip(),
|
|
"ingest_document_updated_at": str(entry.get("updated_at") or "").strip(),
|
|
"ingest_document_sha256": str(entry.get("sha256") or "").strip(),
|
|
}
|
|
changed = False
|
|
for key, value in expected_values.items():
|
|
if str(entry.get(key) or "").strip() != value:
|
|
entry[key] = value
|
|
changed = True
|
|
return changed
|
|
|
|
def _should_index_document(self, entry: dict[str, Any]) -> bool:
    """Decide whether an entry needs to be (re-)ingested.

    Published and failed entries always qualify. A syncing entry qualifies
    only when ``is_syncing_status_stale`` says so. Any other entry
    qualifies when it has no completion time, or when the ingested
    snapshot (name, ``updated_at``, ``sha256``) no longer matches the
    entry's current values.

    Args:
        entry: Index entry dict.

    Returns:
        ``True`` if the document should be indexed.
    """
    status_code = normalize_ingest_status_code(entry.get("ingest_status"))
    if status_code in (KNOWLEDGE_INGEST_STATUS_PUBLISHED, KNOWLEDGE_INGEST_STATUS_FAILED):
        return True
    if status_code == KNOWLEDGE_INGEST_STATUS_SYNCING:
        return is_syncing_status_stale(entry)

    def _text(key: str) -> str:
        return str(entry.get(key) or "").strip()

    if not _text("ingest_completed_at"):
        return True

    # (snapshot key, live key) pairs that must agree for an up-to-date ingest.
    tracked_pairs = (
        ("ingest_document_name", "original_name"),
        ("ingest_document_updated_at", "updated_at"),
        ("ingest_document_sha256", "sha256"),
    )
    return any(_text(snapshot_key) != _text(live_key) for snapshot_key, live_key in tracked_pairs)
|
|
|
|
@staticmethod
|
|
def _load_json_file(path: Path, *, default: Any) -> Any:
|
|
try:
|
|
return json.loads(path.read_text(encoding="utf-8"))
|
|
except (FileNotFoundError, json.JSONDecodeError):
|
|
return default
|
|
|
|
@staticmethod
|
|
def _load_text_file(path: Path) -> str:
|
|
try:
|
|
return path.read_text(encoding="utf-8").strip()
|
|
except FileNotFoundError:
|
|
return ""
|
|
|
|
def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
    """Return the index entry with the given id.

    Args:
        index: Index dict whose ``documents`` list is searched in order.
        document_id: Id to look for.

    Returns:
        The first matching entry (the same object as in the index).

    Raises:
        FileNotFoundError: If no entry has that id; carries the id.
    """
    found = next(
        (candidate for candidate in index["documents"] if candidate["id"] == document_id),
        None,
    )
    if found is None:
        raise FileNotFoundError(document_id)
    return found
|
|
|
|
def _resolve_document_path(self, entry: dict[str, Any]) -> Path:
    """Return where an entry's file is stored.

    Args:
        entry: Index entry dict with ``folder`` and ``stored_name`` keys.

    Returns:
        ``<library_root>/<folder>/<stored_name>``.
    """
    folder_name = entry["folder"]
    file_name = entry["stored_name"]
    return self.library_root.joinpath(folder_name, file_name)
|
|
|
|
def _replace_document_content(
    self, document_id: str, content: bytes, actor_name: str
) -> KnowledgeDocumentDetailRead:
    """Store new bytes for an existing document via the upload path.

    The document's own folder and original name are reused, so
    ``upload_document`` treats the call as an update of that document.

    Args:
        document_id: Identifier of the document to replace.
        content: New file bytes.
        actor_name: Name recorded as the uploader; an empty value falls
            back to ``"ONLYOFFICE"``.

    Returns:
        The detail view returned by ``upload_document``.

    Raises:
        FileNotFoundError: Propagated from ``_require_entry`` for an
            unknown id.
    """
    entry = self._require_entry(self._load_index(), document_id)
    # Synthetic user standing in for the editor integration.
    editor_user = CurrentUserContext(
        username="onlyoffice",
        name=actor_name or "ONLYOFFICE",
        role_codes=["manager"],
        is_admin=True,
    )
    target_folder = entry["folder"]
    target_name = entry["original_name"]
    return self.upload_document(
        folder=target_folder,
        filename=target_name,
        content=content,
        current_user=editor_user,
    )
|
|
|
|
@staticmethod
def _parse_onlyoffice_callback(payload: dict[str, Any]) -> OnlyOfficeCallbackPayload:
    """Parse a raw ONLYOFFICE callback body.

    Thin wrapper over the module-level ``parse_onlyoffice_callback``.

    Args:
        payload: Raw callback dict.

    Returns:
        The parsed ``OnlyOfficeCallbackPayload``.
    """
    parsed = parse_onlyoffice_callback(payload)
    return parsed
|
|
|
|
# Private aliases: module-level helpers re-exposed under underscore names as
# static attributes. Each right-hand name resolves to the imported
# module-level function, since the alias itself is not bound yet.

# -- ONLYOFFICE helpers --
_build_onlyoffice_document_key = staticmethod(build_onlyoffice_document_key)
_build_onlyoffice_access_token = staticmethod(build_onlyoffice_access_token)
_resolve_onlyoffice_document_type = staticmethod(resolve_onlyoffice_document_type)

# -- File name, folder and formatting utilities --
_normalize_filename = staticmethod(normalize_filename)
_normalize_folder = staticmethod(normalize_folder)
_extract_extension = staticmethod(extract_extension)
_parse_stored_name = staticmethod(parse_stored_name)
_format_time = staticmethod(format_time)
_format_size = staticmethod(format_size)
_resolve_file_type = staticmethod(resolve_file_type)
_resolve_file_type_label = staticmethod(resolve_file_type_label)
_can_preview = staticmethod(can_preview)

# -- Text extraction helpers --
_read_text_preview = staticmethod(_read_text_preview)
_extract_docx_text = staticmethod(_extract_docx_text)
_normalize_extracted_text = staticmethod(_normalize_extracted_text)
_extract_pdf_text = staticmethod(_extract_pdf_text)
_extract_text_with_ocr = staticmethod(_extract_text_with_ocr)
_extract_xlsx_sheets = staticmethod(_extract_xlsx_sheets)
_extract_pptx_slides = staticmethod(_extract_pptx_slides)
|
|
|
|
def _extract_document_text_from_path(
    self,
    *,
    file_path: Path,
    original_name: str,
    mime_type: str,
) -> str:
    """Extract text from a file on disk.

    Thin wrapper over the module-level function of the same name; inside
    the method body that name resolves to the imported function, not to
    this method.

    Args:
        file_path: Location of the file to read.
        original_name: File name, forwarded as-is.
        mime_type: MIME type, forwarded as-is.

    Returns:
        The extracted text.
    """
    extracted = _extract_document_text_from_path(
        file_path=file_path,
        original_name=original_name,
        mime_type=mime_type,
    )
    return extracted
|