Files
X-Financial/server/src/app/services/knowledge.py
caoxiaozhu d4d5d40569 feat: 新增预算费控模型与报销审批流引擎
后端新增预算费控服务和报销单审批流模块,引入申请人费用画像
算法,优化知识库 RAG 运行时和同步逻辑,完善报销单工作流常
量和明细同步,更新差旅报销规则电子表格,前端新增预算分析
组件和数字员工模型,完善审批对话框和洞察面板交互,优化侧
边栏和顶栏样式,补充单元测试。
2026-05-27 17:31:27 +08:00

801 lines
31 KiB
Python

from __future__ import annotations
import hashlib
import json
import mimetypes
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from urllib.request import Request, urlopen
from uuid import uuid4
from sqlalchemy.orm import Session
from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentRunStatus
from app.core.config import get_settings
from app.core.logging import get_logger
from app.schemas.knowledge import (
KnowledgeDocumentDetailRead,
KnowledgeDocumentRead,
KnowledgeFolderRead,
KnowledgeLibraryRead,
KnowledgeOnlyOfficeConfigRead,
KnowledgePreviewPageRead,
)
from app.services.knowledge_rag import KnowledgeRagService
logger = get_logger("app.services.knowledge")
from app.services.knowledge_constants import (
FIXED_KNOWLEDGE_FOLDERS,
ICON_BY_TYPE,
KNOWLEDGE_INGEST_STATUS_FAILED,
KNOWLEDGE_INGEST_STATUS_INGESTED,
KNOWLEDGE_INGEST_STATUS_META,
KNOWLEDGE_INGEST_STATUS_PUBLISHED,
KNOWLEDGE_INGEST_STATUS_SYNCING,
KNOWLEDGE_SEARCH_RESULT_LIMIT,
)
from app.services.knowledge_document_extractors import (
_extract_docx_text,
_extract_document_text_from_path,
_extract_pdf_text,
_extract_pptx_slides,
_extract_text_with_ocr,
_extract_xlsx_sheets,
_normalize_extracted_text,
_read_text_preview,
)
from app.services.knowledge_file_utils import (
can_preview,
extract_extension,
format_size,
format_time,
normalize_filename,
normalize_folder,
parse_stored_name,
resolve_file_type,
resolve_file_type_label,
)
from app.services.knowledge_onlyoffice import (
OnlyOfficeCallbackPayload,
build_onlyoffice_config as build_onlyoffice_config_payload,
build_onlyoffice_access_token,
build_onlyoffice_document_key,
parse_onlyoffice_callback,
resolve_onlyoffice_document_type,
validate_onlyoffice_access_token,
)
from app.services.knowledge_ingest_status import (
is_syncing_status_stale,
normalize_ingest_status_code,
resolve_linked_ingest_run_status,
should_preserve_syncing_status,
)
from app.services.knowledge_preview import build_preview
def prepare_knowledge_library() -> None:
    """Bootstrap the knowledge library using the default service settings."""
    service = KnowledgeService()
    service.ensure_library_ready()
class KnowledgeService:
    """File-backed knowledge library with a JSON index under ``<storage_root>/knowledge``."""

    def __init__(self, storage_root: Path | None = None, db: Session | None = None) -> None:
        """Resolve the storage locations used by the service.

        Args:
            storage_root: Optional override for the storage root; when falsy the
                value of ``settings.resolved_storage_root_dir`` is used.
            db: Optional SQLAlchemy session forwarded to helpers that need one.
        """
        settings = get_settings()
        base_dir = storage_root or settings.resolved_storage_root_dir
        self.db = db
        self.storage_root = Path(base_dir)
        self.library_root = self.storage_root / "knowledge"
        self.index_path = self.library_root / ".index.json"
def ensure_library_ready(self) -> None:
    """Create the library folders and index file, then reconcile the index with disk."""
    required_dirs = [self.library_root]
    required_dirs.extend(self.library_root / name for name in FIXED_KNOWLEDGE_FOLDERS)
    for directory in required_dirs:
        directory.mkdir(parents=True, exist_ok=True)

    # Seed an empty index the first time the library is used.
    if not self.index_path.exists():
        self._save_index({"version": 1, "documents": []})

    current_index = self._load_index()
    index_changed = self._reconcile_index(current_index)
    if index_changed:
        self._save_index(current_index)
def list_library(self) -> KnowledgeLibraryRead:
    """Return every document plus a per-folder document count for the fixed folders."""
    documents = self._load_documents()

    # Tally documents per folder in a single pass.
    per_folder: dict[str, int] = {}
    for document in documents:
        per_folder[document.folder] = per_folder.get(document.folder, 0) + 1

    folders = []
    for folder_name in FIXED_KNOWLEDGE_FOLDERS:
        folders.append(
            KnowledgeFolderRead(
                name=folder_name,
                count=per_folder.get(folder_name, 0),
                icon="mdi mdi-folder",
            )
        )
    return KnowledgeLibraryRead(folders=folders, documents=documents)
def get_document_detail(self, document_id: str) -> KnowledgeDocumentDetailRead:
    """Return the serialized document together with its preview pages.

    Raises:
        FileNotFoundError: Propagated from ``_require_entry`` when the id is unknown.
    """
    self.ensure_library_ready()
    index = self._load_index()
    statuses_changed = self._reconcile_document_ingest_statuses(
        index, document_ids=[document_id]
    )
    if statuses_changed:
        self._save_index(index)

    entry = self._require_entry(index, document_id)
    preview_kind, preview_pages = self._build_preview(entry)
    base_fields = self._serialize_document(entry).model_dump()
    return KnowledgeDocumentDetailRead(
        **base_fields,
        previewKind=preview_kind,
        previewPages=preview_pages,
    )
def upload_document(
    self,
    folder: str,
    filename: str,
    content: bytes,
    current_user: CurrentUserContext,
) -> KnowledgeDocumentDetailRead:
    """Store an uploaded file and create or update its index entry.

    A document with the same folder and case-insensitively equal name is
    treated as a new version of the existing entry: its RAG data is deleted,
    the version number is bumped and all ingest bookkeeping is reset.

    Args:
        folder: Target folder name (normalized before use).
        filename: Original file name (normalized before use).
        content: Raw file bytes; must not be empty.
        current_user: Uploader; ``name`` is recorded on the entry.

    Returns:
        The detail view of the stored document.

    Raises:
        ValueError: If ``content`` is empty.
    """
    self.ensure_library_ready()
    normalized_folder = self._normalize_folder(folder)
    normalized_name = self._normalize_filename(filename)
    if not content:
        raise ValueError("上传文件不能为空。")
    rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
    index = self._load_index()

    # Look for an entry this upload should replace.
    existing_entry = None
    for candidate in index["documents"]:
        if (
            candidate["folder"] == normalized_folder
            and candidate["original_name"].lower() == normalized_name.lower()
        ):
            existing_entry = candidate
            break
    is_new = existing_entry is None

    document_id = uuid4().hex if is_new else existing_entry["id"]
    stored_name = f"{document_id}__{normalized_name}"
    target_path = self.library_root / normalized_folder / stored_name

    if not is_new:
        rag_service.delete_document(document_id)
        # The stored name can differ (e.g. only by letter case); drop the old file.
        if existing_entry["stored_name"] != stored_name:
            previous_path = (
                self.library_root / existing_entry["folder"] / existing_entry["stored_name"]
            )
            if previous_path.exists():
                previous_path.unlink()
    target_path.write_bytes(content)

    now = datetime.now(UTC).isoformat()
    mime_type = mimetypes.guess_type(normalized_name)[0] or "application/octet-stream"
    checksum = hashlib.sha256(content).hexdigest()
    extension = self._extract_extension(normalized_name)

    # Fields describing the stored file, shared by the create and update paths.
    file_fields = {
        "stored_name": stored_name,
        "mime_type": mime_type,
        "extension": extension,
        "size_bytes": len(content),
        "sha256": checksum,
    }
    # Ingest bookkeeping is reset on every upload.
    ingest_reset = {
        "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
        "ingest_status_updated_at": now,
        "ingest_completed_at": "",
        "ingest_document_name": "",
        "ingest_document_updated_at": "",
        "ingest_document_sha256": "",
        "ingest_agent_run_id": "",
    }

    if is_new:
        index["documents"].append(
            {
                "id": document_id,
                "folder": normalized_folder,
                "original_name": normalized_name,
                **file_fields,
                "created_at": now,
                "updated_at": now,
                "uploaded_by": current_user.name,
                "version_number": 1,
                **ingest_reset,
            }
        )
        log_template = "Knowledge document uploaded id=%s folder=%s filename=%s by=%s"
    else:
        existing_entry.update(
            {
                **file_fields,
                "updated_at": now,
                "uploaded_by": current_user.name,
                "version_number": int(existing_entry.get("version_number", 1)) + 1,
                **ingest_reset,
            }
        )
        log_template = "Knowledge document updated id=%s folder=%s filename=%s by=%s"

    logger.info(
        log_template,
        document_id,
        normalized_folder,
        normalized_name,
        current_user.name,
    )
    self._save_index(index)
    return self.get_document_detail(document_id)
def delete_document(self, document_id: str) -> None:
    """Remove a document's file, index entry and RAG data, in that order.

    Raises:
        FileNotFoundError: Propagated from ``_require_entry`` when the id is unknown.
    """
    self.ensure_library_ready()
    index = self._load_index()
    entry = self._require_entry(index, document_id)

    stored_file = self._resolve_document_path(entry)
    if stored_file.exists():
        stored_file.unlink()

    remaining = []
    for item in index["documents"]:
        if item["id"] != document_id:
            remaining.append(item)
    index["documents"] = remaining
    self._save_index(index)

    rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
    rag_service.delete_document(document_id)
    logger.info(
        "Knowledge document deleted id=%s filename=%s", document_id, entry["original_name"]
    )
def get_document_content(self, document_id: str) -> tuple[Path, str, str]:
    """Return ``(file path, MIME type, original name)`` for a stored document.

    Raises:
        FileNotFoundError: If the stored file is missing on disk (the message is
            the document's original name).
    """
    self.ensure_library_ready()
    entry = self._require_entry(self._load_index(), document_id)
    stored_file = self._resolve_document_path(entry)
    if stored_file.exists():
        return stored_file, entry["mime_type"], entry["original_name"]
    raise FileNotFoundError(entry["original_name"])
def list_folder_documents(self, folder: str | None = None) -> list[dict[str, Any]]:
    """Return raw index entries, optionally restricted to one folder.

    Ingest statuses are reconciled (and persisted when they changed) first.
    """
    self.ensure_library_ready()
    index = self._load_index()
    statuses_changed = self._reconcile_document_ingest_statuses(index)
    if statuses_changed:
        self._save_index(index)

    all_entries = list(index.get("documents") or [])
    if folder is None:
        return all_entries

    wanted_folder = self._normalize_folder(folder)
    matching = []
    for entry in all_entries:
        if entry.get("folder") == wanted_folder:
            matching.append(entry)
    return matching
def list_documents_for_ingest(
    self,
    *,
    folder: str | None = None,
    document_ids: list[str] | None = None,
    changed_only: bool = False,
) -> list[dict[str, Any]]:
    """Select index entries that are candidates for ingestion.

    Args:
        folder: Restrict to this folder when given.
        document_ids: Restrict to these ids; blank ids are ignored, and an
            empty/blank-only list means "no id filter".
        changed_only: Keep only entries ``_should_index_document`` accepts.
    """
    selected = self.list_folder_documents(folder=folder)

    wanted_ids = set()
    for raw_id in document_ids or []:
        cleaned = str(raw_id).strip()
        if cleaned:
            wanted_ids.add(cleaned)

    if wanted_ids:
        selected = [
            entry for entry in selected if str(entry.get("id") or "").strip() in wanted_ids
        ]
    if changed_only:
        selected = [entry for entry in selected if self._should_index_document(entry)]
    return selected
def get_document_entry(self, document_id: str) -> dict[str, Any]:
    """Return a shallow copy of one index entry after reconciling its ingest status."""
    self.ensure_library_ready()
    index = self._load_index()
    statuses_changed = self._reconcile_document_ingest_statuses(
        index, document_ids=[document_id]
    )
    if statuses_changed:
        self._save_index(index)
    entry = self._require_entry(index, document_id)
    # Copy so callers cannot mutate the loaded index entry.
    return dict(entry)
def set_document_ingest_statuses(
self,
document_ids: list[str],
status_code: int,
*,
agent_run_id: str | None = None,
) -> None:
self.ensure_library_ready()
normalized_ids = {str(item).strip() for item in document_ids if str(item).strip()}
if not normalized_ids:
return
index = self._load_index()
changed = False
updated_at = datetime.now(UTC).isoformat()
for entry in index.get("documents", []):
if str(entry.get("id") or "").strip() not in normalized_ids:
continue
changed = (
self._apply_ingest_status_to_entry(
entry,
status_code=status_code,
updated_at=updated_at,
agent_run_id=agent_run_id,
)
or changed
)
if changed:
self._save_index(index)
def refresh_document_ingest_statuses(
    self,
    document_ids: list[str] | None = None,
    *,
    preserve_syncing: bool = True,
) -> None:
    """Reconcile stored ingest statuses and save the index when any changed.

    Args:
        document_ids: Limit the refresh to these ids; ``None`` means all documents.
        preserve_syncing: Forwarded to ``_reconcile_document_ingest_statuses``.
    """
    self.ensure_library_ready()
    index = self._load_index()
    statuses_changed = self._reconcile_document_ingest_statuses(
        index,
        document_ids=document_ids,
        preserve_syncing=preserve_syncing,
    )
    if statuses_changed:
        self._save_index(index)
def search_knowledge(
    self,
    query: str,
    *,
    conversation_history: list[dict[str, str]] | None = None,
    limit: int = KNOWLEDGE_SEARCH_RESULT_LIMIT,
) -> dict[str, Any]:
    """Delegate a knowledge query to ``KnowledgeRagService.query_knowledge``.

    Args:
        query: Search text.
        conversation_history: Optional prior turns forwarded unchanged.
        limit: Result limit forwarded unchanged.
    """
    self.ensure_library_ready()
    rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
    return rag_service.query_knowledge(
        query,
        conversation_history=conversation_history,
        limit=limit,
    )
def extract_document_text(self, document_id: str) -> str:
    """Extract the text of a stored document.

    Raises:
        FileNotFoundError: If the stored file is missing on disk (the message is
            the document's original name).
    """
    self.ensure_library_ready()
    entry = self.get_document_entry(document_id)
    stored_file = self._resolve_document_path(entry)
    if not stored_file.exists():
        raise FileNotFoundError(entry["original_name"])

    # Fall back to the on-disk name / a generic MIME type when the entry lacks them.
    display_name = str(entry.get("original_name") or stored_file.name)
    mime_type = str(entry.get("mime_type") or "application/octet-stream")
    return self._extract_document_text_from_path(
        file_path=stored_file,
        original_name=display_name,
        mime_type=mime_type,
    )
def build_onlyoffice_config(
    self,
    document_id: str,
    current_user: CurrentUserContext,
) -> KnowledgeOnlyOfficeConfigRead:
    """Build the ONLYOFFICE editor configuration for one document.

    Raises:
        FileNotFoundError: Propagated from ``_require_entry`` when the id is unknown.
    """
    self.ensure_library_ready()
    entry = self._require_entry(self._load_index(), document_id)
    config = build_onlyoffice_config_payload(
        document_id=document_id,
        entry=entry,
        current_user=current_user,
    )
    return config
def validate_onlyoffice_access_token(self, document_id: str, access_token: str) -> None:
    """Validate an ONLYOFFICE access token for the given document."""
    # Inside a method body the bare name resolves to the module-level helper
    # imported from knowledge_onlyoffice, not to this method.
    validate_onlyoffice_access_token(document_id, access_token)
    return None
def handle_onlyoffice_callback(self, document_id: str, payload: dict[str, Any]) -> None:
    """Persist an edited document announced by an ONLYOFFICE callback.

    Only callbacks with status 2 or 6 that carry a download URL are acted on;
    everything else is ignored. The edited content is downloaded and stored as
    a new version of the document.

    Args:
        document_id: Id of the document the callback refers to.
        payload: Raw callback body as received from the document server.

    Raises:
        ValueError: If the download URL does not use ``http`` or ``https``.
    """
    from urllib.parse import urlsplit

    self.ensure_library_ready()
    callback = self._parse_onlyoffice_callback(payload)
    if callback.status not in {2, 6} or not callback.download_url:
        return

    # The URL comes from the request body. ``urlopen`` would also follow
    # ``file:`` / ``ftp:`` URLs, so restrict it to HTTP(S) before fetching.
    scheme = urlsplit(callback.download_url).scheme.lower()
    if scheme not in {"http", "https"}:
        raise ValueError(
            f"Unsupported ONLYOFFICE download URL scheme: {scheme or '<none>'}"
        )

    logger.info(
        "ONLYOFFICE callback received id=%s status=%s users=%s",
        document_id,
        callback.status,
        ",".join(callback.users) if callback.users else "-",
    )
    request = Request(callback.download_url, headers={"User-Agent": "x-financial-onlyoffice"})
    with urlopen(request, timeout=30) as response:  # noqa: S310 - scheme validated above
        content = response.read()
    # Attribute the new version to the first reported editor, if any.
    actor_name = callback.users[0] if callback.users else "ONLYOFFICE"
    self._replace_document_content(document_id, content, actor_name=actor_name)
def _load_documents(self) -> list[KnowledgeDocumentRead]:
    """Return all documents as read models, sorted by their ``time`` field, descending."""
    self.ensure_library_ready()
    index = self._load_index()
    # Both reconciliations must run; save once if either reported a change.
    index_changed = self._reconcile_index(index)
    statuses_changed = self._reconcile_document_ingest_statuses(index)
    if statuses_changed or index_changed:
        self._save_index(index)

    documents = [self._serialize_document(entry) for entry in index["documents"]]
    documents.sort(key=lambda document: document.time, reverse=True)
    return documents
def _serialize_document(
    self,
    entry: dict[str, Any],
) -> KnowledgeDocumentRead:
    """Convert a raw index entry into the ``KnowledgeDocumentRead`` API model.

    Missing optional fields fall back to derived or generic values (extension
    from the original name, size 0, ``application/octet-stream``). The ingest
    time is only exposed while the entry is in the "ingested" state.

    Args:
        entry: Index entry; must contain ``id``, ``original_name`` and ``folder``.
    """
    extension = entry.get("extension") or self._extract_extension(entry["original_name"])
    extension_label = extension.upper() or "FILE"
    file_type = self._resolve_file_type(extension)
    size_bytes = int(entry.get("size_bytes") or 0)
    updated_at = self._format_time(entry.get("updated_at") or entry.get("created_at"))
    ingest_time = self._format_time(entry.get("ingest_completed_at"))
    state_code = normalize_ingest_status_code(entry.get("ingest_status"))
    # Unknown status codes are presented like the "published" state.
    state_label, state_tone = KNOWLEDGE_INGEST_STATUS_META.get(
        state_code,
        KNOWLEDGE_INGEST_STATUS_META[KNOWLEDGE_INGEST_STATUS_PUBLISHED],
    )
    return KnowledgeDocumentRead(
        id=entry["id"],
        name=entry["original_name"],
        folder=entry["folder"],
        tag=f"{entry['folder']} / {extension_label}",
        time=updated_at,
        ingestTime=ingest_time if state_code == KNOWLEDGE_INGEST_STATUS_INGESTED else "",
        version=f"v{int(entry.get('version_number', 1))}.0",
        stateCode=state_code,
        state=state_label,
        stateTone=state_tone,
        # NOTE(review): the previous fallback was the mojibake literal "????";
        # the original text is unrecoverable, so the label used for imported
        # files elsewhere in this service is reused here - confirm.
        owner=entry.get("uploaded_by") or "系统导入",
        icon=ICON_BY_TYPE.get(file_type, ICON_BY_TYPE["binary"]),
        fileType=file_type,
        fileTypeLabel=self._resolve_file_type_label(file_type),
        # NOTE(review): the separators were mojibake "?" characters; a middle
        # dot is assumed to be the intended separator - confirm.
        summary=f"{entry['folder']} · {extension_label} · {self._format_size(size_bytes)}",
        mimeType=entry.get("mime_type") or "application/octet-stream",
        extension=extension,
        sizeBytes=size_bytes,
        canPreview=self._can_preview(extension),
        llmWikiAvailable=False,
        llmWikiQualityStatus="",
        llmWikiQualityNote="",
    )
def _build_preview(self, entry: dict[str, Any]) -> tuple[str, list[KnowledgePreviewPageRead]]:
    """Build the preview kind and pages for an entry via ``build_preview``."""
    path_resolver = self._resolve_document_path
    return build_preview(entry, resolve_document_path=path_resolver)
def _load_index(self) -> dict[str, Any]:
    """Load the JSON index, falling back to an empty index when unusable.

    A missing, undecodable or syntactically invalid file, or one whose top
    level is not a JSON object, yields ``{"version": 1, "documents": []}``.
    A ``documents`` value that is absent or not a list is replaced by ``[]``
    so callers can always iterate it.
    """
    try:
        payload = json.loads(self.index_path.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        payload = None
    # Valid JSON is not necessarily an object (e.g. a bare list or null).
    if not isinstance(payload, dict):
        payload = {"version": 1, "documents": []}
    if not isinstance(payload.get("documents"), list):
        payload["documents"] = []
    return payload
def _save_index(self, index: dict[str, Any]) -> None:
    """Write the index to disk as pretty-printed UTF-8 JSON.

    The data is written to a uniquely named temporary file next to the index
    and then renamed over it, so an interrupted write cannot leave a truncated
    index behind (which ``_load_index`` would treat as an empty library).
    """
    serialized = json.dumps(index, ensure_ascii=False, indent=2)
    temp_path = self.index_path.with_name(f"{self.index_path.name}.{uuid4().hex}.tmp")
    try:
        temp_path.write_text(serialized, encoding="utf-8")
        temp_path.replace(self.index_path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        temp_path.unlink(missing_ok=True)
def _reconcile_index(self, index: dict[str, Any]) -> bool:
    """Bring the index in line with the files actually present on disk.

    Three things happen, all mutating ``index`` in place:

    * entries whose stored file no longer exists are dropped;
    * surviving entries get their size, extension, MIME type and ingest
      status refreshed, and missing ingest bookkeeping keys filled in;
    * files found in the fixed folders that no entry references are added as
      new entries.

    Args:
        index: Loaded index; ``documents`` is created when absent.

    Returns:
        ``True`` when anything changed and the index should be saved.
    """
    changed = False
    documents = index.setdefault("documents", [])
    known_stored_files = {
        (item["folder"], item["stored_name"])
        for item in documents
        if item.get("folder") and item.get("stored_name")
    }

    surviving: list[dict[str, Any]] = []
    for item in documents:
        file_path = self._resolve_document_path(item)
        if not file_path.exists():
            # The stored file vanished: drop the entry.
            changed = True
            continue

        # Refresh values derived from the file/name. A differing value counts
        # as a change so the refreshed data is persisted, not only held in memory.
        refreshed = {
            "size_bytes": file_path.stat().st_size,
            "extension": self._extract_extension(item["original_name"]),
            "mime_type": item.get("mime_type")
            or (mimetypes.guess_type(item["original_name"])[0] or "application/octet-stream"),
            "ingest_status": normalize_ingest_status_code(item.get("ingest_status")),
        }
        for key, value in refreshed.items():
            if key not in item or item[key] != value:
                item[key] = value
                changed = True

        # Fill in bookkeeping keys that older entries may lack (order preserved).
        missing_defaults = (
            ("ingest_agent_run_id", ""),
            (
                "ingest_status_updated_at",
                item.get("updated_at") or item.get("created_at") or "",
            ),
            ("ingest_completed_at", ""),
            ("ingest_document_name", ""),
            ("ingest_document_updated_at", ""),
            ("ingest_document_sha256", ""),
        )
        for key, default in missing_defaults:
            if key not in item:
                item[key] = default
                changed = True
        surviving.append(item)

    # Adopt files that exist on disk but are not referenced by any entry.
    for folder_name in FIXED_KNOWLEDGE_FOLDERS:
        folder_path = self.library_root / folder_name
        for file_path in folder_path.iterdir():
            if not file_path.is_file() or file_path.name.startswith("."):
                continue
            if (folder_name, file_path.name) in known_stored_files:
                continue
            document_id, original_name = self._parse_stored_name(file_path.name)
            stat = file_path.stat()
            surviving.append(
                {
                    "id": document_id,
                    "folder": folder_name,
                    "original_name": original_name,
                    "stored_name": file_path.name,
                    "mime_type": mimetypes.guess_type(original_name)[0]
                    or "application/octet-stream",
                    "extension": self._extract_extension(original_name),
                    "size_bytes": stat.st_size,
                    "sha256": "",
                    "created_at": datetime.fromtimestamp(stat.st_ctime, tz=UTC).isoformat(),
                    "updated_at": datetime.fromtimestamp(stat.st_mtime, tz=UTC).isoformat(),
                    "uploaded_by": "系统导入",
                    "version_number": 1,
                    "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
                    "ingest_status_updated_at": datetime.now(UTC).isoformat(),
                    "ingest_completed_at": "",
                    "ingest_document_name": "",
                    "ingest_document_updated_at": "",
                    "ingest_document_sha256": "",
                    "ingest_agent_run_id": "",
                }
            )
            changed = True

    if changed or len(surviving) != len(documents):
        index["documents"] = surviving
        return True
    return False
def _reconcile_document_ingest_statuses(
    self,
    index: dict[str, Any],
    *,
    document_ids: list[str] | None = None,
    preserve_syncing: bool = True,
) -> bool:
    """Align stored ingest statuses with the RAG service and linked agent runs.

    For each targeted entry the desired status is derived, in priority order:
    failed (linked run failed while RAG is still in flight, or no RAG status
    while syncing), ingested (RAG reports ``query_ready``), syncing (RAG in
    flight), failed (RAG reports ``failed``), otherwise published. An entry
    already marked failed is never downgraded back to published.

    Args:
        index: Loaded index; entries are mutated in place.
        document_ids: Limit reconciliation to these ids; blank ids are ignored
            and an empty selection means all documents.
        preserve_syncing: When true, syncing entries that
            ``should_preserve_syncing_status`` accepts are left untouched.

    Returns:
        ``True`` when any entry was modified.
    """
    in_flight_states = {"pending", "processing", "preprocessed"}
    entries = index.get("documents", [])
    target_ids = {str(raw).strip() for raw in document_ids or [] if str(raw).strip()}

    if target_ids:
        lookup_ids = list(target_ids)
    else:
        lookup_ids = [
            str(item.get("id") or "").strip()
            for item in entries
            if str(item.get("id") or "").strip()
        ]
    rag_service = KnowledgeRagService(db=self.db, storage_root=self.storage_root)
    status_map = rag_service.get_document_status_map(lookup_ids)

    changed = False
    for entry in entries:
        document_id = str(entry.get("id") or "").strip()
        if target_ids and document_id not in target_ids:
            continue

        # Normalise whatever is stored before comparing against it.
        current_status = normalize_ingest_status_code(entry.get("ingest_status"))
        if entry.get("ingest_status") != current_status:
            entry["ingest_status"] = current_status
            changed = True

        is_syncing = current_status == KNOWLEDGE_INGEST_STATUS_SYNCING
        if is_syncing and preserve_syncing and should_preserve_syncing_status(entry, db=self.db):
            continue

        status_payload = status_map.get(document_id) or {}
        rag_status = str(status_payload.get("status") or "").strip().lower()
        linked_run_status = resolve_linked_ingest_run_status(entry, db=self.db)
        run_failed = linked_run_status == AgentRunStatus.FAILED.value

        if not status_payload:
            # Without RAG data only a syncing entry with a failed run is touched.
            if not (is_syncing and run_failed):
                continue
            desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
        elif run_failed and rag_status in in_flight_states:
            desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
        elif bool(status_payload.get("query_ready")):
            desired_status = KNOWLEDGE_INGEST_STATUS_INGESTED
        elif rag_status in in_flight_states:
            desired_status = KNOWLEDGE_INGEST_STATUS_SYNCING
        elif rag_status == "failed":
            desired_status = KNOWLEDGE_INGEST_STATUS_FAILED
        else:
            desired_status = KNOWLEDGE_INGEST_STATUS_PUBLISHED

        # Keep a recorded failure rather than silently reverting to published.
        if (
            current_status == KNOWLEDGE_INGEST_STATUS_FAILED
            and desired_status == KNOWLEDGE_INGEST_STATUS_PUBLISHED
        ):
            continue

        if current_status == desired_status:
            # Status already right; still make sure the ingest snapshot is current.
            if desired_status == KNOWLEDGE_INGEST_STATUS_INGESTED:
                changed = self._mark_entry_ingested(entry) or changed
            continue

        status_stamp = (
            str(status_payload.get("updated_at") or "").strip()
            or datetime.now(UTC).isoformat()
        )
        entry["ingest_status"] = desired_status
        entry["ingest_status_updated_at"] = status_stamp
        if desired_status == KNOWLEDGE_INGEST_STATUS_INGESTED:
            self._mark_entry_ingested(
                entry,
                completed_at=entry.get("ingest_status_updated_at")
                or datetime.now(UTC).isoformat(),
            )
        changed = True
    return changed
def _apply_ingest_status_to_entry(
    self,
    entry: dict[str, Any],
    *,
    status_code: int,
    updated_at: str,
    agent_run_id: str | None,
) -> bool:
    """Write an ingest status, timestamp and optional run id onto one entry.

    Args:
        entry: Index entry, mutated in place.
        status_code: Status to store.
        updated_at: Timestamp recorded as ``ingest_status_updated_at``.
        agent_run_id: Stored as ``ingest_agent_run_id`` unless ``None``.

    Returns:
        ``True`` when any field of the entry was modified.
    """
    modified = False

    if normalize_ingest_status_code(entry.get("ingest_status")) != status_code:
        entry["ingest_status"] = status_code
        modified = True

    stored_stamp = str(entry.get("ingest_status_updated_at") or "").strip()
    if stored_stamp != updated_at:
        entry["ingest_status_updated_at"] = updated_at
        modified = True

    if agent_run_id is not None:
        if entry.get("ingest_agent_run_id") != agent_run_id:
            entry["ingest_agent_run_id"] = agent_run_id
            modified = True

    if status_code == KNOWLEDGE_INGEST_STATUS_INGESTED:
        # Record what exactly was ingested alongside the status.
        snapshot_changed = self._mark_entry_ingested(entry, completed_at=updated_at)
        modified = snapshot_changed or modified
    return modified
def _mark_entry_ingested(
self,
entry: dict[str, Any],
*,
completed_at: str | None = None,
) -> bool:
completed_value = (
str(completed_at or entry.get("ingest_completed_at") or "").strip()
or datetime.now(UTC).isoformat()
)
expected_values = {
"ingest_completed_at": completed_value,
"ingest_document_name": str(entry.get("original_name") or "").strip(),
"ingest_document_updated_at": str(entry.get("updated_at") or "").strip(),
"ingest_document_sha256": str(entry.get("sha256") or "").strip(),
}
changed = False
for key, value in expected_values.items():
if str(entry.get(key) or "").strip() != value:
entry[key] = value
changed = True
return changed
def _should_index_document(self, entry: dict[str, Any]) -> bool:
    """Decide whether an entry needs to be (re-)ingested.

    Published and failed entries always qualify; syncing entries qualify only
    when ``is_syncing_status_stale`` says so; any other entry qualifies when its
    ingest snapshot is missing or no longer matches name, update time or checksum.
    """

    def stored_text(key: str) -> str:
        return str(entry.get(key) or "").strip()

    status_code = normalize_ingest_status_code(entry.get("ingest_status"))
    if status_code == KNOWLEDGE_INGEST_STATUS_PUBLISHED:
        return True
    if status_code == KNOWLEDGE_INGEST_STATUS_FAILED:
        return True
    if status_code == KNOWLEDGE_INGEST_STATUS_SYNCING:
        return is_syncing_status_stale(entry)

    never_completed = not stored_text("ingest_completed_at")
    name_differs = stored_text("ingest_document_name") != stored_text("original_name")
    time_differs = stored_text("ingest_document_updated_at") != stored_text("updated_at")
    checksum_differs = stored_text("ingest_document_sha256") != stored_text("sha256")
    return never_completed or name_differs or time_differs or checksum_differs
@staticmethod
def _load_json_file(path: Path, *, default: Any) -> Any:
try:
return json.loads(path.read_text(encoding="utf-8"))
except (FileNotFoundError, json.JSONDecodeError):
return default
@staticmethod
def _load_text_file(path: Path) -> str:
try:
return path.read_text(encoding="utf-8").strip()
except FileNotFoundError:
return ""
def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
    """Return the index entry with the given id.

    Raises:
        FileNotFoundError: If no entry has that id (the message is the id).
    """
    found = next(
        (candidate for candidate in index["documents"] if candidate["id"] == document_id),
        None,
    )
    if found is None:
        raise FileNotFoundError(document_id)
    return found
def _resolve_document_path(self, entry: dict[str, Any]) -> Path:
    """Return ``<library_root>/<folder>/<stored_name>`` for an index entry."""
    folder_name = entry["folder"]
    stored_name = entry["stored_name"]
    return self.library_root.joinpath(folder_name, stored_name)
def _replace_document_content(
    self, document_id: str, content: bytes, actor_name: str
) -> KnowledgeDocumentDetailRead:
    """Store new content for an existing document by re-uploading it under its own name.

    Args:
        document_id: Id of the document to replace.
        content: New file bytes.
        actor_name: Name recorded as uploader; ``"ONLYOFFICE"`` when empty.
    """
    entry = self._require_entry(self._load_index(), document_id)
    # Synthetic user context representing the ONLYOFFICE integration.
    integration_user = CurrentUserContext(
        username="onlyoffice",
        name=actor_name or "ONLYOFFICE",
        role_codes=["manager"],
        is_admin=True,
    )
    return self.upload_document(
        folder=entry["folder"],
        filename=entry["original_name"],
        content=content,
        current_user=integration_user,
    )
@staticmethod
def _parse_onlyoffice_callback(payload: dict[str, Any]) -> OnlyOfficeCallbackPayload:
    """Parse a raw ONLYOFFICE callback body via ``parse_onlyoffice_callback``."""
    parsed_callback = parse_onlyoffice_callback(payload)
    return parsed_callback
# Module-level helpers re-exposed as static methods so they can be called as
# ``self._name(...)`` / ``KnowledgeService._name(...)``.
# ONLYOFFICE helpers.
_build_onlyoffice_document_key = staticmethod(build_onlyoffice_document_key)
_build_onlyoffice_access_token = staticmethod(build_onlyoffice_access_token)
_resolve_onlyoffice_document_type = staticmethod(resolve_onlyoffice_document_type)
# File-name, folder and formatting helpers.
_normalize_filename = staticmethod(normalize_filename)
_normalize_folder = staticmethod(normalize_folder)
_extract_extension = staticmethod(extract_extension)
_parse_stored_name = staticmethod(parse_stored_name)
_format_time = staticmethod(format_time)
_format_size = staticmethod(format_size)
_resolve_file_type = staticmethod(resolve_file_type)
_resolve_file_type_label = staticmethod(resolve_file_type_label)
_can_preview = staticmethod(can_preview)
# Text-extraction helpers.
_read_text_preview = staticmethod(_read_text_preview)
_extract_docx_text = staticmethod(_extract_docx_text)
_normalize_extracted_text = staticmethod(_normalize_extracted_text)
_extract_pdf_text = staticmethod(_extract_pdf_text)
_extract_text_with_ocr = staticmethod(_extract_text_with_ocr)
_extract_xlsx_sheets = staticmethod(_extract_xlsx_sheets)
_extract_pptx_slides = staticmethod(_extract_pptx_slides)
def _extract_document_text_from_path(
    self,
    *,
    file_path: Path,
    original_name: str,
    mime_type: str,
) -> str:
    """Extract text from a file by delegating to the module-level extractor.

    Args:
        file_path: Location of the stored file.
        original_name: Original file name, forwarded unchanged.
        mime_type: MIME type, forwarded unchanged.
    """
    # Inside a method body the bare name resolves to the imported module-level
    # function of the same name, not to this method.
    extracted_text = _extract_document_text_from_path(
        file_path=file_path,
        original_name=original_name,
        mime_type=mime_type,
    )
    return extracted_text