feat(server): 扩展知识库服务,添加knowledge API端点和schema定义,前端新增knowledge服务模块
This commit is contained in:
@@ -1,21 +1,33 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import UTC, datetime
|
||||
from typing import Annotated
|
||||
|
||||
from fastapi import APIRouter, Body, Depends, HTTPException, Query, status
|
||||
from fastapi.responses import FileResponse
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.api.deps import CurrentUserContext, get_current_user, require_admin_user
|
||||
from app.api.deps import CurrentUserContext, get_current_user, get_db, require_admin_user
|
||||
from app.core.agent_enums import AgentName, AgentPermissionLevel, AgentRunSource, AgentRunStatus
|
||||
from app.models.agent_asset import AgentAsset
|
||||
from app.schemas.common import ErrorResponse
|
||||
from app.schemas.knowledge import (
|
||||
KnowledgeActionResponse,
|
||||
KnowledgeDocumentDetailRead,
|
||||
KnowledgeLibraryRead,
|
||||
LlmWikiDocumentDetailRead,
|
||||
LlmWikiIndexRead,
|
||||
LlmWikiSummaryUpdateWrite,
|
||||
KnowledgeOnlyOfficeCallbackRead,
|
||||
KnowledgeOnlyOfficeCallbackWrite,
|
||||
KnowledgeOnlyOfficeConfigRead,
|
||||
LlmWikiSyncRead,
|
||||
LlmWikiSyncWrite,
|
||||
)
|
||||
from app.services.agent_runs import AgentRunService
|
||||
from app.services.knowledge import KnowledgeService
|
||||
from app.services.llm_wiki import LlmWikiService
|
||||
|
||||
router = APIRouter(prefix="/knowledge")
|
||||
|
||||
@@ -38,6 +50,176 @@ def get_knowledge_library(
|
||||
return KnowledgeService().list_library()
|
||||
|
||||
|
||||
@router.get(
    "/llm-wiki",
    response_model=LlmWikiIndexRead,
    summary="查询 LLM Wiki 索引",
    description="返回知识库解析目录中的文档索引和同步次数,仅供管理员查看知识候选与规则候选草稿。",
    responses={
        status.HTTP_401_UNAUTHORIZED: {
            "model": ErrorResponse,
            "description": "未提供知识库访问用户头。",
        },
        status.HTTP_403_FORBIDDEN: {
            "model": ErrorResponse,
            "description": "只有管理员可以查看 LLM Wiki 草稿内容。",
        },
    },
)
def get_llm_wiki_index(
    _: Annotated[CurrentUserContext, Depends(require_admin_user)],
    db: Annotated[Session, Depends(get_db)],
) -> LlmWikiIndexRead:
    """Return the LLM Wiki index.

    Access is gated by the ``require_admin_user`` dependency; the resolved
    user itself is not used by the handler.
    """
    wiki_service = LlmWikiService(db)
    return wiki_service.get_index()
|
||||
|
||||
|
||||
@router.get(
    "/llm-wiki/documents/{document_id}",
    response_model=LlmWikiDocumentDetailRead,
    summary="读取 LLM Wiki 文档解析结果",
    description="返回指定知识文档的解析文本、分块、知识候选与规则候选,仅供管理员查看。",
    responses={
        status.HTTP_401_UNAUTHORIZED: {
            "model": ErrorResponse,
            "description": "未提供知识库访问用户头。",
        },
        status.HTTP_403_FORBIDDEN: {
            "model": ErrorResponse,
            "description": "只有管理员可以查看 LLM Wiki 草稿内容。",
        },
        status.HTTP_404_NOT_FOUND: {
            "model": ErrorResponse,
            "description": "指定文档尚未生成 LLM Wiki。",
        },
    },
)
def get_llm_wiki_document_detail(
    document_id: str,
    _: Annotated[CurrentUserContext, Depends(require_admin_user)],
    db: Annotated[Session, Depends(get_db)],
) -> LlmWikiDocumentDetailRead:
    """Return the LLM Wiki detail of one knowledge document.

    Raises:
        HTTPException: 404 when the service raises ``FileNotFoundError``.
    """
    try:
        detail = LlmWikiService(db).get_document_detail(document_id)
    except FileNotFoundError as exc:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="指定文档尚未生成 LLM Wiki。",
        ) from exc
    return detail
|
||||
|
||||
|
||||
@router.patch(
    "/llm-wiki/documents/{document_id}",
    response_model=LlmWikiDocumentDetailRead,
    summary="更新 LLM Wiki 知识总结",
    description="管理员可修改指定知识文档的 LLM Wiki 知识总结预览,不直接改动原始文件。",
    responses={
        status.HTTP_401_UNAUTHORIZED: {
            "model": ErrorResponse,
            "description": "未提供知识库访问用户头。",
        },
        status.HTTP_403_FORBIDDEN: {
            "model": ErrorResponse,
            "description": "只有管理员可以修改 LLM Wiki 草稿内容。",
        },
        status.HTTP_404_NOT_FOUND: {
            "model": ErrorResponse,
            "description": "指定文档尚未生成 LLM Wiki。",
        },
    },
)
def update_llm_wiki_document_summary(
    document_id: str,
    payload: LlmWikiSummaryUpdateWrite,
    _: Annotated[CurrentUserContext, Depends(require_admin_user)],
    db: Annotated[Session, Depends(get_db)],
) -> LlmWikiDocumentDetailRead:
    """Update the LLM Wiki knowledge summary of one document.

    Raises:
        HTTPException: 404 when the service raises ``FileNotFoundError``;
            400 (with the exception text as detail) on ``ValueError``.
    """
    try:
        updated = LlmWikiService(db).update_document_summary(document_id, payload)
    except (FileNotFoundError, ValueError) as exc:
        # A missing wiki document takes precedence over validation errors.
        if isinstance(exc, FileNotFoundError):
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="指定文档尚未生成 LLM Wiki。",
            ) from exc
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc
    return updated
|
||||
|
||||
|
||||
@router.post(
    "/llm-wiki/sync",
    response_model=LlmWikiSyncRead,
    summary="触发 Hermes 形成 LLM Wiki 与规则草稿",
    description="按知识库文档变化情况增量触发系统 Hermes,形成知识候选和规则草稿。",
    responses={
        status.HTTP_401_UNAUTHORIZED: {
            "model": ErrorResponse,
            "description": "未提供知识库访问用户头。",
        },
        status.HTTP_403_FORBIDDEN: {
            "model": ErrorResponse,
            "description": "只有管理员可以触发 LLM Wiki 同步。",
        },
    },
)
def sync_llm_wiki(
    payload: LlmWikiSyncWrite,
    current_user: Annotated[CurrentUserContext, Depends(require_admin_user)],
    db: Annotated[Session, Depends(get_db)],
) -> LlmWikiSyncRead:
    """Run an LLM Wiki sync and record it as an agent run.

    An agent run is created in the RUNNING state, the sync is executed, and
    the outcome (tool call plus final run status) is recorded for both the
    success and the failure path. The tool-call record carries the measured
    wall-clock duration of the sync attempt.

    Args:
        payload: Folder, optional document ids and the ``force`` flag.
        current_user: Administrator resolved by ``require_admin_user``.
        db: Database session used by the run service and the wiki service.

    Returns:
        The sync result produced by ``LlmWikiService.sync_folder``.

    Raises:
        HTTPException: 400 for ``ValueError``, 404 for ``FileNotFoundError``,
            500 for any other exception; the detail is the exception text.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    from time import perf_counter

    run_service = AgentRunService(db)
    task_asset = db.scalar(
        select(AgentAsset).where(AgentAsset.code == "task.hermes.llm_wiki_rule_formation")
    )
    run = run_service.create_run(
        agent=AgentName.HERMES.value,
        source=AgentRunSource.SCHEDULE.value,
        user_id=current_user.username,
        task_id=task_asset.id if task_asset is not None else None,
        permission_level=AgentPermissionLevel.READ.value,
        status=AgentRunStatus.RUNNING.value,
        result_summary="Hermes 正在形成 LLM Wiki 与规则草稿。",
    )

    started = perf_counter()

    def elapsed_ms() -> int:
        """Milliseconds elapsed since the sync attempt started."""
        return int((perf_counter() - started) * 1000)

    try:
        result = LlmWikiService(db).sync_folder(
            folder=payload.folder,
            current_user=current_user,
            document_ids=payload.document_ids,
            force=payload.force,
        )
        run_service.record_tool_call(
            run_id=run.run_id,
            tool_type="llm",
            tool_name="system_hermes_llm_wiki_sync",
            request_json=payload.model_dump(),
            response_json=result.model_dump(),
            status="succeeded",
            duration_ms=elapsed_ms(),
        )
        run_service.update_run(
            run.run_id,
            status=AgentRunStatus.SUCCEEDED.value,
            result_summary=result.summary,
            finished_at=datetime.now(UTC),
        )
        return result
    except Exception as exc:
        # Boundary handler: the run must never be left in RUNNING, so every
        # failure is recorded before being translated to an HTTP error.
        message = str(exc)
        run_service.record_tool_call(
            run_id=run.run_id,
            tool_type="llm",
            tool_name="system_hermes_llm_wiki_sync",
            request_json=payload.model_dump(),
            response_json={"error": message},
            status="failed",
            duration_ms=elapsed_ms(),
            error_message=message,
        )
        run_service.update_run(
            run.run_id,
            status=AgentRunStatus.FAILED.value,
            error_message=message,
            finished_at=datetime.now(UTC),
        )
        if isinstance(exc, ValueError):
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=message) from exc
        if isinstance(exc, FileNotFoundError):
            raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=message) from exc
        # NOTE(review): the raw exception text is returned to the client on
        # 500 (unchanged from the original) — confirm this is acceptable.
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=message) from exc
|
||||
|
||||
|
||||
@router.get(
|
||||
"/documents/{document_id}",
|
||||
response_model=KnowledgeDocumentDetailRead,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
@@ -28,17 +29,18 @@ class KnowledgePreviewPageRead(BaseModel):
|
||||
blocks: list[KnowledgePreviewBlockRead] = Field(default_factory=list)
|
||||
|
||||
|
||||
class KnowledgeDocumentRead(BaseModel):
|
||||
id: str
|
||||
name: str
|
||||
folder: str
|
||||
tag: str
|
||||
time: str
|
||||
version: str
|
||||
state: str
|
||||
stateTone: str
|
||||
owner: str
|
||||
icon: str
|
||||
class KnowledgeDocumentRead(BaseModel):
|
||||
id: str
|
||||
name: str
|
||||
folder: str
|
||||
tag: str
|
||||
time: str
|
||||
version: str
|
||||
stateCode: int = 1
|
||||
state: str
|
||||
stateTone: str
|
||||
owner: str
|
||||
icon: str
|
||||
fileType: str
|
||||
fileTypeLabel: str
|
||||
summary: str
|
||||
@@ -75,6 +77,106 @@ class KnowledgeLibraryRead(BaseModel):
|
||||
documents: list[KnowledgeDocumentRead] = Field(default_factory=list)
|
||||
|
||||
|
||||
class KnowledgeActionResponse(BaseModel):
|
||||
ok: bool = True
|
||||
detail: str
|
||||
class KnowledgeActionResponse(BaseModel):
    """Generic acknowledgement payload: a success flag plus a detail message."""

    ok: bool = True
    detail: str
|
||||
|
||||
|
||||
class LlmWikiChunkRead(BaseModel):
    """Read model for one text chunk of an LLM Wiki document."""

    # Identity and text.
    chunk_id: str
    title: str
    content: str

    # Optional provenance and size information.
    source_page: int | None = None
    word_count: int = 0
    tags: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class LlmWikiKnowledgeCandidateRead(BaseModel):
    """Read model for a knowledge candidate attached to an LLM Wiki document."""

    # Candidate content and classification.
    candidate_id: str
    title: str
    content: str
    domain: str = "expense"
    scenario: str = "reimbursement_policy"
    tags: list[str] = Field(default_factory=list)

    # Where the candidate came from.
    source_document_id: str
    source_document_name: str
    source_chunk_ids: list[str] = Field(default_factory=list)
    evidence: list[str] = Field(default_factory=list)

    # Scoring and lifecycle metadata.
    confidence: float = 0.0
    status: str = "draft"
    created_by: str = "hermes"
    created_at: datetime | None = None
|
||||
|
||||
|
||||
class LlmWikiRuleCandidateRead(BaseModel):
    """Read model for a rule candidate attached to an LLM Wiki document."""

    # Identity, template and classification.
    candidate_id: str
    source_type: str = "policy_document"
    template_key: str
    template_label: str
    domain: str = "expense"
    scenario: str = "reimbursement_policy"

    # Draft rule content.
    suggested_rule_name: str
    summary: str = ""
    template_sections: dict[str, Any] = Field(default_factory=dict)
    rule_markdown_draft: str
    runtime_rule: dict[str, Any] = Field(default_factory=dict)
    evidence: list[str] = Field(default_factory=list)
    confidence: float = 0.0

    # Where the candidate came from.
    source_document_id: str
    source_document_name: str
    source_chunk_ids: list[str] = Field(default_factory=list)

    # References to a generated asset, when one exists.
    generated_asset_id: str | None = None
    generated_asset_code: str | None = None
    generated_version: str | None = None

    # Validation outcome and lifecycle metadata.
    validation_status: str = "valid"
    validation_errors: list[str] = Field(default_factory=list)
    status: str = "draft"
    created_by: str = "hermes"
    created_at: datetime | None = None
|
||||
|
||||
|
||||
class LlmWikiDocumentRead(BaseModel):
    """Index-level read model for one document in the LLM Wiki."""

    # Document identity.
    document_id: str
    document_name: str
    folder: str
    document_version: str = "v1.0"
    checksum: str = ""

    # Extraction output location and counters.
    extracted_text_path: str
    chunk_count: int = 0
    knowledge_candidate_count: int = 0
    rule_candidate_count: int = 0
    updated_at: datetime | None = None
|
||||
|
||||
|
||||
class LlmWikiDocumentDetailRead(LlmWikiDocumentRead):
    """Detail read model: the index fields plus summary, chunks and candidates."""

    knowledge_summary_markdown: str = ""
    chunks: list[LlmWikiChunkRead] = Field(default_factory=list)
    knowledge_candidates: list[LlmWikiKnowledgeCandidateRead] = Field(default_factory=list)
    rule_candidates: list[LlmWikiRuleCandidateRead] = Field(default_factory=list)
|
||||
|
||||
|
||||
class LlmWikiIndexRead(BaseModel):
    """Read model for the LLM Wiki index: document list plus sync-run count."""

    documents: list[LlmWikiDocumentRead] = Field(default_factory=list)
    sync_run_count: int = 0
|
||||
|
||||
|
||||
class LlmWikiSyncWrite(BaseModel):
    """Request body for triggering an LLM Wiki sync."""

    # Folder to sync; must be a non-empty string.
    folder: str = Field(default="报销制度", min_length=1)
    # Optional list of document ids forwarded to the sync service.
    document_ids: list[str] = Field(default_factory=list)
    # Forwarded to the sync service as its ``force`` flag.
    force: bool = False
|
||||
|
||||
|
||||
class LlmWikiSyncRead(BaseModel):
    """Response body describing the outcome of an LLM Wiki sync."""

    ok: bool = True
    run_id: str
    folder: str

    # Counters reported by the sync.
    document_count: int = 0
    knowledge_candidate_count: int = 0
    rule_candidate_count: int = 0
    generated_rule_count: int = 0
    generated_rule_asset_ids: list[str] = Field(default_factory=list)

    # Human-readable summary; the sync endpoint stores it on the agent run.
    summary: str = ""
|
||||
|
||||
|
||||
class LlmWikiSummaryUpdateWrite(BaseModel):
    """Request body for replacing a document's knowledge summary (non-empty)."""

    knowledge_summary_markdown: str = Field(min_length=1)
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import mimetypes
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
import hashlib
|
||||
import json
|
||||
import mimetypes
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.request import Request, urlopen
|
||||
from uuid import uuid4
|
||||
from xml.etree import ElementTree
|
||||
@@ -16,8 +18,8 @@ from zipfile import BadZipFile, ZipFile
|
||||
import jwt
|
||||
|
||||
from app.api.deps import CurrentUserContext
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging import get_logger
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging import get_logger
|
||||
from app.schemas.knowledge import (
|
||||
KnowledgeDocumentDetailRead,
|
||||
KnowledgeDocumentRead,
|
||||
@@ -64,7 +66,20 @@ IMAGE_EXTENSIONS = {"png", "jpg", "jpeg", "gif", "bmp", "webp", "svg"}
|
||||
ARCHIVE_EXTENSIONS = {"zip", "rar", "7z"}
|
||||
STRUCTURED_PREVIEW_EXTENSIONS = {"docx", "xlsx", "pptx"} | TEXT_EXTENSIONS
|
||||
INLINE_PREVIEW_EXTENSIONS = {"pdf"} | IMAGE_EXTENSIONS
|
||||
ONLYOFFICE_EDITABLE_EXTENSIONS = {"docx", "xlsx", "pptx"}
|
||||
ONLYOFFICE_EDITABLE_EXTENSIONS = {"docx", "xlsx", "pptx"}
|
||||
# NOTE(review): presumably the age (in seconds) after which a "syncing"
# status counts as stale — confirm against `_is_syncing_status_stale`.
KNOWLEDGE_INGEST_SYNC_STALE_SECONDS = 90

# Integer ingest status codes stored in each index entry's "ingest_status".
KNOWLEDGE_INGEST_STATUS_PUBLISHED = 1
KNOWLEDGE_INGEST_STATUS_SYNCING = 2
KNOWLEDGE_INGEST_STATUS_INGESTED = 3
KNOWLEDGE_INGEST_STATUS_FAILED = 4

# Status code -> (display label, tone) used when serializing a document.
KNOWLEDGE_INGEST_STATUS_META = {
    KNOWLEDGE_INGEST_STATUS_PUBLISHED: ("待归纳", "muted"),
    KNOWLEDGE_INGEST_STATUS_SYNCING: ("正归纳", "warning"),
    KNOWLEDGE_INGEST_STATUS_INGESTED: ("已归纳", "success"),
    KNOWLEDGE_INGEST_STATUS_FAILED: ("归纳失败", "danger"),
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
@@ -78,24 +93,40 @@ def prepare_knowledge_library() -> None:
|
||||
KnowledgeService().ensure_library_ready()
|
||||
|
||||
|
||||
class KnowledgeService:
|
||||
def __init__(self, storage_root: Path | None = None) -> None:
|
||||
settings = get_settings()
|
||||
self.storage_root = Path(storage_root or settings.resolved_storage_root_dir)
|
||||
self.library_root = self.storage_root / "knowledge"
|
||||
self.index_path = self.library_root / ".index.json"
|
||||
|
||||
def ensure_library_ready(self) -> None:
|
||||
self.library_root.mkdir(parents=True, exist_ok=True)
|
||||
for folder_name in FIXED_KNOWLEDGE_FOLDERS:
|
||||
(self.library_root / folder_name).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if not self.index_path.exists():
|
||||
self._save_index({"version": 1, "documents": []})
|
||||
|
||||
index = self._load_index()
|
||||
if self._reconcile_index(index):
|
||||
self._save_index(index)
|
||||
class KnowledgeService:
|
||||
def __init__(self, storage_root: Path | None = None) -> None:
    """Resolve every filesystem location used by the knowledge library.

    Args:
        storage_root: Optional override of the storage root; when falsy the
            configured ``resolved_storage_root_dir`` setting is used.
    """
    settings = get_settings()
    root_dir = Path(storage_root or settings.resolved_storage_root_dir)
    library_dir = root_dir / "knowledge"
    wiki_dir = library_dir / ".llm_wiki"

    self.storage_root = root_dir
    self.library_root = library_dir
    self.index_path = library_dir / ".index.json"
    # LLM Wiki artefacts live in a hidden directory inside the library.
    self.llm_wiki_root = wiki_dir
    self.llm_wiki_documents_root = wiki_dir / "documents"
    self.llm_wiki_index_path = wiki_dir / "index.json"
    self.llm_wiki_sync_runs_path = wiki_dir / "sync_runs.json"
|
||||
|
||||
def ensure_library_ready(self) -> None:
    """Create missing library directories and seed files, then reconcile the index.

    Existing files are never overwritten; the document index is only saved
    again when reconciliation reports a change.
    """
    required_dirs = [self.library_root]
    required_dirs.extend(self.library_root / name for name in FIXED_KNOWLEDGE_FOLDERS)
    required_dirs.append(self.llm_wiki_documents_root)
    for directory in required_dirs:
        directory.mkdir(parents=True, exist_ok=True)

    if not self.index_path.exists():
        self._save_index({"version": 1, "documents": []})

    # Seed the LLM Wiki bookkeeping files with empty payloads.
    wiki_seed_files = (
        (self.llm_wiki_index_path, {"documents": []}),
        (self.llm_wiki_sync_runs_path, {"runs": []}),
    )
    for seed_path, seed_payload in wiki_seed_files:
        if seed_path.exists():
            continue
        seed_path.write_text(
            json.dumps(seed_payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    index = self._load_index()
    if self._reconcile_index(index):
        self._save_index(index)
|
||||
|
||||
def list_library(self) -> KnowledgeLibraryRead:
|
||||
documents = self._load_documents()
|
||||
@@ -109,21 +140,23 @@ class KnowledgeService:
|
||||
]
|
||||
return KnowledgeLibraryRead(folders=folders, documents=documents)
|
||||
|
||||
def get_document_detail(self, document_id: str) -> KnowledgeDocumentDetailRead:
|
||||
self.ensure_library_ready()
|
||||
index = self._load_index()
|
||||
entry = self._require_entry(index, document_id)
|
||||
preview_kind, preview_pages = self._build_preview(entry)
|
||||
document = self._serialize_document(entry)
|
||||
def get_document_detail(self, document_id: str) -> KnowledgeDocumentDetailRead:
    """Return the serialized document together with its preview.

    The ingest status of the requested document is reconciled first, and the
    index is saved when that reconciliation changed anything.
    """
    self.ensure_library_ready()
    index = self._load_index()

    status_changed = self._reconcile_document_ingest_statuses(index, document_ids=[document_id])
    if status_changed:
        self._save_index(index)

    entry = self._require_entry(index, document_id)
    kind, pages = self._build_preview(entry)
    base_fields = self._serialize_document(entry).model_dump()
    return KnowledgeDocumentDetailRead(**base_fields, previewKind=kind, previewPages=pages)
|
||||
|
||||
def upload_document(
|
||||
self,
|
||||
folder: str,
|
||||
def upload_document(
|
||||
self,
|
||||
folder: str,
|
||||
filename: str,
|
||||
content: bytes,
|
||||
current_user: CurrentUserContext,
|
||||
@@ -162,22 +195,23 @@ class KnowledgeService:
|
||||
checksum = hashlib.sha256(content).hexdigest()
|
||||
extension = self._extract_extension(normalized_name)
|
||||
|
||||
if existing_entry is None:
|
||||
entry = {
|
||||
"id": document_id,
|
||||
"folder": normalized_folder,
|
||||
"original_name": normalized_name,
|
||||
if existing_entry is None:
|
||||
entry = {
|
||||
"id": document_id,
|
||||
"folder": normalized_folder,
|
||||
"original_name": normalized_name,
|
||||
"stored_name": stored_name,
|
||||
"mime_type": mime_type,
|
||||
"extension": extension,
|
||||
"size_bytes": len(content),
|
||||
"sha256": checksum,
|
||||
"created_at": now,
|
||||
"updated_at": now,
|
||||
"uploaded_by": current_user.name,
|
||||
"version_number": 1,
|
||||
}
|
||||
index["documents"].append(entry)
|
||||
"created_at": now,
|
||||
"updated_at": now,
|
||||
"uploaded_by": current_user.name,
|
||||
"version_number": 1,
|
||||
"ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
|
||||
}
|
||||
index["documents"].append(entry)
|
||||
logger.info(
|
||||
"Knowledge document uploaded id=%s folder=%s filename=%s by=%s",
|
||||
document_id,
|
||||
@@ -193,12 +227,13 @@ class KnowledgeService:
|
||||
"extension": extension,
|
||||
"size_bytes": len(content),
|
||||
"sha256": checksum,
|
||||
"updated_at": now,
|
||||
"uploaded_by": current_user.name,
|
||||
"version_number": int(existing_entry.get("version_number", 1)) + 1,
|
||||
}
|
||||
)
|
||||
entry = existing_entry
|
||||
"updated_at": now,
|
||||
"uploaded_by": current_user.name,
|
||||
"version_number": int(existing_entry.get("version_number", 1)) + 1,
|
||||
"ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
|
||||
}
|
||||
)
|
||||
entry = existing_entry
|
||||
logger.info(
|
||||
"Knowledge document updated id=%s folder=%s filename=%s by=%s",
|
||||
document_id,
|
||||
@@ -222,16 +257,86 @@ class KnowledgeService:
|
||||
self._save_index(index)
|
||||
logger.info("Knowledge document deleted id=%s filename=%s", document_id, entry["original_name"])
|
||||
|
||||
def get_document_content(self, document_id: str) -> tuple[Path, str, str]:
|
||||
self.ensure_library_ready()
|
||||
index = self._load_index()
|
||||
entry = self._require_entry(index, document_id)
|
||||
def get_document_content(self, document_id: str) -> tuple[Path, str, str]:
    """Return ``(file path, mime type, original name)`` of a stored document.

    Raises:
        FileNotFoundError: When the resolved file does not exist on disk;
            the message is the document's original name.
    """
    self.ensure_library_ready()
    entry = self._require_entry(self._load_index(), document_id)
    stored_path = self._resolve_document_path(entry)

    if stored_path.exists():
        return stored_path, entry["mime_type"], entry["original_name"]
    raise FileNotFoundError(entry["original_name"])
|
||||
|
||||
def list_folder_documents(self, folder: str | None = None) -> list[dict[str, Any]]:
    """Return raw index entries, optionally restricted to one folder.

    Ingest statuses of all documents are reconciled (and saved on change)
    before the entries are returned.

    Args:
        folder: Folder name to filter by after normalization; ``None``
            returns every entry.
    """
    self.ensure_library_ready()
    index = self._load_index()
    if self._reconcile_document_ingest_statuses(index):
        self._save_index(index)

    entries = list(index.get("documents") or [])
    if folder is not None:
        wanted_folder = self._normalize_folder(folder)
        entries = [entry for entry in entries if entry.get("folder") == wanted_folder]
    return entries
|
||||
|
||||
def get_document_entry(self, document_id: str) -> dict[str, Any]:
    """Return a shallow copy of the index entry for ``document_id``.

    The entry's ingest status is reconciled first; the index is saved when
    that changed anything.
    """
    self.ensure_library_ready()
    index = self._load_index()

    needs_save = self._reconcile_document_ingest_statuses(index, document_ids=[document_id])
    if needs_save:
        self._save_index(index)

    entry = self._require_entry(index, document_id)
    # Copy so callers cannot mutate the loaded index through the result.
    return dict(entry)
|
||||
|
||||
def set_document_ingest_statuses(self, document_ids: list[str], status_code: int) -> None:
    """Set the ingest status of the given documents and stamp the change time.

    Entries whose normalized status already equals ``status_code`` are left
    untouched (including their timestamp). The index is saved only when at
    least one entry changed.

    Args:
        document_ids: Document ids; blank ids are ignored.
        status_code: Status code written to each matching entry.
    """
    self.ensure_library_ready()

    wanted_ids: set[str] = set()
    for raw_id in document_ids:
        cleaned = str(raw_id).strip()
        if cleaned:
            wanted_ids.add(cleaned)
    if not wanted_ids:
        return

    index = self._load_index()
    stamp = datetime.now(UTC).isoformat()
    touched = 0
    for entry in index.get("documents", []):
        entry_id = str(entry.get("id") or "").strip()
        if entry_id in wanted_ids and (
            self._normalize_ingest_status_code(entry.get("ingest_status")) != status_code
        ):
            entry["ingest_status"] = status_code
            entry["ingest_status_updated_at"] = stamp
            touched += 1

    if touched:
        self._save_index(index)
|
||||
|
||||
def refresh_document_ingest_statuses(
    self,
    document_ids: list[str] | None = None,
    *,
    preserve_syncing: bool = True,
) -> None:
    """Reconcile ingest statuses against the LLM Wiki index and persist changes.

    Args:
        document_ids: Restrict reconciliation to these ids; ``None`` (or an
            empty list) reconciles every document.
        preserve_syncing: Forwarded to the reconciliation step, which skips
            non-stale "syncing" entries when it is true.
    """
    self.ensure_library_ready()
    index = self._load_index()
    index_changed = self._reconcile_document_ingest_statuses(
        index,
        document_ids=document_ids,
        preserve_syncing=preserve_syncing,
    )
    if not index_changed:
        return
    self._save_index(index)
|
||||
|
||||
def get_llm_wiki_root(self) -> Path:
    """Return the LLM Wiki root directory after preparing the library."""
    self.ensure_library_ready()
    wiki_root = self.llm_wiki_root
    return wiki_root
|
||||
|
||||
def extract_document_text(self, document_id: str) -> str:
    """Extract the text of a stored document.

    Raises:
        FileNotFoundError: When the document's file is missing on disk; the
            message is the document's original name.
    """
    self.ensure_library_ready()
    entry = self.get_document_entry(document_id)
    source_path = self._resolve_document_path(entry)
    if not source_path.exists():
        raise FileNotFoundError(entry["original_name"])

    # Fall back to the on-disk name / a generic mime type when the index
    # entry carries no usable value.
    display_name = str(entry.get("original_name") or source_path.name)
    content_type = str(entry.get("mime_type") or "application/octet-stream")
    return self._extract_document_text_from_path(
        file_path=source_path,
        original_name=display_name,
        mime_type=content_type,
    )
|
||||
|
||||
def build_onlyoffice_config(
|
||||
self,
|
||||
@@ -365,33 +470,41 @@ class KnowledgeService:
|
||||
actor_name = callback.users[0] if callback.users else "ONLYOFFICE"
|
||||
self._replace_document_content(document_id, content, actor_name=actor_name)
|
||||
|
||||
def _load_documents(self) -> list[KnowledgeDocumentRead]:
|
||||
self.ensure_library_ready()
|
||||
index = self._load_index()
|
||||
self._reconcile_index(index)
|
||||
self._save_index(index)
|
||||
|
||||
documents = [self._serialize_document(entry) for entry in index["documents"]]
|
||||
return sorted(documents, key=lambda item: item.time, reverse=True)
|
||||
|
||||
def _serialize_document(self, entry: dict[str, Any]) -> KnowledgeDocumentRead:
|
||||
def _load_documents(self) -> list[KnowledgeDocumentRead]:
    """Return all documents serialized and sorted by ``time``, newest first.

    Both the file-level index and the ingest statuses are reconciled; the
    index is saved once if either step changed it.
    """
    self.ensure_library_ready()
    index = self._load_index()

    # Run both reconciliation steps unconditionally; each may mutate the index.
    files_changed = self._reconcile_index(index)
    statuses_changed = self._reconcile_document_ingest_statuses(index)
    if files_changed or statuses_changed:
        self._save_index(index)

    serialized = [self._serialize_document(item) for item in index["documents"]]
    serialized.sort(key=lambda document: document.time, reverse=True)
    return serialized
|
||||
|
||||
def _serialize_document(self, entry: dict[str, Any]) -> KnowledgeDocumentRead:
|
||||
extension = entry.get("extension") or self._extract_extension(entry["original_name"])
|
||||
file_type = self._resolve_file_type(extension)
|
||||
size_bytes = int(entry.get("size_bytes") or 0)
|
||||
updated_at = self._format_time(entry.get("updated_at") or entry.get("created_at"))
|
||||
|
||||
return KnowledgeDocumentRead(
|
||||
id=entry["id"],
|
||||
name=entry["original_name"],
|
||||
folder=entry["folder"],
|
||||
tag=f"{entry['folder']} / {extension.upper() or 'FILE'}",
|
||||
time=updated_at,
|
||||
version=f"v{int(entry.get('version_number', 1))}.0",
|
||||
state="已发布",
|
||||
stateTone="success",
|
||||
owner=entry.get("uploaded_by") or "系统导入",
|
||||
icon=ICON_BY_TYPE.get(file_type, ICON_BY_TYPE["binary"]),
|
||||
fileType=file_type,
|
||||
file_type = self._resolve_file_type(extension)
|
||||
size_bytes = int(entry.get("size_bytes") or 0)
|
||||
updated_at = self._format_time(entry.get("updated_at") or entry.get("created_at"))
|
||||
state_code = self._normalize_ingest_status_code(entry.get("ingest_status"))
|
||||
state_label, state_tone = KNOWLEDGE_INGEST_STATUS_META.get(
|
||||
state_code,
|
||||
KNOWLEDGE_INGEST_STATUS_META[KNOWLEDGE_INGEST_STATUS_PUBLISHED],
|
||||
)
|
||||
|
||||
return KnowledgeDocumentRead(
|
||||
id=entry["id"],
|
||||
name=entry["original_name"],
|
||||
folder=entry["folder"],
|
||||
tag=f"{entry['folder']} / {extension.upper() or 'FILE'}",
|
||||
time=updated_at,
|
||||
version=f"v{int(entry.get('version_number', 1))}.0",
|
||||
stateCode=state_code,
|
||||
state=state_label,
|
||||
stateTone=state_tone,
|
||||
owner=entry.get("uploaded_by") or "系统导入",
|
||||
icon=ICON_BY_TYPE.get(file_type, ICON_BY_TYPE["binary"]),
|
||||
fileType=file_type,
|
||||
fileTypeLabel=self._resolve_file_type_label(file_type),
|
||||
summary=f"{entry['folder']} · {extension.upper() or 'FILE'} · {self._format_size(size_bytes)}",
|
||||
mimeType=entry.get("mime_type") or "application/octet-stream",
|
||||
@@ -551,27 +664,31 @@ class KnowledgeService:
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
def _reconcile_index(self, index: dict[str, Any]) -> bool:
|
||||
changed = False
|
||||
documents = index.setdefault("documents", [])
|
||||
def _reconcile_index(self, index: dict[str, Any]) -> bool:
|
||||
changed = False
|
||||
documents = index.setdefault("documents", [])
|
||||
known_by_stored = {
|
||||
(item["folder"], item["stored_name"]): item
|
||||
for item in documents
|
||||
if item.get("folder") and item.get("stored_name")
|
||||
}
|
||||
|
||||
existing_items: list[dict[str, Any]] = []
|
||||
for item in documents:
|
||||
file_path = self._resolve_document_path(item)
|
||||
if file_path.exists():
|
||||
item["size_bytes"] = file_path.stat().st_size
|
||||
item["extension"] = self._extract_extension(item["original_name"])
|
||||
item["mime_type"] = item.get("mime_type") or (
|
||||
mimetypes.guess_type(item["original_name"])[0] or "application/octet-stream"
|
||||
)
|
||||
existing_items.append(item)
|
||||
else:
|
||||
changed = True
|
||||
existing_items: list[dict[str, Any]] = []
|
||||
for item in documents:
|
||||
file_path = self._resolve_document_path(item)
|
||||
if file_path.exists():
|
||||
item["size_bytes"] = file_path.stat().st_size
|
||||
item["extension"] = self._extract_extension(item["original_name"])
|
||||
item["mime_type"] = item.get("mime_type") or (
|
||||
mimetypes.guess_type(item["original_name"])[0] or "application/octet-stream"
|
||||
)
|
||||
normalized_status = self._normalize_ingest_status_code(item.get("ingest_status"))
|
||||
if item.get("ingest_status") != normalized_status:
|
||||
item["ingest_status"] = normalized_status
|
||||
changed = True
|
||||
existing_items.append(item)
|
||||
else:
|
||||
changed = True
|
||||
|
||||
for folder_name in FIXED_KNOWLEDGE_FOLDERS:
|
||||
folder_path = self.library_root / folder_name
|
||||
@@ -596,18 +713,128 @@ class KnowledgeService:
|
||||
"extension": self._extract_extension(original_name),
|
||||
"size_bytes": stat.st_size,
|
||||
"sha256": "",
|
||||
"created_at": datetime.fromtimestamp(stat.st_ctime, tz=UTC).isoformat(),
|
||||
"updated_at": datetime.fromtimestamp(stat.st_mtime, tz=UTC).isoformat(),
|
||||
"uploaded_by": "系统导入",
|
||||
"version_number": 1,
|
||||
}
|
||||
)
|
||||
changed = True
|
||||
"created_at": datetime.fromtimestamp(stat.st_ctime, tz=UTC).isoformat(),
|
||||
"updated_at": datetime.fromtimestamp(stat.st_mtime, tz=UTC).isoformat(),
|
||||
"uploaded_by": "系统导入",
|
||||
"version_number": 1,
|
||||
"ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
|
||||
}
|
||||
)
|
||||
changed = True
|
||||
|
||||
if changed or len(existing_items) != len(documents):
|
||||
index["documents"] = existing_items
|
||||
return True
|
||||
return False
|
||||
if changed or len(existing_items) != len(documents):
|
||||
index["documents"] = existing_items
|
||||
return True
|
||||
return False
|
||||
|
||||
def _reconcile_document_ingest_statuses(
    self,
    index: dict[str, Any],
    *,
    document_ids: list[str] | None = None,
    preserve_syncing: bool = True,
) -> bool:
    """Align each entry's ingest status with the LLM Wiki index (in place).

    Rules applied per entry:
      * the stored status is normalized to a known code;
      * a "syncing" entry is left alone while ``preserve_syncing`` is true
        and its status is not stale;
      * an entry with a matching ingested wiki document becomes "ingested";
      * otherwise it becomes "published", except that "failed" is kept.

    Args:
        index: Loaded document index; its entries are mutated.
        document_ids: Restrict to these ids; ``None``/empty means all.
        preserve_syncing: Whether non-stale "syncing" entries are skipped.

    Returns:
        True when at least one entry was modified.
    """
    wanted_ids: set[str] = set()
    for raw_id in document_ids or []:
        cleaned = str(raw_id).strip()
        if cleaned:
            wanted_ids.add(cleaned)

    # Wiki documents keyed by stripped document id; later duplicates win.
    wiki_lookup: dict[str, Any] = {}
    for wiki_doc in list(self._load_llm_wiki_index().get("documents") or []):
        wiki_id = str(wiki_doc.get("document_id") or "").strip()
        if wiki_id:
            wiki_lookup[wiki_id] = wiki_doc

    dirty = False
    for entry in index.get("documents", []):
        entry_id = str(entry.get("id") or "").strip()
        if wanted_ids and entry_id not in wanted_ids:
            continue

        status_now = self._normalize_ingest_status_code(entry.get("ingest_status"))
        if entry.get("ingest_status") != status_now:
            entry["ingest_status"] = status_now
            dirty = True

        if status_now == KNOWLEDGE_INGEST_STATUS_SYNCING and preserve_syncing:
            if not self._is_syncing_status_stale(entry):
                continue

        if self._has_ingested_llm_wiki_document(entry, wiki_lookup.get(entry_id)):
            target_status = KNOWLEDGE_INGEST_STATUS_INGESTED
        else:
            # A failed entry stays failed until a matching wiki document exists.
            if status_now == KNOWLEDGE_INGEST_STATUS_FAILED:
                continue
            target_status = KNOWLEDGE_INGEST_STATUS_PUBLISHED

        if status_now != target_status:
            entry["ingest_status"] = target_status
            dirty = True

    return dirty
|
||||
|
||||
def _load_llm_wiki_index(self) -> dict[str, Any]:
    """Load the LLM Wiki index file, falling back to an empty index.

    Returns:
        The parsed JSON object with a guaranteed list under ``"documents"``.
        An empty index (``{"documents": []}``) is returned when the file is
        missing, is not valid UTF-8 JSON, or does not contain a JSON object.
    """
    try:
        payload = json.loads(self.llm_wiki_index_path.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        payload = None

    # json.loads may legitimately return a list, string, number or None;
    # only an object can act as the index.
    if not isinstance(payload, dict):
        payload = {}
    # Covers both a missing key and a non-list value such as null.
    if not isinstance(payload.get("documents"), list):
        payload["documents"] = []
    return payload
|
||||
|
||||
def _has_ingested_llm_wiki_document(
|
||||
self,
|
||||
entry: dict[str, Any],
|
||||
wiki_document: dict[str, Any] | None,
|
||||
) -> bool:
|
||||
if not isinstance(wiki_document, dict):
|
||||
return False
|
||||
if int(wiki_document.get("knowledge_candidate_count") or 0) <= 0:
|
||||
return False
|
||||
|
||||
current_signature = self._build_llm_wiki_document_signature(entry)
|
||||
wiki_signature = wiki_document.get("signature")
|
||||
if isinstance(wiki_signature, dict):
|
||||
return wiki_signature == current_signature
|
||||
|
||||
return (
|
||||
str(wiki_document.get("document_id") or "").strip() == str(entry.get("id") or "").strip()
|
||||
and str(wiki_document.get("checksum") or "").strip() == str(entry.get("sha256") or "").strip()
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _build_llm_wiki_document_signature(entry: dict[str, Any]) -> dict[str, Any]:
|
||||
return {
|
||||
"document_id": str(entry.get("id") or ""),
|
||||
"original_name": str(entry.get("original_name") or ""),
|
||||
"stored_name": str(entry.get("stored_name") or ""),
|
||||
"sha256": str(entry.get("sha256") or ""),
|
||||
"version_number": int(entry.get("version_number") or 1),
|
||||
"updated_at": str(entry.get("updated_at") or ""),
|
||||
}
|
||||
|
||||
@staticmethod
def _normalize_ingest_status_code(value: Any) -> int:
    """Coerce ``value`` to a known ingest status code.

    Returns ``KNOWLEDGE_INGEST_STATUS_PUBLISHED`` when ``int(value)`` raises
    ``TypeError`` or ``ValueError``, or when the resulting code is not a key
    of ``KNOWLEDGE_INGEST_STATUS_META``.
    """
    try:
        candidate = int(value)
    except (TypeError, ValueError):
        return KNOWLEDGE_INGEST_STATUS_PUBLISHED
    else:
        if candidate in KNOWLEDGE_INGEST_STATUS_META:
            return candidate
        return KNOWLEDGE_INGEST_STATUS_PUBLISHED
|
||||
|
||||
@staticmethod
|
||||
def _is_syncing_status_stale(entry: dict[str, Any]) -> bool:
|
||||
raw_value = str(entry.get("ingest_status_updated_at") or "").strip()
|
||||
if not raw_value:
|
||||
return True
|
||||
try:
|
||||
updated_at = datetime.fromisoformat(raw_value)
|
||||
except ValueError:
|
||||
return True
|
||||
if updated_at.tzinfo is None:
|
||||
updated_at = updated_at.replace(tzinfo=UTC)
|
||||
age_seconds = (datetime.now(UTC) - updated_at.astimezone(UTC)).total_seconds()
|
||||
return age_seconds >= KNOWLEDGE_INGEST_SYNC_STALE_SECONDS
|
||||
|
||||
def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
|
||||
for entry in index["documents"]:
|
||||
@@ -746,27 +973,109 @@ class KnowledgeService:
|
||||
def _can_preview(extension: str) -> bool:
    """Return whether ``extension`` is in either preview extension collection."""
    preview_groups = (INLINE_PREVIEW_EXTENSIONS, STRUCTURED_PREVIEW_EXTENSIONS)
    return any(extension in group for group in preview_groups)
|
||||
|
||||
@staticmethod
|
||||
def _read_text_preview(file_path: Path) -> str:
|
||||
encodings = ("utf-8", "utf-8-sig", "gbk")
|
||||
for encoding in encodings:
|
||||
try:
|
||||
@staticmethod
|
||||
def _read_text_preview(file_path: Path) -> str:
|
||||
encodings = ("utf-8", "utf-8-sig", "gbk")
|
||||
for encoding in encodings:
|
||||
try:
|
||||
return file_path.read_text(encoding=encoding)
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
return "当前文本文件编码暂不支持在线解析。"
|
||||
|
||||
@staticmethod
|
||||
def _extract_docx_text(file_path: Path) -> str:
|
||||
try:
|
||||
with ZipFile(file_path) as archive:
|
||||
xml_content = archive.read("word/document.xml")
|
||||
except (BadZipFile, KeyError):
|
||||
return "当前 Word 文件解析失败。"
|
||||
def _extract_docx_text(file_path: Path) -> str:
|
||||
try:
|
||||
with ZipFile(file_path) as archive:
|
||||
xml_content = archive.read("word/document.xml")
|
||||
except (BadZipFile, KeyError):
|
||||
return "当前 Word 文件解析失败。"
|
||||
|
||||
root = ElementTree.fromstring(xml_content)
|
||||
texts = [node.text.strip() for node in root.iter() if node.tag.endswith("}t") and node.text]
|
||||
return "\n".join(texts)
|
||||
texts = [node.text.strip() for node in root.iter() if node.tag.endswith("}t") and node.text]
|
||||
return "\n".join(texts)
|
||||
|
||||
def _extract_document_text_from_path(
    self,
    *,
    file_path: Path,
    original_name: str,
    mime_type: str,
) -> str:
    """Extract normalized text from a stored document, by file extension.

    Args:
        file_path: Location of the stored file.
        original_name: Original file name; its extension selects the
            extraction strategy.
        mime_type: MIME type forwarded to OCR.

    Returns:
        Normalized text for text files, ``docx``, ``pdf`` and image files;
        ``""`` for any other extension. A PDF goes through OCR only when
        direct text extraction yields nothing.
    """
    extension = self._extract_extension(original_name)
    clean = self._normalize_extracted_text

    def ocr_text() -> str:
        return clean(
            self._extract_text_with_ocr(
                file_path=file_path,
                original_name=original_name,
                mime_type=mime_type,
            )
        )

    if extension in TEXT_EXTENSIONS:
        return clean(self._read_text_preview(file_path))
    if extension == "docx":
        return clean(self._extract_docx_text(file_path))
    if extension == "pdf":
        # Fall back to OCR only when the PDF produced no text.
        return clean(self._extract_pdf_text(file_path)) or ocr_text()
    if extension in IMAGE_EXTENSIONS:
        return ocr_text()
    return ""
|
||||
|
||||
@staticmethod
|
||||
def _normalize_extracted_text(text: str) -> str:
|
||||
normalized = str(text or "").replace("\r\n", "\n").replace("\r", "\n")
|
||||
normalized = re.sub(r"\n{3,}", "\n\n", normalized)
|
||||
return normalized.strip()
|
||||
|
||||
@staticmethod
|
||||
def _extract_pdf_text(file_path: Path) -> str:
|
||||
pdftotext_bin = shutil.which("pdftotext")
|
||||
if not pdftotext_bin:
|
||||
return ""
|
||||
|
||||
completed = subprocess.run(
|
||||
[pdftotext_bin, str(file_path), "-"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=40,
|
||||
check=False,
|
||||
)
|
||||
if completed.returncode != 0:
|
||||
return ""
|
||||
return str(completed.stdout or "")
|
||||
|
||||
@staticmethod
|
||||
def _extract_text_with_ocr(
|
||||
*,
|
||||
file_path: Path,
|
||||
original_name: str,
|
||||
mime_type: str,
|
||||
) -> str:
|
||||
try:
|
||||
from app.services.ocr import OcrService
|
||||
|
||||
result = OcrService().recognize_files(
|
||||
[(original_name, file_path.read_bytes(), mime_type)]
|
||||
)
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
parts: list[str] = []
|
||||
for document in result.documents:
|
||||
text = str(getattr(document, "text", "") or "").strip()
|
||||
summary = str(getattr(document, "summary", "") or "").strip()
|
||||
if text:
|
||||
parts.append(text)
|
||||
elif summary:
|
||||
parts.append(summary)
|
||||
return "\n\n".join(part for part in parts if part)
|
||||
|
||||
@staticmethod
|
||||
def _extract_xlsx_sheets(file_path: Path) -> list[tuple[str, list[list[str]]]]:
|
||||
|
||||
Reference in New Issue
Block a user