Add MinerU document ingestion support

Normalize uploaded documents into structured Markdown, add clearer parser
errors for missing dependencies, and cover the ingestion flow with
backend tests. This also replaces deprecated UTC timestamp helpers in
the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions

View File

@@ -11,6 +11,13 @@ class DocumentOut(BaseModel):
summary: str | None
chunk_count: int
is_indexed: bool
ingestion_status: str
ingestion_error: str | None
indexed_at: datetime | None
parser_version: str | None
index_version: str | None
normalized_format: str | None
folder_id: str | None
created_at: datetime
model_config = {"from_attributes": True}
@@ -25,6 +32,10 @@ class DocumentChunkOut(BaseModel):
model_config = {"from_attributes": True}
# Schema with a single field, ``content``, built on ``BaseModel``
# (presumably pydantic's; the import is not visible in this excerpt).
# NOTE(review): the name suggests this is the payload for editing an existing
# chunk's text; confirm against the route that consumes it.
class DocumentChunkUpdate(BaseModel):
# No default is given, so under pydantic semantics this field is required.
content: str
class SearchRequest(BaseModel):
query: str
top_k: int = 5