Add MinerU document ingestion support

Normalize uploaded documents into structured Markdown, add clearer parser
errors for missing dependencies, and cover the ingestion flow with
backend tests. This also replaces deprecated UTC timestamp helpers
(e.g. `datetime.utcnow()` -> timezone-aware `datetime.now(UTC)`) in
the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions

View File

@@ -1,6 +1,10 @@
import psutil
import time
from datetime import datetime, timedelta
try:
import psutil
except ModuleNotFoundError: # pragma: no cover - optional runtime dependency fallback
psutil = None
from datetime import UTC, datetime, timedelta
from sqlalchemy import select, func, and_
from sqlalchemy.orm import Session
from app.models.conversation import Conversation, Message
@@ -16,6 +20,19 @@ class StatsService:
def get_system_health(self) -> dict:
"""获取系统健康指标"""
if psutil is None:
return {
"uptime_seconds": 0,
"cpu_percent": 0.0,
"memory_used_mb": 0.0,
"memory_total_mb": 0.0,
"memory_percent": 0.0,
"disk_used_gb": 0.0,
"disk_total_gb": 0.0,
"disk_percent": 0.0,
"active_users_24h": 0,
}
uptime_seconds = int(time.time() - psutil.boot_time())
cpu_percent = psutil.cpu_percent(interval=0.1)
mem = psutil.virtual_memory()
@@ -35,7 +52,7 @@ class StatsService:
def _get_daily_stats(self, model, date_column, user_id=None, days=30) -> list:
"""通用每日统计查询"""
cutoff = datetime.utcnow() - timedelta(days=days)
cutoff = datetime.now(UTC) - timedelta(days=days)
query = self.db.query(
func.date(date_column).label('date'),
func.count().label('count')
@@ -50,7 +67,7 @@ class StatsService:
def get_conversation_stats(self, user_id: str = None, days=30) -> dict:
"""获取对话统计数据"""
cutoff = datetime.utcnow() - timedelta(days=days)
cutoff = datetime.now(UTC) - timedelta(days=days)
daily_conversations = self._get_daily_stats(
Conversation, Conversation.created_at, user_id, days
@@ -100,7 +117,7 @@ class StatsService:
def get_knowledge_stats(self, user_id: str = None, days=30) -> dict:
"""获取知识库统计数据"""
cutoff = datetime.utcnow() - timedelta(days=days)
cutoff = datetime.now(UTC) - timedelta(days=days)
# New tags
tag_query = self.db.query(
@@ -145,7 +162,7 @@ class StatsService:
func.date(Task.completed_at).label('date'),
func.count().label('count')
).filter(
Task.completed_at >= datetime.utcnow() - timedelta(days=days),
Task.completed_at >= datetime.now(UTC) - timedelta(days=days),
Task.status == TaskStatus.DONE
)
if user_id:
@@ -195,7 +212,7 @@ class StatsService:
func.date(ForumPost.updated_at).label('date'),
func.count().label('count')
).filter(
ForumPost.updated_at >= datetime.utcnow() - timedelta(days=days),
ForumPost.updated_at >= datetime.now(UTC) - timedelta(days=days),
ForumPost.is_executed == True
)
if user_id:
@@ -243,7 +260,7 @@ class StatsService:
top_tags = [{"tag_path": r.tag_path, "usage_count": r.usage_count} for r in tag_query.all()]
# Token trend
now = datetime.utcnow()
now = datetime.now(UTC)
this_month_start = datetime(now.year, now.month, 1)
last_month_end = this_month_start - timedelta(days=1)
last_month_start = datetime(last_month_end.year, last_month_end.month, 1)