Add MinerU document ingestion support

Normalize uploaded documents into structured markdown, add clearer parser errors for missing dependencies, and cover the ingestion flow with backend tests. This also replaces deprecated UTC timestamp helpers in the touched backend paths so the knowledge pipeline stays warning-free.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 13:42:16 +08:00
parent a9ddf3c9b4
commit 3ee825aa90
20 changed files with 2159 additions and 156 deletions
--- a/backend/app/services/stats_service.py
+++ b/backend/app/services/stats_service.py
@@ -1,6 +1,10 @@
-import psutil
 import time
-from datetime import datetime, timedelta
+
+try:
+    import psutil
+except ModuleNotFoundError:  # pragma: no cover - optional runtime dependency fallback
+    psutil = None
+from datetime import UTC, datetime, timedelta
 from sqlalchemy import select, func, and_
 from sqlalchemy.orm import Session
 from app.models.conversation import Conversation, Message
@@ -16,6 +20,19 @@ class StatsService:

    def get_system_health(self) -> dict:
        """获取系统健康指标"""
+        if psutil is None:
+            return {
+                "uptime_seconds": 0,
+                "cpu_percent": 0.0,
+                "memory_used_mb": 0.0,
+                "memory_total_mb": 0.0,
+                "memory_percent": 0.0,
+                "disk_used_gb": 0.0,
+                "disk_total_gb": 0.0,
+                "disk_percent": 0.0,
+                "active_users_24h": 0,
+            }
+
        uptime_seconds = int(time.time() - psutil.boot_time())
        cpu_percent = psutil.cpu_percent(interval=0.1)
        mem = psutil.virtual_memory()
@@ -35,7 +52,7 @@ class StatsService:

    def _get_daily_stats(self, model, date_column, user_id=None, days=30) -> list:
        """通用每日统计查询"""
-        cutoff = datetime.utcnow() - timedelta(days=days)
+        cutoff = datetime.now(UTC) - timedelta(days=days)
        query = self.db.query(
            func.date(date_column).label('date'),
            func.count().label('count')
@@ -50,7 +67,7 @@ class StatsService:

    def get_conversation_stats(self, user_id: str = None, days=30) -> dict:
        """获取对话统计数据"""
-        cutoff = datetime.utcnow() - timedelta(days=days)
+        cutoff = datetime.now(UTC) - timedelta(days=days)

        daily_conversations = self._get_daily_stats(
            Conversation, Conversation.created_at, user_id, days
@@ -100,7 +117,7 @@ class StatsService:

    def get_knowledge_stats(self, user_id: str = None, days=30) -> dict:
        """获取知识库统计数据"""
-        cutoff = datetime.utcnow() - timedelta(days=days)
+        cutoff = datetime.now(UTC) - timedelta(days=days)

        # New tags
        tag_query = self.db.query(
@@ -145,7 +162,7 @@ class StatsService:
            func.date(Task.completed_at).label('date'),
            func.count().label('count')
        ).filter(
-            Task.completed_at >= datetime.utcnow() - timedelta(days=days),
+            Task.completed_at >= datetime.now(UTC) - timedelta(days=days),
            Task.status == TaskStatus.DONE
        )
        if user_id:
@@ -195,7 +212,7 @@ class StatsService:
            func.date(ForumPost.updated_at).label('date'),
            func.count().label('count')
        ).filter(
-            ForumPost.updated_at >= datetime.utcnow() - timedelta(days=days),
+            ForumPost.updated_at >= datetime.now(UTC) - timedelta(days=days),
            ForumPost.is_executed == True
        )
        if user_id:
@@ -243,7 +260,7 @@ class StatsService:
        top_tags = [{"tag_path": r.tag_path, "usage_count": r.usage_count} for r in tag_query.all()]

        # Token trend
-        now = datetime.utcnow()
+        now = datetime.now(UTC)
        this_month_start = datetime(now.year, now.month, 1)
        last_month_end = this_month_start - timedelta(days=1)
        last_month_start = datetime(last_month_end.year, last_month_end.month, 1)