feat(server): 系统缓存清理接口与 OCR 文本层兜底增强

- 新增 system_cache 模块与 POST /settings/cache/clear，管理员可一键清理 OCR 结果/运行时配置/模型失败冷却/知识库索引/地点语义等进程内缓存
- 各服务暴露 clear_*_cache 方法（ocr/runtime_settings/runtime_chat/knowledge/application_location_semantic），SettingsCacheClearRead 汇总清理项
- OCR 转图片失败时尝试用 PDF 文本层兜底构建识别文档（有效字符≥8），并写结果缓存；OcrService 暴露 clear_result_cache
- receipt_folder 车票过滤补充身份证号关键词，附件文档/操作/展示模块同步适配
- 新增 system_cache_endpoints 测试，更新 openapi_schema/ocr/receipt_folder/attachment_association_jobs 测试
2026-06-24 12:35:51 +08:00
parent 50d2dc579a
commit 9a5ed0e94a
17 changed files with 932 additions and 13 deletions
--- a/server/src/app/services/receipt_folder.py
+++ b/server/src/app/services/receipt_folder.py
@@ -889,6 +889,8 @@ class ReceiptFolderTrainTicketMixin:
                "无效",
                "二维码",
                "座席",
+                "身份",
+                "身份证号",
                "证件",
            )
        ):
@@ -993,6 +995,11 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
                current_user=current_user,
            )
            if duplicate_receipt is not None:
+                duplicate_receipt = self._refresh_duplicate_receipt_from_document_if_stronger(
+                    receipt=duplicate_receipt,
+                    document=document,
+                    current_user=current_user,
+                )
                warning = "已上传过同样的单据，请不要重复上传。"
                existing_warnings = [str(item) for item in list(document.warnings or []) if str(item).strip()]
                enriched.append(
@@ -1061,6 +1068,7 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
                        if str(value).strip()
                    ],
                    "document_fields": self._build_ocr_document_fields_from_meta(meta),
+                    "preview_kind": str(meta.get("preview_kind") or document.preview_kind or ""),
                }
            )

@@ -1073,6 +1081,62 @@ class ReceiptFolderService(ReceiptFolderStorageMixin, ReceiptFolderItemMixin, Re
            update["warnings"] = list(dict.fromkeys(warnings))
        return document.model_copy(update=update)

+    def _refresh_duplicate_receipt_from_document_if_stronger(  # Refresh a duplicate receipt's stored OCR meta when the new document's meta is judged stronger.
+        self,
+        *,
+        receipt: ReceiptFolderItemRead,  # existing receipt that was matched as a duplicate
+        document: OcrRecognizeDocumentRead,  # newly recognized document to compare against the stored meta
+        current_user: CurrentUserContext,  # passed to the meta reader and used to derive the owner key
+    ) -> ReceiptFolderItemRead:  # the input receipt unchanged, or an item rebuilt from the refreshed meta
+        try:
+            meta = self._read_receipt_meta(receipt.id, current_user)
+        except FileNotFoundError:  # no stored meta to refresh; keep the receipt as-is
+            return receipt
+
+        incoming_meta = self._build_document_meta(document)
+        if not self._is_incoming_document_meta_stronger(meta, incoming_meta):  # incoming meta adds nothing the stored meta lacks
+            return receipt
+
+        for key in (  # only these OCR-derived keys are overwritten; every other stored key is left untouched
+            "engine",
+            "model",
+            "ocr_text",
+            "summary",
+            "ocr_avg_score",
+            "ocr_line_count",
+            "page_count",
+            "document_type",
+            "document_type_label",
+            "scene_code",
+            "scene_label",
+            "ocr_classification_source",
+            "ocr_classification_confidence",
+            "ocr_classification_evidence",
+            "document_fields",
+            "ocr_warnings",
+        ):
+            meta[key] = incoming_meta[key]  # NOTE(review): raises KeyError if _build_document_meta omits any listed key - confirm it always sets all of them
+        meta["updated_at"] = datetime.now(UTC).isoformat()  # timezone-aware UTC timestamp as an ISO 8601 string
+        self._write_meta(self._receipt_dir(self._owner_key(current_user), receipt.id), meta)  # hands the updated meta to _write_meta for this owner's receipt directory
+        return self._build_item(meta)
+
+    @staticmethod
+    def _is_incoming_document_meta_stronger(existing_meta: dict[str, Any], incoming_meta: dict[str, Any]) -> bool:  # True when the incoming meta supplies a type, fields, or text that the existing meta lacks.
+        existing_type = str(existing_meta.get("document_type") or "other").strip() or "other"  # missing or blank type is normalized to "other"
+        incoming_type = str(incoming_meta.get("document_type") or "other").strip() or "other"
+        existing_fields = [field for field in list(existing_meta.get("document_fields") or []) if isinstance(field, dict)]  # entries that are not dicts are ignored
+        incoming_fields = [field for field in list(incoming_meta.get("document_fields") or []) if isinstance(field, dict)]
+        existing_text = str(existing_meta.get("ocr_text") or "").strip()  # whitespace-only text counts as empty
+        incoming_text = str(incoming_meta.get("ocr_text") or "").strip()
+
+        if incoming_type != "other" and existing_type == "other":  # incoming has a specific type while the existing one is "other"
+            return True
+        if incoming_fields and not existing_fields:  # incoming has dict fields while the existing meta has none
+            return True
+        if incoming_text and not existing_text:  # incoming has OCR text while the existing meta has none
+            return True
+        return False  # otherwise not stronger; only presence is compared, never amount or content
+
    def _build_ocr_document_fields_from_meta(self, meta: dict[str, Any]) -> list[OcrRecognizeFieldRead]:
        return [
            OcrRecognizeFieldRead(