feat(server): 系统缓存清理接口与 OCR 文本层兜底增强

- 新增 system_cache 模块与 POST /settings/cache/clear，管理员可一键清理 OCR 结果/运行时配置/模型失败冷却/知识库索引/地点语义等进程内缓存
- 各服务暴露 clear_*_cache 方法（ocr/runtime_settings/runtime_chat/knowledge/application_location_semantic），SettingsCacheClearRead 汇总清理项
- OCR 转图片失败时尝试用 PDF 文本层兜底构建识别文档（有效字符≥8），并写结果缓存；OcrService 暴露 clear_result_cache
- receipt_folder 车票过滤补充身份证号关键词，附件文档/操作/展示模块同步适配
- 新增 system_cache_endpoints 测试，更新 openapi_schema/ocr/receipt_folder/attachment_association_jobs 测试
2026-06-24 12:35:51 +08:00
parent 50d2dc579a
commit 9a5ed0e94a
17 changed files with 932 additions and 13 deletions
--- a/server/src/app/services/ocr.py
+++ b/server/src/app/services/ocr.py
@@ -148,13 +148,23 @@ class OcrService:
                        for item in pdf_inputs:
                            cache_keys_by_source.setdefault(item.source_key, cache_key)
                    except RuntimeError as exc:
-                        documents.append(
-                            OcrRecognizeDocumentRead(
-                                filename=normalized_name,
-                                media_type=resolved_media_type,
-                                warnings=[str(exc)],
-                            )
+                        fallback_document = self._build_pdf_text_layer_fallback_document(
+                            filename=normalized_name,
+                            media_type=resolved_media_type,
+                            text_layer=text_layer,
+                            render_warning=str(exc),
                        )
+                        if fallback_document is not None:
+                            documents.append(fallback_document)
+                            self._write_cached_document(cache_key, fallback_document)
+                        else:
+                            documents.append(
+                                OcrRecognizeDocumentRead(
+                                    filename=normalized_name,
+                                    media_type=resolved_media_type,
+                                    warnings=[str(exc)],
+                                )
+                            )
                    continue

                source_key = uuid4().hex
@@ -328,6 +338,13 @@ class OcrService:
            while len(cls._result_cache) > OCR_RESULT_CACHE_LIMIT:
                cls._result_cache.popitem(last=False)

+    @classmethod
+    def clear_result_cache(cls) -> int:  # Empty the class-level result cache and report how many entries it held.
+        with cls._cache_lock:  # Size is read and the cache cleared inside the same `with` block.
+            cleared_count = len(cls._result_cache)  # Entry count captured before clearing.
+            cls._result_cache.clear()
+        return cleared_count
+
    @classmethod
    def _resolve_worker_semaphore(cls, limit: int) -> threading.Semaphore:
        normalized_limit = max(1, int(limit or 1))
@@ -425,6 +442,36 @@ class OcrService:
            )
        return descriptors

+    def _build_pdf_text_layer_fallback_document(  # Build an OCR document from the PDF text layer; None if too little text.
+        self,
+        *,
+        filename: str,
+        media_type: str,
+        text_layer: str,
+        render_warning: str,
+    ) -> OcrRecognizeDocumentRead | None:
+        normalized_text = self._normalize_extracted_text(text_layer)
+        if self._meaningful_char_count(normalized_text) < 8:  # Fewer than 8 meaningful characters: no fallback.
+            return None
+
+        aggregated = AggregatedOcrDocument(
+            filename=filename,
+            media_type=media_type,
+            source_key=uuid4().hex,  # Fresh random key generated for each fallback document.
+            page_count=1,  # The fallback always reports one page.
+            warnings=[
+                str(render_warning or "").strip() or "PDF 转图片失败。",  # Caller's message, or a default when blank.
+                "PDF 转图片失败，已使用 PDF 文本层继续抽取识别信息。",
+            ],
+            lines=[
+                OcrRecognizeLineRead(text=line, page_index=0)  # Every line is assigned page index 0.
+                for line in normalized_text.splitlines()
+                if line.strip()  # Whitespace-only lines are dropped.
+            ],
+        )
+        aggregated.text_layer_fragments.append(normalized_text)  # Normalized text is also recorded as a fragment.
+        return self._finalize_document(aggregated)
+
    def _extract_pdf_text_layer(self, pdf_path: Path) -> str:
        try:
            completed = subprocess.run(