feat(server): 系统缓存清理接口与 OCR 文本层兜底增强

- 新增 system_cache 模块与 POST /settings/cache/clear,管理员可一键清理 OCR 结果/运行时配置/模型失败冷却/知识库索引/地点语义等进程内缓存
- 各服务暴露 clear_*_cache 方法(ocr/runtime_settings/runtime_chat/knowledge/application_location_semantic),SettingsCacheClearRead 汇总清理项
- OCR 转图片失败时尝试用 PDF 文本层兜底构建识别文档(有效字符≥8),并写结果缓存;OcrService 暴露 clear_result_cache
- receipt_folder 车票过滤补充身份证号关键词,附件文档/操作/展示模块同步适配
- 新增 system_cache_endpoints 测试,更新 openapi_schema/ocr/receipt_folder/attachment_association_jobs 测试
This commit is contained in:
caoxiaozhu
2026-06-24 12:35:51 +08:00
parent 50d2dc579a
commit 9a5ed0e94a
17 changed files with 932 additions and 13 deletions

View File

@@ -148,13 +148,23 @@ class OcrService:
for item in pdf_inputs:
cache_keys_by_source.setdefault(item.source_key, cache_key)
except RuntimeError as exc:
documents.append(
OcrRecognizeDocumentRead(
filename=normalized_name,
media_type=resolved_media_type,
warnings=[str(exc)],
)
fallback_document = self._build_pdf_text_layer_fallback_document(
filename=normalized_name,
media_type=resolved_media_type,
text_layer=text_layer,
render_warning=str(exc),
)
if fallback_document is not None:
documents.append(fallback_document)
self._write_cached_document(cache_key, fallback_document)
else:
documents.append(
OcrRecognizeDocumentRead(
filename=normalized_name,
media_type=resolved_media_type,
warnings=[str(exc)],
)
)
continue
source_key = uuid4().hex
@@ -328,6 +338,13 @@ class OcrService:
while len(cls._result_cache) > OCR_RESULT_CACHE_LIMIT:
cls._result_cache.popitem(last=False)
@classmethod
def clear_result_cache(cls) -> int:
    """Empty the class-level result cache.

    The count and the clear happen under ``cls._cache_lock`` so the
    reported number matches exactly what was removed.

    Returns:
        The number of entries the cache held immediately before clearing.
    """
    cache_lock = cls._cache_lock
    with cache_lock:
        cache = cls._result_cache
        removed = len(cache)
        cache.clear()
    return removed
@classmethod
def _resolve_worker_semaphore(cls, limit: int) -> threading.Semaphore:
normalized_limit = max(1, int(limit or 1))
@@ -425,6 +442,36 @@ class OcrService:
)
return descriptors
def _build_pdf_text_layer_fallback_document(
    self,
    *,
    filename: str,
    media_type: str,
    text_layer: str,
    render_warning: str,
) -> OcrRecognizeDocumentRead | None:
    """Build a recognition document from the PDF text layer alone.

    Args:
        filename: Name recorded on the resulting document.
        media_type: Media type recorded on the resulting document.
        text_layer: Raw text-layer content; it is normalized before use.
        render_warning: Message describing the rendering failure. A blank
            value is replaced by a default warning string.

    Returns:
        The finalized document, or ``None`` when the normalized text has
        fewer than 8 meaningful characters (per ``_meaningful_char_count``).
    """
    fallback_text = self._normalize_extracted_text(text_layer)
    too_short = self._meaningful_char_count(fallback_text) < 8
    if too_short:
        return None

    # Keep the original render failure first, then explain the fallback.
    primary_warning = str(render_warning or "").strip()
    if not primary_warning:
        primary_warning = "PDF 转图片失败。"
    fallback_notice = "PDF 转图片失败,已使用 PDF 文本层继续抽取识别信息。"

    # One line entry per non-blank text line; all are attributed to page 0.
    line_reads = []
    for raw_line in fallback_text.splitlines():
        if not raw_line.strip():
            continue
        line_reads.append(OcrRecognizeLineRead(text=raw_line, page_index=0))

    document = AggregatedOcrDocument(
        filename=filename,
        media_type=media_type,
        source_key=uuid4().hex,
        page_count=1,
        warnings=[primary_warning, fallback_notice],
        lines=line_reads,
    )
    document.text_layer_fragments.append(fallback_text)
    return self._finalize_document(document)
def _extract_pdf_text_layer(self, pdf_path: Path) -> str:
try:
completed = subprocess.run(