feat(server): 系统缓存清理接口与 OCR 文本层兜底增强
- 新增 system_cache 模块与 POST /settings/cache/clear,管理员可一键清理 OCR 结果/运行时配置/模型失败冷却/知识库索引/地点语义等进程内缓存
- 各服务暴露 clear_*_cache 方法(ocr/runtime_settings/runtime_chat/knowledge/application_location_semantic),SettingsCacheClearRead 汇总清理项
- OCR 转图片失败时尝试用 PDF 文本层兜底构建识别文档(有效字符≥8),并写结果缓存;OcrService 暴露 clear_result_cache
- receipt_folder 车票过滤补充身份证号关键词,附件文档/操作/展示模块同步适配
- 新增 system_cache_endpoints 测试,更新 openapi_schema/ocr/receipt_folder/attachment_association_jobs 测试
This commit is contained in:
@@ -148,13 +148,23 @@ class OcrService:
|
||||
for item in pdf_inputs:
|
||||
cache_keys_by_source.setdefault(item.source_key, cache_key)
|
||||
except RuntimeError as exc:
|
||||
documents.append(
|
||||
OcrRecognizeDocumentRead(
|
||||
filename=normalized_name,
|
||||
media_type=resolved_media_type,
|
||||
warnings=[str(exc)],
|
||||
)
|
||||
fallback_document = self._build_pdf_text_layer_fallback_document(
|
||||
filename=normalized_name,
|
||||
media_type=resolved_media_type,
|
||||
text_layer=text_layer,
|
||||
render_warning=str(exc),
|
||||
)
|
||||
if fallback_document is not None:
|
||||
documents.append(fallback_document)
|
||||
self._write_cached_document(cache_key, fallback_document)
|
||||
else:
|
||||
documents.append(
|
||||
OcrRecognizeDocumentRead(
|
||||
filename=normalized_name,
|
||||
media_type=resolved_media_type,
|
||||
warnings=[str(exc)],
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
source_key = uuid4().hex
|
||||
@@ -328,6 +338,13 @@ class OcrService:
|
||||
while len(cls._result_cache) > OCR_RESULT_CACHE_LIMIT:
|
||||
cls._result_cache.popitem(last=False)
|
||||
|
||||
@classmethod
def clear_result_cache(cls) -> int:
    """Empty the class-level OCR result cache.

    The cache is cleared while holding ``cls._cache_lock``.

    Returns:
        The number of entries the cache held immediately before clearing.
    """
    with cls._cache_lock:
        # Capture the size before clearing so the caller can report it.
        entry_count = len(cls._result_cache)
        cls._result_cache.clear()
    return entry_count
|
||||
|
||||
@classmethod
|
||||
def _resolve_worker_semaphore(cls, limit: int) -> threading.Semaphore:
|
||||
normalized_limit = max(1, int(limit or 1))
|
||||
@@ -425,6 +442,36 @@ class OcrService:
|
||||
)
|
||||
return descriptors
|
||||
|
||||
def _build_pdf_text_layer_fallback_document(
    self,
    *,
    filename: str,
    media_type: str,
    text_layer: str,
    render_warning: str,
) -> OcrRecognizeDocumentRead | None:
    """Build a recognition document from a PDF text layer instead of OCR output.

    The text layer is normalized with ``_normalize_extracted_text``. If
    ``_meaningful_char_count`` reports fewer than 8 characters for it, no
    fallback is produced.

    Args:
        filename: File name recorded on the resulting document.
        media_type: Media type recorded on the resulting document.
        text_layer: Raw text extracted from the PDF.
        render_warning: Message describing the rendering failure; a generic
            message is substituted when it is empty or whitespace-only.

    Returns:
        The result of ``_finalize_document`` for the aggregated fallback
        document, or ``None`` when the text layer is too short to be useful.
    """
    cleaned_text = self._normalize_extracted_text(text_layer)
    if self._meaningful_char_count(cleaned_text) < 8:
        return None

    # Keep the original failure reason first, then note that a fallback ran.
    render_message = str(render_warning or "").strip()
    if not render_message:
        render_message = "PDF 转图片失败。"

    # Every non-blank line is attributed to page 0; the text layer is handled
    # here as a single page (page_count=1 below).
    text_lines = []
    for raw_line in cleaned_text.splitlines():
        if raw_line.strip():
            text_lines.append(OcrRecognizeLineRead(text=raw_line, page_index=0))

    fallback = AggregatedOcrDocument(
        filename=filename,
        media_type=media_type,
        source_key=uuid4().hex,
        page_count=1,
        warnings=[
            render_message,
            "PDF 转图片失败,已使用 PDF 文本层继续抽取识别信息。",
        ],
        lines=text_lines,
    )
    fallback.text_layer_fragments.append(cleaned_text)
    return self._finalize_document(fallback)
|
||||
|
||||
def _extract_pdf_text_layer(self, pdf_path: Path) -> str:
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
|
||||
Reference in New Issue
Block a user