feat(server): 系统缓存清理接口与 OCR 文本层兜底增强

- 新增 system_cache 模块与 POST /settings/cache/clear,管理员可一键清理 OCR 结果/运行时配置/模型失败冷却/知识库索引/地点语义等进程内缓存
- 各服务暴露 clear_*_cache 方法(ocr/runtime_settings/runtime_chat/knowledge/application_location_semantic),SettingsCacheClearRead 汇总清理项
- OCR 转图片失败时尝试用 PDF 文本层兜底构建识别文档(有效字符≥8),并写结果缓存;OcrService 暴露 clear_result_cache
- receipt_folder 车票过滤补充身份证号关键词,附件文档/操作/展示模块同步适配
- 新增 system_cache_endpoints 测试,更新 openapi_schema/ocr/receipt_folder/attachment_association_jobs 测试
This commit is contained in:
caoxiaozhu
2026-06-24 12:35:51 +08:00
parent 50d2dc579a
commit 9a5ed0e94a
17 changed files with 932 additions and 13 deletions

View File

@@ -4,7 +4,7 @@ import base64
from app.api.deps import CurrentUserContext
from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeDocumentRead, OcrRecognizeFieldRead
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
from app.services.document_preview import DocumentPreviewAssets
from app.services.receipt_folder import ReceiptFolderService
@@ -121,6 +121,53 @@ def test_receipt_folder_pdf_save_eagerly_renders_image_preview(monkeypatch, tmp_
get_settings.cache_clear()
def test_receipt_folder_persist_enriches_pdf_ocr_document_with_image_preview(monkeypatch, tmp_path) -> None:
    """Persist an OCR batch for a PDF and check the returned document's receipt/preview fields.

    The PDF first-page renderer on ``DocumentPreviewAssets`` is swapped for a
    stub that only writes placeholder bytes to the requested preview path.
    The test then asserts that the persisted document carries a receipt id,
    a preview URL built from that id, and an ``"image"`` preview kind.
    """
    storage_root = tmp_path / "storage"
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(storage_root))
    # Reset the settings cache before the test body runs
    # (presumably so the patched STORAGE_ROOT_DIR takes effect - confirm).
    get_settings.cache_clear()
    try:
        user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )

        def stub_render_first_page(*, pdf_path, preview_path, timeout_seconds):
            # Stand-in renderer: write fixed bytes instead of rendering the PDF.
            preview_path.write_bytes(b"rendered-preview")
            return preview_path

        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", stub_render_first_page)

        folder_service = ReceiptFolderService()
        recognized = OcrRecognizeDocumentRead(
            filename="2月23_上海-武汉.pdf",
            media_type="application/pdf",
            text="铁路电子客票 上海虹桥 武汉 G456 354.00",
            summary="铁路电子客票,上海虹桥至武汉。",
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
        )
        ocr_batch = OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized],
        )
        uploads = [("2月23_上海-武汉.pdf", b"%PDF-1.4 fake", "application/pdf")]

        persisted_batch = folder_service.persist_ocr_batch(
            files=uploads,
            result=ocr_batch,
            current_user=user,
        )

        persisted = persisted_batch.documents[0]
        assert persisted.receipt_id
        expected_suffix = f"/receipt-folder/{persisted.receipt_id}/preview"
        assert persisted.receipt_preview_url.endswith(expected_suffix)
        assert persisted.preview_kind == "image"
    finally:
        # Always reset the settings cache again on the way out.
        get_settings.cache_clear()
def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear()
@@ -433,6 +480,75 @@ def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -
get_settings.cache_clear()
def test_receipt_folder_duplicate_uses_newer_ocr_when_existing_meta_is_weaker(monkeypatch, tmp_path) -> None:
    """Re-uploading identical bytes with richer OCR data should upgrade the stored receipt.

    A receipt is first saved with an "other" classification and a conversion
    warning. The same content is then persisted through an OCR batch that
    classifies it as a train ticket with extracted fields. The test asserts
    that the batch result points at the original receipt id, reports the
    train-ticket classification and amount field, carries a duplicate-upload
    warning, and that the stored receipt read back afterwards shows the
    train-ticket data.
    """
    storage_root = tmp_path / "storage"
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(storage_root))
    # Reset the settings cache before the test body runs
    # (presumably so the patched STORAGE_ROOT_DIR takes effect - confirm).
    get_settings.cache_clear()
    try:
        user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        folder_service = ReceiptFolderService()
        pdf_bytes = b"%PDF-1.7 same train ticket"

        # First upload: weak metadata, classified as "other".
        weak_document = OcrRecognizeDocumentRead(
            filename="2月20_武汉-上海.pdf",
            media_type="application/pdf",
            document_type="other",
            document_type_label="其他单据",
            scene_code="other",
            scene_label="其他票据",
            warnings=["PDF 转图片失败Missing language pack for Adobe-GB1"],
        )
        stale_receipt = folder_service.save_receipt(
            filename="2月20_武汉-上海.pdf",
            content=pdf_bytes,
            media_type="application/pdf",
            current_user=user,
            document=weak_document,
        )

        # Second upload: same bytes, now with a train-ticket classification and fields.
        ticket_fields = [
            OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
            OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"),
            OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
        ]
        strong_document = OcrRecognizeDocumentRead(
            filename="2月20_武汉-上海.pdf",
            media_type="application/pdf",
            text="G458 Wuhan Shanghaihongqiao 2026 02 20 07:55 票价: 354.00 12306",
            summary="Wuhan Shanghaihongqiao G458 354.00",
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
            document_fields=ticket_fields,
        )
        ocr_batch = OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[strong_document],
        )
        persisted_batch = folder_service.persist_ocr_batch(
            files=[("2月20_武汉-上海.pdf", pdf_bytes, "application/pdf")],
            result=ocr_batch,
            current_user=user,
        )

        persisted = persisted_batch.documents[0]
        assert persisted.receipt_id == stale_receipt.id
        assert persisted.document_type == "train_ticket"
        assert persisted.document_type_label == "火车/高铁票"
        assert any(
            item.label == "金额" and item.value == "354元"
            for item in persisted.document_fields
        )
        assert any("重复上传" in note for note in persisted.warnings)

        # Read the stored receipt back and check its classification and amount.
        repaired = folder_service.get_receipt(stale_receipt.id, user)
        assert repaired.document_type == "train_ticket"
        assert repaired.document_type_label == "火车/高铁票"
        values_by_label = {item.label: item.value for item in repaired.fields}
        assert values_by_label["金额"] == "354元"
    finally:
        # Always reset the settings cache again on the way out.
        get_settings.cache_clear()
def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear()