feat(server): 系统缓存清理接口与 OCR 文本层兜底增强
- 新增 system_cache 模块与 POST /settings/cache/clear,管理员可一键清理 OCR 结果/运行时配置/模型失败冷却/知识库索引/地点语义等进程内缓存
- 各服务暴露 clear_*_cache 方法(ocr/runtime_settings/runtime_chat/knowledge/application_location_semantic),SettingsCacheClearRead 汇总清理项
- OCR 转图片失败时尝试用 PDF 文本层兜底构建识别文档(有效字符≥8),并写结果缓存;OcrService 暴露 clear_result_cache
- receipt_folder 车票过滤补充身份证号关键词,附件文档/操作/展示模块同步适配
- 新增 system_cache_endpoints 测试,更新 openapi_schema/ocr/receipt_folder/attachment_association_jobs 测试
This commit is contained in:
@@ -4,7 +4,7 @@ import base64
|
||||
|
||||
from app.api.deps import CurrentUserContext
|
||||
from app.core.config import get_settings
|
||||
from app.schemas.ocr import OcrRecognizeDocumentRead, OcrRecognizeFieldRead
|
||||
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
|
||||
from app.services.document_preview import DocumentPreviewAssets
|
||||
from app.services.receipt_folder import ReceiptFolderService
|
||||
|
||||
@@ -121,6 +121,53 @@ def test_receipt_folder_pdf_save_eagerly_renders_image_preview(monkeypatch, tmp_
|
||||
get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_receipt_folder_persist_enriches_pdf_ocr_document_with_image_preview(monkeypatch, tmp_path) -> None:
    """Persisting a PDF OCR batch yields a document with a receipt id and an image preview.

    The PDF first-page renderer is replaced with a stub that writes placeholder
    bytes to the requested preview path, so no real rendering takes place.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    # Drop cached settings so the patched storage root is read.
    get_settings.cache_clear()
    try:
        uploader = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )

        def _stub_render_first_page(*, pdf_path, preview_path, timeout_seconds):
            # Pretend the page was rendered: write placeholder bytes, return the path.
            preview_path.write_bytes(b"rendered-preview")
            return preview_path

        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", _stub_render_first_page)

        folder_service = ReceiptFolderService()

        pdf_name = "2月23_上海-武汉.pdf"
        pdf_media_type = "application/pdf"
        uploads = [(pdf_name, b"%PDF-1.4 fake", pdf_media_type)]
        recognized_ticket = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type=pdf_media_type,
            text="铁路电子客票 上海虹桥 武汉 G456 354.00",
            summary="铁路电子客票,上海虹桥至武汉。",
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
        )
        ocr_batch = OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[recognized_ticket],
        )

        persisted = folder_service.persist_ocr_batch(
            files=uploads,
            result=ocr_batch,
            current_user=uploader,
        )

        document = persisted.documents[0]
        assert document.receipt_id
        expected_suffix = f"/receipt-folder/{document.receipt_id}/preview"
        assert document.receipt_preview_url.endswith(expected_suffix)
        assert document.preview_kind == "image"
    finally:
        # Clear the settings cache again so the patched value does not linger in it.
        get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
|
||||
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
|
||||
get_settings.cache_clear()
|
||||
@@ -433,6 +480,75 @@ def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -
|
||||
get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_receipt_folder_duplicate_uses_newer_ocr_when_existing_meta_is_weaker(monkeypatch, tmp_path) -> None:
    """Re-uploading identical bytes with richer OCR output upgrades the stored receipt.

    A receipt is first saved with an "other" classification and a conversion
    warning. The same content is then persisted through an OCR batch that
    recognises it as a train ticket. The batch result must reuse the original
    receipt id, carry the richer metadata plus a duplicate-upload warning, and
    the stored receipt must reflect the richer metadata as well.
    """
    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    # Drop cached settings so the patched storage root is read.
    get_settings.cache_clear()
    try:
        uploader = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        folder_service = ReceiptFolderService()

        pdf_name = "2月20_武汉-上海.pdf"
        pdf_media_type = "application/pdf"
        pdf_bytes = b"%PDF-1.7 same train ticket"

        # First upload: weak metadata, as produced when PDF-to-image conversion failed.
        stale_receipt = folder_service.save_receipt(
            filename=pdf_name,
            content=pdf_bytes,
            media_type=pdf_media_type,
            current_user=uploader,
            document=OcrRecognizeDocumentRead(
                filename=pdf_name,
                media_type=pdf_media_type,
                document_type="other",
                document_type_label="其他单据",
                scene_code="other",
                scene_label="其他票据",
                warnings=["PDF 转图片失败:Missing language pack for Adobe-GB1"],
            ),
        )

        # Second upload: identical bytes, but a full train-ticket recognition.
        uploads = [(pdf_name, pdf_bytes, pdf_media_type)]
        ticket_fields = [
            OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
            OcrRecognizeFieldRead(key="trip_no", label="车次/航班", value="G458"),
            OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
        ]
        richer_ticket = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type=pdf_media_type,
            text="G458 Wuhan Shanghaihongqiao 2026 02 20 07:55 票价: 354.00 12306",
            summary="Wuhan Shanghaihongqiao G458 354.00",
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
            document_fields=ticket_fields,
        )
        ocr_batch = OcrRecognizeBatchRead(
            total_file_count=1,
            success_count=1,
            documents=[richer_ticket],
        )
        persisted = folder_service.persist_ocr_batch(
            files=uploads,
            result=ocr_batch,
            current_user=uploader,
        )

        # The batch result points at the original receipt and carries the richer data.
        document = persisted.documents[0]
        assert document.receipt_id == stale_receipt.id
        assert document.document_type == "train_ticket"
        assert document.document_type_label == "火车/高铁票"
        assert any(
            item.label == "金额" and item.value == "354元"
            for item in document.document_fields
        )
        assert any("重复上传" in note for note in document.warnings)

        # The stored receipt itself was repaired, not just the returned view.
        repaired = folder_service.get_receipt(stale_receipt.id, uploader)
        assert repaired.document_type == "train_ticket"
        assert repaired.document_type_label == "火车/高铁票"
        repaired_values = {item.label: item.value for item in repaired.fields}
        assert repaired_values["金额"] == "354元"
    finally:
        # Clear the settings cache again so the patched value does not linger in it.
        get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_receipt_folder_recovers_train_ticket_detail_from_other_english_ocr(monkeypatch, tmp_path) -> None:
|
||||
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
|
||||
get_settings.cache_clear()
|
||||
|
||||
Reference in New Issue
Block a user