feat(server): 系统缓存清理接口与 OCR 文本层兜底增强

- 新增 system_cache 模块与 POST /settings/cache/clear,管理员可一键清理 OCR 结果/运行时配置/模型失败冷却/知识库索引/地点语义等进程内缓存
- 各服务暴露 clear_*_cache 方法(ocr/runtime_settings/runtime_chat/knowledge/application_location_semantic),SettingsCacheClearRead 汇总清理项
- OCR 转图片失败时尝试用 PDF 文本层兜底构建识别文档(有效字符≥8),并写结果缓存;OcrService 暴露 clear_result_cache
- receipt_folder 车票过滤补充身份证号关键词,附件文档/操作/展示模块同步适配
- 新增 system_cache_endpoints 测试,更新 openapi_schema/ocr/receipt_folder/attachment_association_jobs 测试
This commit is contained in:
caoxiaozhu
2026-06-24 12:35:51 +08:00
parent 50d2dc579a
commit 9a5ed0e94a
17 changed files with 932 additions and 13 deletions

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import base64
from collections.abc import Generator
from datetime import UTC, date, datetime
from decimal import Decimal
@@ -16,6 +17,7 @@ from app.models.employee import Employee
from app.models.financial_record import ExpenseClaim, ExpenseClaimItem
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead
from app.services.attachment_association_jobs import clear_attachment_association_jobs_for_tests
from app.services.expense_claims import ExpenseClaimService
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
from app.services.ocr import OcrService
from app.services.receipt_folder import ReceiptFolderService
@@ -149,6 +151,13 @@ def fake_ocr_recognize(
)
def fake_ocr_recognize_without_preview(
    self,
    files: list[tuple[str, bytes, str | None]],
) -> OcrRecognizeBatchRead:
    """Stand-in for ``OcrService.recognize_files`` that forwards to ``fake_ocr_recognize``.

    Args:
        self: The service instance the patched method is bound to; passed
            through untouched.
        files: The ``(str, bytes, str | None)`` tuples handed to the OCR
            service; passed through untouched.

    Returns:
        Whatever ``fake_ocr_recognize`` returns for the same arguments.

    NOTE(review): the name suggests the result carries no preview; that
    depends on what ``fake_ocr_recognize`` produces -- confirm there.
    """
    batch = fake_ocr_recognize(self, files)
    return batch
def test_attachment_association_job_links_receipts_after_conversation_exit(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear()
@@ -233,6 +242,233 @@ def test_attachment_association_job_links_receipts_after_conversation_exit(monke
get_settings.cache_clear()
def test_attachment_association_keeps_receipt_folder_preview_and_fields_after_cache_clear(
monkeypatch,
tmp_path,
) -> None:
"""Check that a receipt's preview and fields reach the claim attachment after an OCR cache clear.

A receipt is saved with an image preview data URL and three document
fields, ``OcrService.clear_result_cache()`` is called, and an attachment
association job is submitted for that receipt. The test then asserts that
the attached claim item's metadata reports the image preview, the
``train_ticket`` document type and the saved fields, and that the preview
content read back equals the bytes encoded in the receipt's data URL.
"""
# Known bytes wrapped in a data URL labelled image/png, so the stored
# preview can be compared byte-for-byte at the end of the test.
preview_bytes = b"receipt-folder-preview-png"
preview_data_url = f"data:image/png;base64,{base64.b64encode(preview_bytes).decode('ascii')}"
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
# Settings are cached; clear them so the env var set above is picked up.
get_settings.cache_clear()
clear_attachment_association_jobs_for_tests()
# Replace OCR recognition with the fake and point attachment storage at tmp_path.
monkeypatch.setattr(OcrService, "recognize_files", fake_ocr_recognize_without_preview)
monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path / "attachments")
try:
client, session_factory = build_client(monkeypatch)
current_user = CurrentUserContext(
username="zhangsan@example.com",
name="张三",
role_codes=["user"],
is_admin=False,
employee_no="E10001",
)
with session_factory() as db:
seed_travel_claim(db)
# Save the receipt together with its recognized document: image preview,
# train-ticket classification and three extracted fields.
receipt = ReceiptFolderService().save_receipt(
filename="2月20 武汉-上海.pdf",
content=b"%PDF-1.7 fake-ticket",
media_type="application/pdf",
current_user=current_user,
document=OcrRecognizeDocumentRead(
filename="2月20 武汉-上海.pdf",
media_type="application/pdf",
text="电子发票(铁路电子客票) 武汉站 G458 上海虹桥站 2026年02月20日 07:55开 二等座 票价 354.00",
summary="铁路电子客票,武汉-上海,票价 354 元。",
avg_score=0.96,
line_count=1,
page_count=1,
document_type="train_ticket",
document_type_label="火车/高铁票",
scene_code="travel",
scene_label="差旅票据",
preview_kind="image",
preview_data_url=preview_data_url,
document_fields=[
OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"),
OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
],
),
)
# Clear the OCR result cache before the job is submitted; the assertions
# below must hold without it.
OcrService.clear_result_cache()
headers = {
"x-auth-username": "zhangsan@example.com",
"x-auth-name": "Zhang San",
"x-auth-employee-no": "E10001",
"x-auth-role-codes": "user",
}
response = client.post(
"/api/v1/reimbursements/attachment-association-jobs",
headers=headers,
json={
"receipt_ids": [receipt.id],
"prompt": "请帮我处理已上传的附件。",
"conversation_id": "inline-test",
},
)
assert response.status_code == 202
job_id = response.json()["job_id"]
# The status fetched right after submission is asserted to be "succeeded".
status_response = client.get(
f"/api/v1/reimbursements/attachment-association-jobs/{job_id}",
headers=headers,
)
assert status_response.status_code == 200
assert status_response.json()["status"] == "succeeded"
with session_factory() as db:
claim = db.scalar(
select(ExpenseClaim)
.options(selectinload(ExpenseClaim.items))
.where(ExpenseClaim.id == "claim-bg-association")
)
assert claim is not None
# First claim item with a truthy invoice_id, i.e. the one the job attached to.
attached_item = next(item for item in claim.items if item.invoice_id)
metadata = ExpenseClaimService(db).get_claim_item_attachment_meta(
claim_id=claim.id,
item_id=attached_item.id,
current_user=current_user,
)
assert metadata is not None
assert metadata["preview_kind"] == "image"
assert metadata["document_info"]["document_type"] == "train_ticket"
assert metadata["document_info"]["document_type_label"] == "火车/高铁票"
# Superset check: every expected (label, value) pair must be present;
# additional fields in the metadata are tolerated.
assert {
(field["label"], field["value"])
for field in metadata["document_info"]["fields"]
} >= {
("列车出发时间", "2026-02-20 07:55"),
("行程", "武汉-上海"),
("金额", "354元"),
}
preview_path, media_type, filename = ExpenseClaimService(db).get_claim_item_attachment_preview_content(
claim_id=claim.id,
item_id=attached_item.id,
current_user=current_user,
)
assert media_type == "image/png"
assert filename.endswith(".png")
# The preview file must contain exactly the bytes from the receipt's data URL.
assert preview_path.read_bytes() == preview_bytes
finally:
# Repeat the setup-time clears so later tests do not see this test's state.
clear_attachment_association_jobs_for_tests()
get_settings.cache_clear()
def test_attachment_meta_repairs_existing_pdf_fallback_from_source_receipt(
monkeypatch,
tmp_path,
) -> None:
"""Check that stale PDF-fallback attachment metadata is replaced using the source receipt.

The test writes attachment metadata by hand that describes a PDF preview,
an ``other`` document type and no fields, but carries a
``source_receipt_id`` pointing at a saved receipt with an image preview
and train-ticket fields. It then asserts that
``get_claim_item_attachment_meta`` reports the receipt's preview kind,
document type and fields, and that the preview content read back equals
the bytes encoded in the receipt's data URL.
"""
# Known bytes wrapped in a data URL labelled image/png, compared
# byte-for-byte against the returned preview at the end of the test.
preview_bytes = b"legacy-repaired-preview-png"
preview_data_url = f"data:image/png;base64,{base64.b64encode(preview_bytes).decode('ascii')}"
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
# Settings are cached; clear them so the env var set above is picked up.
get_settings.cache_clear()
monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path / "attachments")
try:
current_user = CurrentUserContext(
username="zhangsan@example.com",
name="张三",
role_codes=["user"],
is_admin=False,
employee_no="E10001",
)
# Only session_factory is used below; the HTTP client is closed right away.
client, session_factory = build_client(monkeypatch)
client.close()
with session_factory() as db:
claim = seed_travel_claim(db)
item = claim.items[0]
# Source receipt holding the "good" data: image preview, train-ticket
# classification and three extracted fields.
receipt = ReceiptFolderService().save_receipt(
filename="2月20 武汉-上海.pdf",
content=b"%PDF-1.7 fake-ticket",
media_type="application/pdf",
current_user=current_user,
document=OcrRecognizeDocumentRead(
filename="2月20 武汉-上海.pdf",
media_type="application/pdf",
text="电子发票(铁路电子客票) 武汉站 G458 上海虹桥站 2026年02月20日 07:55开 二等座 票价 354.00",
summary="铁路电子客票,武汉-上海,票价 354 元。",
avg_score=0.96,
line_count=1,
page_count=1,
document_type="train_ticket",
document_type_label="火车/高铁票",
scene_code="travel",
scene_label="差旅票据",
preview_kind="image",
preview_data_url=preview_data_url,
document_fields=[
OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"),
OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"),
OcrRecognizeFieldRead(key="amount", label="金额", value="354元"),
],
),
)
# Create the attachment file under the patched storage root, in a
# <claim id>/<item id> directory.
attachment_dir = tmp_path / "attachments" / claim.id / item.id
attachment_dir.mkdir(parents=True)
file_path = attachment_dir / "2月20_武汉-上海.pdf"
file_path.write_bytes(b"%PDF-1.7 persisted-but-bad-meta")
storage = ExpenseClaimAttachmentStorage()
item.invoice_id = storage.to_storage_key(file_path)
# Hand-written metadata in the degraded shape: the PDF itself as preview,
# document type "other", empty fields -- linked to the receipt only
# through source_receipt_id.
storage.write_meta(
file_path,
{
"file_name": file_path.name,
"storage_key": storage.to_storage_key(file_path),
"media_type": "application/pdf",
"size_bytes": file_path.stat().st_size,
"previewable": True,
"preview_kind": "pdf",
"preview_storage_key": storage.to_storage_key(file_path),
"preview_media_type": "application/pdf",
"preview_file_name": file_path.name,
"document_info": {
"document_type": "other",
"document_type_label": "其他单据",
"scene_code": "other",
"scene_label": "其他票据",
"fields": [],
},
"source_receipt_id": receipt.id,
},
)
db.commit()
service = ExpenseClaimService(db)
metadata = service.get_claim_item_attachment_meta(
claim_id=claim.id,
item_id=item.id,
current_user=current_user,
)
# The returned metadata must carry the receipt's values, not the
# hand-written ones above.
assert metadata is not None
assert metadata["preview_kind"] == "image"
assert metadata["document_info"]["document_type"] == "train_ticket"
assert metadata["document_info"]["document_type_label"] == "火车/高铁票"
# Superset check: every expected (label, value) pair must be present;
# additional fields in the metadata are tolerated.
assert {
(field["label"], field["value"])
for field in metadata["document_info"]["fields"]
} >= {
("列车出发时间", "2026-02-20 07:55"),
("行程", "武汉-上海"),
("金额", "354元"),
}
preview_path, media_type, filename = service.get_claim_item_attachment_preview_content(
claim_id=claim.id,
item_id=item.id,
current_user=current_user,
)
assert media_type == "image/png"
assert filename.endswith(".png")
# The preview file must contain exactly the bytes from the receipt's data URL.
assert preview_path.read_bytes() == preview_bytes
finally:
# Drop the cached settings so later tests do not see this test's env var.
get_settings.cache_clear()
def test_attachment_association_job_fails_without_editable_claim(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear()