refactor: enforce 800 line source limits

This commit is contained in:
caoxiaozhu
2026-06-22 11:58:53 +08:00
parent 08a4fa3577
commit 6d33ba5742
150 changed files with 27413 additions and 23791 deletions

View File

@@ -153,53 +153,68 @@ class ExpenseClaimAttachmentOperationsMixin:
media_type=media_type,
item=item,
)
source_receipt_document = self._resolve_source_receipt_document(
source_receipt_id=source_receipt_id,
current_user=current_user,
fallback_filename=normalized_name,
fallback_media_type=resolved_media_type,
)
ocr_document = None
document_info = None
requirement_check = None
ocr_status = "empty"
ocr_error = ""
upload_ocr_document = None
try:
ocr_result = OcrService(self.db).recognize_files(
[(normalized_name, content, media_type or "application/octet-stream")]
)
documents = list(ocr_result.documents or [])
if documents:
ocr_document = documents[0]
ocr_status = "recognized"
document_info = self._build_attachment_document_info(ocr_document)
self._backfill_item_type_from_attachment(
item=item,
document_info=document_info,
)
self._backfill_item_amount_from_attachment(
item=item,
document=ocr_document,
document_info=document_info,
)
self._backfill_item_date_from_attachment(
item=item,
document=ocr_document,
document_info=document_info,
)
self._backfill_item_reason_from_attachment(
item=item,
document=ocr_document,
document_info=document_info,
)
requirement_check = self._build_attachment_requirement_check(
item=item,
document_info=document_info,
)
attachment_analysis = self._build_attachment_analysis(
document=ocr_document,
item=item,
claim=claim,
document_info=document_info,
requirement_check=requirement_check,
)
upload_ocr_document = documents[0]
except Exception as exc: # pragma: no cover - fallback path depends on OCR runtime
ocr_status = "failed"
ocr_error = str(exc)
ocr_document = self._choose_attachment_ocr_document(
source_receipt_document=source_receipt_document,
upload_ocr_document=upload_ocr_document,
)
if ocr_document is not None:
ocr_status = "recognized"
ocr_error = ""
document_info = self._build_attachment_document_info(ocr_document)
self._backfill_item_type_from_attachment(
item=item,
document_info=document_info,
)
self._backfill_item_amount_from_attachment(
item=item,
document=ocr_document,
document_info=document_info,
)
self._backfill_item_date_from_attachment(
item=item,
document=ocr_document,
document_info=document_info,
)
self._backfill_item_reason_from_attachment(
item=item,
document=ocr_document,
document_info=document_info,
)
requirement_check = self._build_attachment_requirement_check(
item=item,
document_info=document_info,
)
attachment_analysis = self._build_attachment_analysis(
document=ocr_document,
item=item,
claim=claim,
document_info=document_info,
requirement_check=requirement_check,
)
elif ocr_error:
ocr_status = "failed"
attachment_analysis = self._build_failed_ocr_attachment_analysis(
media_type=media_type,
error_message=ocr_error,
@@ -240,6 +255,7 @@ class ExpenseClaimAttachmentOperationsMixin:
if str(item).strip()
],
"ocr_warnings": [str(item) for item in getattr(ocr_document, "warnings", []) or []],
"source_receipt_id": str(source_receipt_id or "").strip(),
}
self._attachment_storage.write_meta(file_path, meta)
ReceiptFolderService().save_linked_attachment(
@@ -283,6 +299,143 @@ class ExpenseClaimAttachmentOperationsMixin:
"attachment": self._build_attachment_payload(item),
}
def _resolve_source_receipt_document(
self,
*,
source_receipt_id: str,
current_user: CurrentUserContext,
fallback_filename: str,
fallback_media_type: str,
) -> SimpleNamespace | None:
normalized_receipt_id = str(source_receipt_id or "").strip()
if not normalized_receipt_id:
return None
try:
receipt = ReceiptFolderService().get_receipt(normalized_receipt_id, current_user)
except FileNotFoundError:
return None
raw_meta = receipt.raw_meta if isinstance(receipt.raw_meta, dict) else {}
fields = self._normalize_receipt_document_fields(
[field.model_dump() for field in list(receipt.fields or [])]
)
if not fields:
fields = self._normalize_receipt_document_fields(raw_meta.get("document_fields"))
document = SimpleNamespace(
filename=str(receipt.file_name or fallback_filename or "").strip(),
media_type=str(receipt.media_type or fallback_media_type or "application/octet-stream").strip(),
engine=str(receipt.engine or raw_meta.get("engine") or ""),
model=str(receipt.model or raw_meta.get("model") or ""),
text=str(receipt.ocr_text or raw_meta.get("ocr_text") or ""),
summary=str(receipt.summary or raw_meta.get("summary") or ""),
avg_score=float(receipt.avg_score or raw_meta.get("ocr_avg_score") or 0.0),
line_count=int(receipt.line_count or raw_meta.get("ocr_line_count") or 0),
page_count=max(1, int(receipt.page_count or raw_meta.get("page_count") or 1)),
document_type=str(receipt.document_type or raw_meta.get("document_type") or "other").strip(),
document_type_label=str(
receipt.document_type_label or raw_meta.get("document_type_label") or "其他单据"
).strip(),
scene_code=str(receipt.scene_code or raw_meta.get("scene_code") or "other").strip(),
scene_label=str(receipt.scene_label or raw_meta.get("scene_label") or "其他票据").strip(),
classification_source=str(raw_meta.get("ocr_classification_source") or "receipt_folder"),
classification_confidence=float(
receipt.classification_confidence
or raw_meta.get("ocr_classification_confidence")
or 0.0
),
classification_evidence=[
str(value)
for value in list(
receipt.classification_evidence
or raw_meta.get("ocr_classification_evidence")
or []
)
if str(value).strip()
],
document_fields=fields,
preview_kind=str(raw_meta.get("preview_kind") or ""),
preview_data_url="",
warnings=[
str(value)
for value in list(receipt.warnings or raw_meta.get("ocr_warnings") or [])
if str(value).strip()
],
)
return document if self._attachment_ocr_signal_score(document) > 0 else None
@staticmethod
def _normalize_receipt_document_fields(raw_fields: Any) -> list[dict[str, str]]:
fields: list[dict[str, str]] = []
for field in list(raw_fields or []):
if isinstance(field, dict):
key = str(field.get("key") or "").strip()
label = str(field.get("label") or "").strip()
value = str(field.get("value") or "").strip()
else:
key = str(getattr(field, "key", "") or "").strip()
label = str(getattr(field, "label", "") or "").strip()
value = str(getattr(field, "value", "") or "").strip()
if label and value:
fields.append({"key": key, "label": label, "value": value})
return fields
@classmethod
def _choose_attachment_ocr_document(
    cls,
    *,
    source_receipt_document: Any | None,
    upload_ocr_document: Any | None,
) -> Any | None:
    """Pick which OCR document to use for an attachment.

    The candidates are the document rebuilt from the source receipt and the
    document produced by OCR on the uploaded file. Rules, in order:

    1. If only one candidate has a positive signal score, use it; if neither
       does, return ``None``.
    2. Prefer the receipt when it has a specific type and the upload's type
       is blank or ``"other"``.
    3. Prefer the receipt when both share a type and it has more fields.
    4. Prefer the receipt when its score beats the upload's by more than 2.
    5. Otherwise use the upload.
    """
    receipt_doc = source_receipt_document
    uploaded_doc = upload_ocr_document
    receipt_score = cls._attachment_ocr_signal_score(receipt_doc)
    uploaded_score = cls._attachment_ocr_signal_score(uploaded_doc)

    if receipt_score <= 0:
        if uploaded_score > 0:
            return uploaded_doc
        return None
    if uploaded_score <= 0:
        return receipt_doc

    # Both candidates carry signal from here on; compare their quality.
    generic_types = {"", "other"}
    receipt_type = cls._attachment_document_type(receipt_doc)
    uploaded_type = cls._attachment_document_type(uploaded_doc)

    if receipt_type not in generic_types and uploaded_type in generic_types:
        return receipt_doc

    if receipt_type == uploaded_type:
        receipt_fields = cls._attachment_document_field_count(receipt_doc)
        uploaded_fields = cls._attachment_document_field_count(uploaded_doc)
        if receipt_fields > uploaded_fields:
            return receipt_doc

    if uploaded_score + 2 < receipt_score:
        return receipt_doc
    return uploaded_doc
@classmethod
def _attachment_ocr_signal_score(cls, document: Any | None) -> int:
    """Score how much usable OCR information a document carries.

    Weights: 4 for a document type other than blank/``"other"``, one point
    per extracted field capped at 3, 2 for non-blank ``text``, 1 for
    non-blank ``summary`` and 1 for a positive ``line_count``. ``None``
    scores 0.
    """
    if document is None:
        return 0

    has_specific_type = cls._attachment_document_type(document) not in {"", "other"}
    points = 4 if has_specific_type else 0
    points += min(3, cls._attachment_document_field_count(document))

    # Free-text attributes count only when they hold something besides whitespace.
    for attribute, weight in (("text", 2), ("summary", 1)):
        if str(getattr(document, attribute, "") or "").strip():
            points += weight

    if int(getattr(document, "line_count", 0) or 0) > 0:
        points += 1
    return points
@staticmethod
def _attachment_document_type(document: Any | None) -> str:
return str(getattr(document, "document_type", "") or "").strip().lower()
@staticmethod
def _attachment_document_field_count(document: Any | None) -> int:
if document is None:
return 0
return len(list(getattr(document, "document_fields", []) or []))
def get_claim_item_attachment_meta(
self,
*,