refactor: enforce 800 line source limits
This commit is contained in:
@@ -153,53 +153,68 @@ class ExpenseClaimAttachmentOperationsMixin:
|
||||
media_type=media_type,
|
||||
item=item,
|
||||
)
|
||||
source_receipt_document = self._resolve_source_receipt_document(
|
||||
source_receipt_id=source_receipt_id,
|
||||
current_user=current_user,
|
||||
fallback_filename=normalized_name,
|
||||
fallback_media_type=resolved_media_type,
|
||||
)
|
||||
ocr_document = None
|
||||
document_info = None
|
||||
requirement_check = None
|
||||
ocr_status = "empty"
|
||||
ocr_error = ""
|
||||
upload_ocr_document = None
|
||||
try:
|
||||
ocr_result = OcrService(self.db).recognize_files(
|
||||
[(normalized_name, content, media_type or "application/octet-stream")]
|
||||
)
|
||||
documents = list(ocr_result.documents or [])
|
||||
if documents:
|
||||
ocr_document = documents[0]
|
||||
ocr_status = "recognized"
|
||||
document_info = self._build_attachment_document_info(ocr_document)
|
||||
self._backfill_item_type_from_attachment(
|
||||
item=item,
|
||||
document_info=document_info,
|
||||
)
|
||||
self._backfill_item_amount_from_attachment(
|
||||
item=item,
|
||||
document=ocr_document,
|
||||
document_info=document_info,
|
||||
)
|
||||
self._backfill_item_date_from_attachment(
|
||||
item=item,
|
||||
document=ocr_document,
|
||||
document_info=document_info,
|
||||
)
|
||||
self._backfill_item_reason_from_attachment(
|
||||
item=item,
|
||||
document=ocr_document,
|
||||
document_info=document_info,
|
||||
)
|
||||
requirement_check = self._build_attachment_requirement_check(
|
||||
item=item,
|
||||
document_info=document_info,
|
||||
)
|
||||
attachment_analysis = self._build_attachment_analysis(
|
||||
document=ocr_document,
|
||||
item=item,
|
||||
claim=claim,
|
||||
document_info=document_info,
|
||||
requirement_check=requirement_check,
|
||||
)
|
||||
upload_ocr_document = documents[0]
|
||||
except Exception as exc: # pragma: no cover - fallback path depends on OCR runtime
|
||||
ocr_status = "failed"
|
||||
ocr_error = str(exc)
|
||||
|
||||
ocr_document = self._choose_attachment_ocr_document(
|
||||
source_receipt_document=source_receipt_document,
|
||||
upload_ocr_document=upload_ocr_document,
|
||||
)
|
||||
if ocr_document is not None:
|
||||
ocr_status = "recognized"
|
||||
ocr_error = ""
|
||||
document_info = self._build_attachment_document_info(ocr_document)
|
||||
self._backfill_item_type_from_attachment(
|
||||
item=item,
|
||||
document_info=document_info,
|
||||
)
|
||||
self._backfill_item_amount_from_attachment(
|
||||
item=item,
|
||||
document=ocr_document,
|
||||
document_info=document_info,
|
||||
)
|
||||
self._backfill_item_date_from_attachment(
|
||||
item=item,
|
||||
document=ocr_document,
|
||||
document_info=document_info,
|
||||
)
|
||||
self._backfill_item_reason_from_attachment(
|
||||
item=item,
|
||||
document=ocr_document,
|
||||
document_info=document_info,
|
||||
)
|
||||
requirement_check = self._build_attachment_requirement_check(
|
||||
item=item,
|
||||
document_info=document_info,
|
||||
)
|
||||
attachment_analysis = self._build_attachment_analysis(
|
||||
document=ocr_document,
|
||||
item=item,
|
||||
claim=claim,
|
||||
document_info=document_info,
|
||||
requirement_check=requirement_check,
|
||||
)
|
||||
elif ocr_error:
|
||||
ocr_status = "failed"
|
||||
attachment_analysis = self._build_failed_ocr_attachment_analysis(
|
||||
media_type=media_type,
|
||||
error_message=ocr_error,
|
||||
@@ -240,6 +255,7 @@ class ExpenseClaimAttachmentOperationsMixin:
|
||||
if str(item).strip()
|
||||
],
|
||||
"ocr_warnings": [str(item) for item in getattr(ocr_document, "warnings", []) or []],
|
||||
"source_receipt_id": str(source_receipt_id or "").strip(),
|
||||
}
|
||||
self._attachment_storage.write_meta(file_path, meta)
|
||||
ReceiptFolderService().save_linked_attachment(
|
||||
@@ -283,6 +299,143 @@ class ExpenseClaimAttachmentOperationsMixin:
|
||||
"attachment": self._build_attachment_payload(item),
|
||||
}
|
||||
|
||||
def _resolve_source_receipt_document(
|
||||
self,
|
||||
*,
|
||||
source_receipt_id: str,
|
||||
current_user: CurrentUserContext,
|
||||
fallback_filename: str,
|
||||
fallback_media_type: str,
|
||||
) -> SimpleNamespace | None:
|
||||
normalized_receipt_id = str(source_receipt_id or "").strip()
|
||||
if not normalized_receipt_id:
|
||||
return None
|
||||
|
||||
try:
|
||||
receipt = ReceiptFolderService().get_receipt(normalized_receipt_id, current_user)
|
||||
except FileNotFoundError:
|
||||
return None
|
||||
|
||||
raw_meta = receipt.raw_meta if isinstance(receipt.raw_meta, dict) else {}
|
||||
fields = self._normalize_receipt_document_fields(
|
||||
[field.model_dump() for field in list(receipt.fields or [])]
|
||||
)
|
||||
if not fields:
|
||||
fields = self._normalize_receipt_document_fields(raw_meta.get("document_fields"))
|
||||
|
||||
document = SimpleNamespace(
|
||||
filename=str(receipt.file_name or fallback_filename or "").strip(),
|
||||
media_type=str(receipt.media_type or fallback_media_type or "application/octet-stream").strip(),
|
||||
engine=str(receipt.engine or raw_meta.get("engine") or ""),
|
||||
model=str(receipt.model or raw_meta.get("model") or ""),
|
||||
text=str(receipt.ocr_text or raw_meta.get("ocr_text") or ""),
|
||||
summary=str(receipt.summary or raw_meta.get("summary") or ""),
|
||||
avg_score=float(receipt.avg_score or raw_meta.get("ocr_avg_score") or 0.0),
|
||||
line_count=int(receipt.line_count or raw_meta.get("ocr_line_count") or 0),
|
||||
page_count=max(1, int(receipt.page_count or raw_meta.get("page_count") or 1)),
|
||||
document_type=str(receipt.document_type or raw_meta.get("document_type") or "other").strip(),
|
||||
document_type_label=str(
|
||||
receipt.document_type_label or raw_meta.get("document_type_label") or "其他单据"
|
||||
).strip(),
|
||||
scene_code=str(receipt.scene_code or raw_meta.get("scene_code") or "other").strip(),
|
||||
scene_label=str(receipt.scene_label or raw_meta.get("scene_label") or "其他票据").strip(),
|
||||
classification_source=str(raw_meta.get("ocr_classification_source") or "receipt_folder"),
|
||||
classification_confidence=float(
|
||||
receipt.classification_confidence
|
||||
or raw_meta.get("ocr_classification_confidence")
|
||||
or 0.0
|
||||
),
|
||||
classification_evidence=[
|
||||
str(value)
|
||||
for value in list(
|
||||
receipt.classification_evidence
|
||||
or raw_meta.get("ocr_classification_evidence")
|
||||
or []
|
||||
)
|
||||
if str(value).strip()
|
||||
],
|
||||
document_fields=fields,
|
||||
preview_kind=str(raw_meta.get("preview_kind") or ""),
|
||||
preview_data_url="",
|
||||
warnings=[
|
||||
str(value)
|
||||
for value in list(receipt.warnings or raw_meta.get("ocr_warnings") or [])
|
||||
if str(value).strip()
|
||||
],
|
||||
)
|
||||
return document if self._attachment_ocr_signal_score(document) > 0 else None
|
||||
|
||||
@staticmethod
|
||||
def _normalize_receipt_document_fields(raw_fields: Any) -> list[dict[str, str]]:
|
||||
fields: list[dict[str, str]] = []
|
||||
for field in list(raw_fields or []):
|
||||
if isinstance(field, dict):
|
||||
key = str(field.get("key") or "").strip()
|
||||
label = str(field.get("label") or "").strip()
|
||||
value = str(field.get("value") or "").strip()
|
||||
else:
|
||||
key = str(getattr(field, "key", "") or "").strip()
|
||||
label = str(getattr(field, "label", "") or "").strip()
|
||||
value = str(getattr(field, "value", "") or "").strip()
|
||||
if label and value:
|
||||
fields.append({"key": key, "label": label, "value": value})
|
||||
return fields
|
||||
|
||||
@classmethod
|
||||
def _choose_attachment_ocr_document(
|
||||
cls,
|
||||
*,
|
||||
source_receipt_document: Any | None,
|
||||
upload_ocr_document: Any | None,
|
||||
) -> Any | None:
|
||||
source_score = cls._attachment_ocr_signal_score(source_receipt_document)
|
||||
upload_score = cls._attachment_ocr_signal_score(upload_ocr_document)
|
||||
if source_score <= 0:
|
||||
return upload_ocr_document if upload_score > 0 else None
|
||||
if upload_score <= 0:
|
||||
return source_receipt_document
|
||||
|
||||
source_type = cls._attachment_document_type(source_receipt_document)
|
||||
upload_type = cls._attachment_document_type(upload_ocr_document)
|
||||
if source_type not in {"", "other"} and upload_type in {"", "other"}:
|
||||
return source_receipt_document
|
||||
if (
|
||||
source_type == upload_type
|
||||
and cls._attachment_document_field_count(source_receipt_document)
|
||||
> cls._attachment_document_field_count(upload_ocr_document)
|
||||
):
|
||||
return source_receipt_document
|
||||
if source_score > upload_score + 2:
|
||||
return source_receipt_document
|
||||
return upload_ocr_document
|
||||
|
||||
@classmethod
|
||||
def _attachment_ocr_signal_score(cls, document: Any | None) -> int:
|
||||
if document is None:
|
||||
return 0
|
||||
score = 0
|
||||
document_type = cls._attachment_document_type(document)
|
||||
if document_type not in {"", "other"}:
|
||||
score += 4
|
||||
score += min(3, cls._attachment_document_field_count(document))
|
||||
if str(getattr(document, "text", "") or "").strip():
|
||||
score += 2
|
||||
if str(getattr(document, "summary", "") or "").strip():
|
||||
score += 1
|
||||
if int(getattr(document, "line_count", 0) or 0) > 0:
|
||||
score += 1
|
||||
return score
|
||||
|
||||
@staticmethod
|
||||
def _attachment_document_type(document: Any | None) -> str:
|
||||
return str(getattr(document, "document_type", "") or "").strip().lower()
|
||||
|
||||
@staticmethod
|
||||
def _attachment_document_field_count(document: Any | None) -> int:
|
||||
if document is None:
|
||||
return 0
|
||||
return len(list(getattr(document, "document_fields", []) or []))
|
||||
|
||||
def get_claim_item_attachment_meta(
|
||||
self,
|
||||
*,
|
||||
|
||||
Reference in New Issue
Block a user