diff --git a/server/src/app/schemas/reimbursement.py b/server/src/app/schemas/reimbursement.py index e6306e9..7d9179a 100644 --- a/server/src/app/schemas/reimbursement.py +++ b/server/src/app/schemas/reimbursement.py @@ -54,6 +54,32 @@ class ExpenseClaimAttachmentAnalysisRead(BaseModel): suggestion: str = "" +class ExpenseClaimAttachmentDocumentFieldRead(BaseModel): + key: str + label: str + value: str + + +class ExpenseClaimAttachmentDocumentInfoRead(BaseModel): + document_type: str = "other" + document_type_label: str = "其他单据" + scene_code: str = "other" + scene_label: str = "其他票据" + fields: list[ExpenseClaimAttachmentDocumentFieldRead] = Field(default_factory=list) + + +class ExpenseClaimAttachmentRequirementRead(BaseModel): + matches: bool = False + current_expense_type: str = "other" + current_expense_type_label: str = "其他" + allowed_scene_labels: list[str] = Field(default_factory=list) + recognized_scene_code: str = "other" + recognized_scene_label: str = "其他票据" + recognized_document_type: str = "other" + recognized_document_type_label: str = "其他单据" + message: str = "" + + class ExpenseClaimAttachmentRead(BaseModel): file_name: str storage_key: str @@ -62,6 +88,8 @@ class ExpenseClaimAttachmentRead(BaseModel): uploaded_at: datetime | None = None previewable: bool = True analysis: ExpenseClaimAttachmentAnalysisRead | None = None + document_info: ExpenseClaimAttachmentDocumentInfoRead | None = None + requirement_check: ExpenseClaimAttachmentRequirementRead | None = None class ExpenseClaimItemUpdate(BaseModel): diff --git a/server/src/app/services/expense_claims.py b/server/src/app/services/expense_claims.py index 38198a1..a03bd07 100644 --- a/server/src/app/services/expense_claims.py +++ b/server/src/app/services/expense_claims.py @@ -21,6 +21,7 @@ from app.schemas.ontology import OntologyEntity, OntologyParseResult from app.schemas.reimbursement import ExpenseClaimItemCreate, ExpenseClaimItemUpdate from app.services.agent_foundation import AgentFoundationService from app.services.audit import AuditLogService +from app.services.document_intelligence import build_document_insight from app.services.ocr import OcrService EXPENSE_TYPE_LABELS = { @@ -89,6 +90,18 @@ EXPENSE_TYPE_ALLOWED_DOCUMENT_SCENES = { "training": {"training"}, } +DOCUMENT_SCENE_LABELS = { + "travel": "差旅", + "hotel": "住宿", + "transport": "交通", + "meal": "餐饮", + "entertainment": "业务招待", + "office": "办公用品", + "meeting": "会务", + "training": "培训", + "other": "其他票据", +} + class ExpenseClaimService: def __init__(self, db: Session) -> None: @@ -307,19 +320,28 @@ class ExpenseClaimService: item=item, ) ocr_document = None + document_info = None + requirement_check = None ocr_status = "empty" ocr_error = "" try: - ocr_result = OcrService().recognize_files( + ocr_result = OcrService(self.db).recognize_files( [(normalized_name, content, media_type or "application/octet-stream")] ) documents = list(ocr_result.documents or []) if documents: ocr_document = documents[0] ocr_status = "recognized" + document_info = self._build_attachment_document_info(ocr_document) + requirement_check = self._build_attachment_requirement_check( + item=item, + document_info=document_info, + ) attachment_analysis = self._build_attachment_analysis( document=ocr_document, item=item, + document_info=document_info, + requirement_check=requirement_check, ) except Exception as exc: # pragma: no cover - fallback path depends on OCR runtime ocr_status = "failed" @@ -342,12 +364,21 @@ class ExpenseClaimService: "uploaded_at": datetime.now(UTC).isoformat(), "previewable": self._is_previewable_media_type(media_type, normalized_name), "analysis": attachment_analysis, + "document_info": document_info, + "requirement_check": requirement_check, "ocr_status": ocr_status, "ocr_error": ocr_error, "ocr_text": str(getattr(ocr_document, "text", "") or ""), "ocr_summary": str(getattr(ocr_document, "summary", "") or ""), "ocr_avg_score": float(getattr(ocr_document, "avg_score", 0.0) or 0.0), "ocr_line_count": int(getattr(ocr_document, "line_count", 0) or 0), + "ocr_classification_source": str(getattr(ocr_document, "classification_source", "") or ""), + "ocr_classification_confidence": float(getattr(ocr_document, "classification_confidence", 0.0) or 0.0), + "ocr_classification_evidence": [ + str(item) + for item in getattr(ocr_document, "classification_evidence", []) or [] + if str(item).strip() + ], "ocr_warnings": [str(item) for item in getattr(ocr_document, "warnings", []) or []], } self._write_attachment_meta(file_path, meta) @@ -1129,6 +1160,14 @@ class ExpenseClaimService: if not isinstance(analysis, dict): analysis = None + document_info = metadata.get("document_info") + if not isinstance(document_info, dict): + document_info = None + + requirement_check = metadata.get("requirement_check") + if not isinstance(requirement_check, dict): + requirement_check = None + return { "file_name": str(metadata.get("file_name") or filename), "storage_key": str(item.invoice_id or ""), @@ -1137,6 +1176,8 @@ class ExpenseClaimService: "uploaded_at": uploaded_at, "previewable": bool(metadata.get("previewable", self._is_previewable_media_type(media_type, filename))), "analysis": analysis, + "document_info": document_info, + "requirement_check": requirement_check, } @staticmethod @@ -1153,6 +1194,120 @@ class ExpenseClaimService: def _resolve_attachment_display_name(storage_key: str | None) -> str: return Path(str(storage_key or "").strip()).name + def _build_attachment_document_info(self, document: Any) -> dict[str, Any]: + insight = build_document_insight( + filename=str(getattr(document, "filename", "") or ""), + summary=str(getattr(document, "summary", "") or ""), + text=str(getattr(document, "text", "") or ""), + ) + raw_fields = list(getattr(document, "document_fields", []) or []) + normalized_fields: list[dict[str, str]] = [] + for item in raw_fields: + key = "" + label = "" + value = "" + if isinstance(item, dict): + key = str(item.get("key") or "").strip() + label = str(item.get("label") or "").strip() + value = str(item.get("value") or "").strip() + else: + key = str(getattr(item, "key", "") or "").strip() + label = str(getattr(item, "label", "") or "").strip() + value = str(getattr(item, "value", "") or "").strip() + if key and label and value: + normalized_fields.append( + { + "key": key, + "label": label, + "value": value, + } + ) + + if not normalized_fields: + normalized_fields = [ + { + "key": field.key, + "label": field.label, + "value": field.value, + } + for field in insight.fields + if field.value + ] + + document_type = str(getattr(document, "document_type", "") or "").strip() + if document_type in {"", "other"}: + document_type = insight.document_type + + document_type_label = str(getattr(document, "document_type_label", "") or "").strip() + if not document_type_label or document_type_label == "其他单据": + document_type_label = insight.document_type_label + + scene_code = str(getattr(document, "scene_code", "") or "").strip() + if scene_code in {"", "other"}: + scene_code = insight.scene_code + + scene_label = str(getattr(document, "scene_label", "") or "").strip() + if not scene_label or scene_label == "其他票据": + scene_label = insight.scene_label + + return { + "document_type": document_type, + "document_type_label": document_type_label, + "scene_code": scene_code, + "scene_label": scene_label, + "fields": normalized_fields, + } + + def _build_attachment_requirement_check( + self, + *, + item: ExpenseClaimItem, + document_info: dict[str, Any], + ) -> dict[str, Any]: + expense_type = str(item.item_type or "").strip().lower() or "other" + expense_label = self._resolve_expense_type_label(expense_type) + allowed_scenes = EXPENSE_TYPE_ALLOWED_DOCUMENT_SCENES.get(expense_type, set()) + allowed_scene_labels = [self._resolve_document_scene_label(code) for code in sorted(allowed_scenes)] + recognized_scene_code = str(document_info.get("scene_code") or "other").strip() or "other" + recognized_scene_label = str( + document_info.get("scene_label") or self._resolve_document_scene_label(recognized_scene_code) + ).strip() + recognized_document_type = str(document_info.get("document_type") or "other").strip() or "other" + recognized_document_type_label = str(document_info.get("document_type_label") or "其他单据").strip() or "其他单据" + matches = not allowed_scenes or recognized_scene_code in allowed_scenes + + if matches: + if allowed_scene_labels: + message = ( + f"当前费用项目为{expense_label},已识别为{recognized_document_type_label}," + f"符合当前{expense_label}场景的附件要求。" + ) + else: + message = f"当前费用项目为{expense_label},已识别为{recognized_document_type_label}。" + else: + expected_text = "、".join(label + "相关票据" for label in allowed_scene_labels) or "对应场景票据" + message = ( + f"当前费用项目为{expense_label},要求上传{expected_text};" + f"当前识别为{recognized_document_type_label},不符合当前场景,建议过滤或更换附件。" + ) + + return { + "matches": matches, + "current_expense_type": expense_type, + "current_expense_type_label": expense_label, + "allowed_scene_labels": allowed_scene_labels, + "recognized_scene_code": recognized_scene_code, + "recognized_scene_label": recognized_scene_label, + "recognized_document_type": recognized_document_type, + "recognized_document_type_label": recognized_document_type_label, + "message": message, + } + + @staticmethod + def _resolve_document_scene_label(scene_code: str) -> str: + normalized = str(scene_code or "").strip().lower() + return DOCUMENT_SCENE_LABELS.get(normalized, "其他票据") + @staticmethod def _extract_amount_candidates(text: str) -> list[Decimal]: values: list[Decimal] = [] @@ -1285,7 +1440,14 @@ class ExpenseClaimService: "suggestion": "建议重新上传更清晰的票据图片,或稍后重试识别后再提交。", } - def _build_attachment_analysis(self, *, document: Any, item: ExpenseClaimItem) -> dict[str, Any]: + def _build_attachment_analysis( + self, + *, + document: Any, + item: ExpenseClaimItem, + document_info: dict[str, Any] | None = None, + requirement_check: dict[str, Any] | None = None, + ) -> dict[str, Any]: warnings = [str(value).strip() for value in list(getattr(document, "warnings", []) or []) if str(value).strip()] text = " ".join( [ @@ -1296,11 +1458,19 @@ class ExpenseClaimService: compact_text = text.replace(" ", "") avg_score = float(getattr(document, "avg_score", 0.0) or 0.0) line_count = int(getattr(document, "line_count", 0) or 0) + document_info = document_info or self._build_attachment_document_info(document) + requirement_check = requirement_check or self._build_attachment_requirement_check( + item=item, + document_info=document_info, + ) document_scene_matches = self._detect_expense_scenes(text) purpose_mismatch_point = self._build_purpose_mismatch_point( item=item, document_scenes=set(document_scene_matches.keys()), ) + recognized_document_type = str(document_info.get("document_type") or "other").strip().lower() or "other" + recognized_document_label = str(document_info.get("document_type_label") or "其他单据").strip() or "其他单据" + requirement_matches = bool(requirement_check.get("matches")) has_ticket_keyword = any( keyword in compact_text @@ -1329,8 +1499,8 @@ class ExpenseClaimService: points.append(f"识别提示:{warnings[0]}") if line_count == 0 or not compact_text: points.append("附件内容:未识别到有效文字,当前附件更像普通图片或内容过于模糊。") - if not has_ticket_keyword: - points.append("票据类型:未识别到发票、票据、电子行程单等关键字。") + if recognized_document_type == "other" and not has_ticket_keyword: + points.append("票据类型:未识别到发票、票据、电子行程单等关键字,暂无法判断票据类型。") if not amount_candidates: points.append("金额字段:未识别到可用于核对的金额。") elif amount_mismatch: @@ -1338,6 +1508,8 @@ class ExpenseClaimService: points.append(f"金额字段:附件识别金额 {candidate_text} 元与报销金额 {item_amount} 元不一致。") if not has_date_text: points.append("日期字段:未识别到开票日期或业务发生日期。") + if not requirement_matches: + points.append(f"附件类型要求:{requirement_check.get('message')}") if purpose_mismatch_point: points.append(purpose_mismatch_point) if avg_score and avg_score < 0.72: @@ -1349,9 +1521,10 @@ class ExpenseClaimService: "severity": "pass", "label": "AI提示符合条件", "headline": "AI提示:附件符合基础校验条件", - "summary": "已识别到票据关键字段,附件可继续进入人工复核与报销流程。", + "summary": "已识别到票据类型和关键字段,且符合当前费用场景的附件要求。", "points": [ - "票据类型:已识别到可用于报销核验的票据关键字。", + f"票据类型:已识别为{recognized_document_label}。", + f"附件类型要求:{requirement_check.get('message')}", f"金额字段:已识别到与当前明细接近的金额 {item_amount} 元。", ], "suggestion": "建议继续核对报销分类、费用说明和业务场景是否一致。", @@ -1365,21 +1538,22 @@ class ExpenseClaimService: if ( line_count == 0 or not compact_text - or (not has_ticket_keyword and issue_count >= 2) + or (recognized_document_type == "other" and not has_ticket_keyword and issue_count >= 2) + or not requirement_matches or (purpose_mismatch_point and amount_mismatch) ): severity = "high" label = "高风险" headline = "AI提示:附件不符合票据校验条件" - summary = "当前附件存在明显异常,票据内容与填写信息不一致,或无法作为有效报销材料。" + summary = "当前附件存在明显异常,票据类型与当前费用场景不匹配,或无法作为有效报销材料。" elif purpose_mismatch_point or amount_mismatch or issue_count >= 2 or warnings or (avg_score and avg_score < 0.72): severity = "medium" label = "中风险" headline = "AI提示:附件存在明显待整改项" - summary = "当前附件可见部分内容,但金额、用途、日期或票据类型仍有缺失或不一致。" + summary = "当前附件可见部分内容,但金额、用途、日期或附件类型仍有缺失或不一致。" suggestion = { - "high": "建议重新上传清晰的票据原件,确保包含发票抬头、金额、日期等核心字段。", + "high": "建议过滤当前不匹配的票据,重新上传符合当前费用场景的清晰原件。", "medium": "建议根据风险点补齐清晰票据,或修正金额、日期、费用说明后再提交。", "low": "建议人工再次核对金额和业务说明,确认后可继续流转。", }[severity] @@ -1503,14 +1677,35 @@ class ExpenseClaimService: list(metadata.get("ocr_warnings") or []), ) ): + stored_document_info = metadata.get("document_info") + if not isinstance(stored_document_info, dict): + stored_document_info = {} document = SimpleNamespace( + filename=str(metadata.get("file_name") or file_path.name), text=str(metadata.get("ocr_text") or ""), summary=str(metadata.get("ocr_summary") or ""), avg_score=float(metadata.get("ocr_avg_score") or 0.0), line_count=int(metadata.get("ocr_line_count") or 0), + document_type=str(stored_document_info.get("document_type") or ""), + document_type_label=str(stored_document_info.get("document_type_label") or ""), + scene_code=str(stored_document_info.get("scene_code") or ""), + scene_label=str(stored_document_info.get("scene_label") or ""), + document_fields=list(stored_document_info.get("fields") or []), warnings=[str(value) for value in list(metadata.get("ocr_warnings") or []) if str(value).strip()], ) - analysis = self._build_attachment_analysis(document=document, item=item) + document_info = self._build_attachment_document_info(document) + requirement_check = self._build_attachment_requirement_check( + item=item, + document_info=document_info, + ) + analysis = self._build_attachment_analysis( + document=document, + item=item, + document_info=document_info, + requirement_check=requirement_check, + ) + metadata["document_info"] = document_info + metadata["requirement_check"] = requirement_check else: analysis = self._build_fallback_attachment_analysis(media_type=media_type, item=item) diff --git a/server/tests/test_expense_claim_service.py b/server/tests/test_expense_claim_service.py index 73f3471..b0a5f27 100644 --- a/server/tests/test_expense_claim_service.py +++ b/server/tests/test_expense_claim_service.py @@ -187,6 +187,8 @@ def test_update_claim_item_reanalyzes_existing_attachment(monkeypatch, tmp_path) ) assert uploaded_meta is not None assert uploaded_meta["analysis"]["severity"] == "pass" + assert uploaded_meta["document_info"]["document_type"] == "office_invoice" + assert uploaded_meta["requirement_check"]["matches"] is True updated = service.update_claim_item( claim_id=claim.id, @@ -207,8 +209,9 @@ def test_update_claim_item_reanalyzes_existing_attachment(monkeypatch, tmp_path) current_user=current_user, ) assert refreshed_meta is not None - assert refreshed_meta["analysis"]["severity"] == "medium" - assert any("用途字段" in point for point in refreshed_meta["analysis"]["points"]) + assert refreshed_meta["analysis"]["severity"] == "high" + assert refreshed_meta["requirement_check"]["matches"] is False + assert any("附件类型要求" in point for point in refreshed_meta["analysis"]["points"]) def test_delete_claim_item_removes_row_and_attachment_files(monkeypatch, tmp_path) -> None: diff --git a/server/tests/test_reimbursement_endpoints.py b/server/tests/test_reimbursement_endpoints.py index c7dbc79..be05879 100644 --- a/server/tests/test_reimbursement_endpoints.py +++ b/server/tests/test_reimbursement_endpoints.py @@ -154,6 +154,8 @@ def test_claim_item_attachment_upload_preview_and_delete(monkeypatch, tmp_path) upload_payload = upload_response.json() assert upload_payload["attachment"]["file_name"] == "office-note.png" assert upload_payload["attachment"]["analysis"]["label"] == "AI提示符合条件" + assert upload_payload["attachment"]["document_info"]["document_type"] == "office_invoice" + assert upload_payload["attachment"]["requirement_check"]["matches"] is True assert upload_payload["invoice_id"] meta_response = client.get( @@ -164,6 +166,7 @@ def test_claim_item_attachment_upload_preview_and_delete(monkeypatch, tmp_path) meta_payload = meta_response.json() assert meta_payload["media_type"] == "image/png" assert meta_payload["analysis"]["headline"] + assert meta_payload["document_info"]["fields"][0]["label"] == "金额" content_response = client.get( f"/api/v1/reimbursements/claims/{claim_id}/items/{item_id}/attachment", @@ -228,7 +231,8 @@ def test_claim_item_attachment_upload_flags_purpose_and_amount_mismatch(monkeypa analysis = upload_response.json()["attachment"]["analysis"] assert analysis["severity"] == "high" assert any("金额字段" in point for point in analysis["points"]) - assert any("用途字段" in point for point in analysis["points"]) + assert any("附件类型要求" in point for point in analysis["points"]) + assert upload_response.json()["attachment"]["requirement_check"]["matches"] is False def test_claim_item_attachment_upload_flags_non_invoice_image_as_high_risk(monkeypatch, tmp_path) -> None: