fix(expense): narrow travel route risk indicators

2026-06-17 09:36:24 +08:00
parent 9f7b8b46a3
commit 470f343b29
10 changed files with 1040 additions and 368 deletions
--- a/server/src/app/services/expense_claim_platform_risk.py
+++ b/server/src/app/services/expense_claim_platform_risk.py
@@ -11,11 +11,23 @@ from app.models.financial_record import ExpenseClaim, ExpenseClaimItem
 from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
 from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
 from app.services.budget import BudgetService
+from app.services.expense_claim_platform_context_tools import (
+    collect_attachment_cities,
+    collect_invoice_keys_from_contexts,
+    collect_invoice_keys_from_document_info,
+    count_values,
+    extract_known_cities_from_text,
+    resolve_first_document_field_value,
+)
 from app.services.expense_rule_runtime import (
    RuntimeTravelPolicy,
 )
 from app.services.expense_type_keywords import resolve_expense_type_code_from_text
+from app.services.expense_claim_platform_route_risk import resolve_multi_city_related_item_ids
 from app.services.expense_claim_platform_risk_flag import build_platform_risk_flag
+from app.services.expense_claim_platform_text_risk import (
+    collect_vague_goods_description_evidence,
+)
 from app.services.risk_rule_manifest_classifier import is_budget_risk_manifest
 from app.services.risk_rule_manifest_normalizer import normalize_risk_rule_manifest
 from app.services.risk_rule_template_executor import RiskRuleTemplateExecutor
@@ -24,44 +36,6 @@ from app.services.risk_rule_template_executor import RiskRuleTemplateExecutor
 class ExpenseClaimPlatformRiskMixin:
    _DEFAULT_RISK_BUSINESS_STAGE = "reimbursement"
    _SUPPORTED_RISK_BUSINESS_STAGES = {"expense_application", "reimbursement"}
-    _CLEAR_TRAVEL_DOCUMENT_TYPES = {
-        "flight_itinerary",
-        "train_ticket",
-        "ship_ticket",
-        "hotel_invoice",
-        "taxi_receipt",
-        "parking_toll_receipt",
-    }
-    _CLEAR_TRAVEL_SCENE_CODES = {"travel", "hotel", "transport"}
-    _GOODS_DESCRIPTION_FIELD_KEYS = {
-        "goodsname",
-        "servicename",
-        "itemname",
-        "project",
-        "productname",
-        "description",
-        "content",
-        "expensecontent",
-        "feeitem",
-    }
-    _GOODS_DESCRIPTION_LABEL_TOKENS = (
-        "商品",
-        "服务",
-        "货物",
-        "项目",
-        "品名",
-        "名称",
-        "费用内容",
-        "消费内容",
-    )
-    _VAGUE_KEYWORD_NEGATION_MARKERS = (
-        "不含",
-        "不包含",
-        "不包括",
-        "未包含",
-        "不涉及",
-        "不属于",
-    )

    def evaluate_platform_risk_rules(
        self,
@@ -539,7 +513,7 @@ class ExpenseClaimPlatformRiskMixin:
        policy = self._get_expense_rule_catalog().travel_policy
        if policy is None:
            return None
-        declared_cities = self._extract_known_cities_from_text(
+        declared_cities = extract_known_cities_from_text(
            " ".join(
                [
                    str(claim.location or ""),
@@ -548,7 +522,7 @@ class ExpenseClaimPlatformRiskMixin:
            ),
            policy,
        )
-        evidence_cities = self._collect_attachment_cities(contexts, policy)
+        evidence_cities = collect_attachment_cities(contexts, policy)
        if not declared_cities or not evidence_cities:
            return None
        if set(declared_cities) & set(evidence_cities):
@@ -574,9 +548,9 @@ class ExpenseClaimPlatformRiskMixin:
        claim: ExpenseClaim,
        contexts: list[dict[str, Any]],
    ) -> dict[str, Any] | None:
-        invoice_keys = self._collect_invoice_keys_from_contexts(contexts)
+        invoice_keys = collect_invoice_keys_from_contexts(contexts)
        duplicate_keys = [
-            key for key, count in self._count_values(invoice_keys).items() if count > 1
+            key for key, count in count_values(invoice_keys).items() if count > 1
        ]
        if duplicate_keys:
            return self._build_platform_risk_flag(
@@ -604,7 +578,7 @@ class ExpenseClaimPlatformRiskMixin:
            other_document_info = other_meta.get("document_info")
            if not isinstance(other_document_info, dict):
                continue
-            other_keys = self._collect_invoice_keys_from_document_info(other_document_info)
+            other_keys = collect_invoice_keys_from_document_info(other_document_info)
            if set(invoice_keys) & set(other_keys):
                matched_claim_ids.add(str(other_item.claim_id or ""))

@@ -635,7 +609,7 @@ class ExpenseClaimPlatformRiskMixin:
            return None
        mismatched_buyers: list[str] = []
        for context in contexts:
-            buyer = self._resolve_first_document_field_value(
+            buyer = resolve_first_document_field_value(
                context.get("document_info") or {},
                keys={"buyer_name", "buyer", "purchaser_name", "claimant"},
                labels={"购买方", "抬头", "买方", "购方"},
@@ -667,7 +641,7 @@ class ExpenseClaimPlatformRiskMixin:
        for context in contexts:
            text = " ".join(
                [
-                    self._resolve_first_document_field_value(
+                    resolve_first_document_field_value(
                        context.get("document_info") or {},
                        keys={"date", "issue_date", "invoice_date"},
                        labels={"日期", "开票日期", "发生时间"},
@@ -723,99 +697,16 @@ class ExpenseClaimPlatformRiskMixin:
        keywords: list[str],
        fallback_message: str,
    ) -> dict[str, Any] | None:
-        matched_keywords: list[str] = []
-        matched_fields: list[dict[str, str]] = []
-
-        for context in contexts:
-            document_info = context.get("document_info") or {}
-            if self._is_clear_travel_document(document_info):
-                continue
-
-            field_values = self._collect_goods_description_field_values(document_info)
-            if field_values:
-                for value in field_values:
-                    hits = self._collect_non_negated_keyword_hits(value, keywords)
-                    for keyword in hits:
-                        if keyword not in matched_keywords:
-                            matched_keywords.append(keyword)
-                    if hits:
-                        matched_fields.append(
-                            {
-                                "item_index": str(context.get("index") or ""),
-                                "value": value[:80],
-                            }
-                        )
-                continue
-
-            fallback_text = f"{context.get('ocr_summary') or ''}\n{context.get('ocr_text') or ''}"
-            hits = self._collect_non_negated_keyword_hits(fallback_text, keywords)
-            for keyword in hits:
-                if keyword not in matched_keywords:
-                    matched_keywords.append(keyword)
-            if hits:
-                matched_fields.append(
-                    {
-                        "item_index": str(context.get("index") or ""),
-                        "value": "OCR全文兜底",
-                    }
-                )
-
-        if not matched_keywords:
+        evidence = collect_vague_goods_description_evidence(contexts, keywords)
+        if not evidence:
            return None

        return self._build_platform_risk_flag(
            manifest,
            message=fallback_message,
-            evidence={
-                "matched_keywords": matched_keywords,
-                "matched_fields": matched_fields[:5],
-            },
+            evidence=evidence,
        )

-    @classmethod
-    def _is_clear_travel_document(cls, document_info: dict[str, Any]) -> bool:
-        document_type = str(document_info.get("document_type") or "").strip().lower()
-        scene_code = str(document_info.get("scene_code") or "").strip().lower()
-        return (
-            document_type in cls._CLEAR_TRAVEL_DOCUMENT_TYPES
-            or scene_code in cls._CLEAR_TRAVEL_SCENE_CODES
-        )
-
-    @classmethod
-    def _collect_goods_description_field_values(cls, document_info: dict[str, Any]) -> list[str]:
-        values: list[str] = []
-        for field in list(document_info.get("fields") or []):
-            if not isinstance(field, dict):
-                continue
-            field_key = str(field.get("key") or "").strip().lower().replace("_", "")
-            label = str(field.get("label") or "").replace(" ", "")
-            value = str(field.get("value") or "").strip()
-            if not value:
-                continue
-            if field_key in cls._GOODS_DESCRIPTION_FIELD_KEYS or any(
-                token in label for token in cls._GOODS_DESCRIPTION_LABEL_TOKENS
-            ):
-                values.append(value)
-        return values
-
-    @classmethod
-    def _collect_non_negated_keyword_hits(cls, text: str, keywords: list[str]) -> list[str]:
-        normalized = str(text or "")
-        if not normalized:
-            return []
-
-        hits: list[str] = []
-        for keyword in keywords:
-            if not keyword:
-                continue
-            for match in re.finditer(re.escape(keyword), normalized):
-                window = normalized[max(0, match.start() - 12): match.end() + 12]
-                if any(marker in window for marker in cls._VAGUE_KEYWORD_NEGATION_MARKERS):
-                    continue
-                hits.append(keyword)
-                break
-        return hits
-
    def _evaluate_multi_city_reason_required_risk(
        self,
        manifest: dict[str, Any],
@@ -826,9 +717,9 @@ class ExpenseClaimPlatformRiskMixin:
        policy = self._get_expense_rule_catalog().travel_policy
        if policy is None:
            return None
-        cities = self._collect_attachment_cities(contexts, policy)
+        cities = collect_attachment_cities(contexts, policy)
        for item in list(claim.items or []):
-            for city in self._extract_known_cities_from_text(str(item.item_location or ""), policy):
+            for city in extract_known_cities_from_text(str(item.item_location or ""), policy):
                if city not in cities:
                    cities.append(city)
        if len(cities) <= 2:
@@ -836,13 +727,21 @@ class ExpenseClaimPlatformRiskMixin:
        reason_corpus = self._build_travel_reason_corpus(claim)
        if self._text_contains_keywords(reason_corpus, policy.route_exception_keywords):
            return None
+        related_item_ids, extra_cities = resolve_multi_city_related_item_ids(
+            claim,
+            contexts,
+            policy,
+        )
+        evidence = {"cities": cities[:8]}
+        if extra_cities:
+            evidence["extra_cities"] = extra_cities[:8]
        return self._with_related_item_ids(
            self._build_platform_risk_flag(
                manifest,
                message=f"本次报销识别到多城市行程（{'、'.join(cities[:5])}），但事由中未说明中转、多地拜访或改签原因。",
-                evidence={"cities": cities[:8]},
+                evidence=evidence,
            ),
-            self._context_item_ids(contexts),
+            related_item_ids or self._context_item_ids(contexts),
        )

    def _build_platform_risk_flag(
@@ -882,92 +781,3 @@ class ExpenseClaimPlatformRiskMixin:
        if len(normalized_item_ids) == 1:
            flag["item_id"] = normalized_item_ids[0]
        return flag
-
-    @staticmethod
-    def _count_values(values: list[str]) -> dict[str, int]:
-        counts: dict[str, int] = {}
-        for value in values:
-            normalized = str(value or "").strip()
-            if not normalized:
-                continue
-            counts[normalized] = counts.get(normalized, 0) + 1
-        return counts
-
-    def _collect_invoice_keys_from_contexts(self, contexts: list[dict[str, Any]]) -> list[str]:
-        invoice_keys: list[str] = []
-        for context in contexts:
-            document_info = context.get("document_info") or {}
-            for key in self._collect_invoice_keys_from_document_info(document_info):
-                if key not in invoice_keys:
-                    invoice_keys.append(key)
-        return invoice_keys
-
-    def _collect_invoice_keys_from_document_info(self, document_info: dict[str, Any]) -> list[str]:
-        keys: list[str] = []
-        for field in list(document_info.get("fields") or []):
-            if not isinstance(field, dict):
-                continue
-            field_key = str(field.get("key") or "").strip().lower().replace("_", "")
-            label = str(field.get("label") or "").replace(" ", "")
-            value = str(field.get("value") or "").strip()
-            if not value:
-                continue
-            if field_key in {"invoiceno", "invoicenumber", "number", "code"} or any(
-                token in label for token in ("发票号码", "票号", "发票代码", "号码")
-            ):
-                normalized = re.sub(r"\s+", "", value)
-                if normalized and normalized not in keys:
-                    keys.append(normalized)
-        return keys
-
-    def _collect_attachment_cities(
-        self,
-        contexts: list[dict[str, Any]],
-        policy: RuntimeTravelPolicy,
-    ) -> list[str]:
-        cities: list[str] = []
-        for context in contexts:
-            document_info = context.get("document_info") or {}
-            parts = [
-                str(context.get("ocr_summary") or ""),
-                str(context.get("ocr_text") or ""),
-                str(context.get("item").item_location if context.get("item") is not None else ""),
-            ]
-            for field in list(document_info.get("fields") or []):
-                if isinstance(field, dict):
-                    parts.append(str(field.get("value") or ""))
-            for city in self._extract_known_cities_from_text(" ".join(parts), policy):
-                if city not in cities:
-                    cities.append(city)
-        return cities
-
-    @staticmethod
-    def _extract_known_cities_from_text(text: str, policy: RuntimeTravelPolicy) -> list[str]:
-        normalized = str(text or "").strip()
-        if not normalized:
-            return []
-        cities: list[str] = []
-        for city in sorted(policy.city_tiers.keys(), key=lambda item: len(item), reverse=True):
-            if city in normalized and city not in cities:
-                cities.append(city)
-        return cities
-
-    @staticmethod
-    def _resolve_first_document_field_value(
-        document_info: dict[str, Any],
-        *,
-        keys: set[str],
-        labels: set[str],
-    ) -> str:
-        normalized_keys = {key.replace("_", "").lower() for key in keys}
-        for field in list(document_info.get("fields") or []):
-            if not isinstance(field, dict):
-                continue
-            field_key = str(field.get("key") or "").strip().lower().replace("_", "")
-            label = str(field.get("label") or "").replace(" ", "")
-            value = str(field.get("value") or "").strip()
-            if not value:
-                continue
-            if field_key in normalized_keys or any(token in label for token in labels):
-                return value
-        return ""