fix(expense): narrow travel route risk indicators

This commit is contained in:
caoxiaozhu
2026-06-17 09:36:24 +08:00
parent 9f7b8b46a3
commit 470f343b29
10 changed files with 1040 additions and 368 deletions

View File

@@ -0,0 +1,97 @@
from __future__ import annotations
import re
from typing import Any
from app.services.expense_rule_runtime import RuntimeTravelPolicy
def count_values(values: list[str]) -> dict[str, int]:
    """Tally how often each non-blank value occurs.

    Every entry is stringified and stripped first (falsy entries such as
    ``None`` become ``""``); entries that end up empty are ignored.  Keys of
    the returned dict follow first-seen order.
    """
    tally: dict[str, int] = {}
    cleaned = (str(raw or "").strip() for raw in values)
    for text in cleaned:
        if text:
            tally[text] = tally.get(text, 0) + 1
    return tally
def collect_invoice_keys_from_contexts(contexts: list[dict[str, Any]]) -> list[str]:
    """Gather invoice keys from every context, unique and in first-seen order.

    Each context's ``document_info`` (a missing or falsy value is treated as an
    empty dict) is passed to ``collect_invoice_keys_from_document_info``.

    NOTE(review): the returned list never holds the same key twice, so counting
    occurrences in it cannot exceed 1 -- confirm that callers using such counts
    to detect duplicates behave as intended.
    """
    ordered: list[str] = []
    seen: set[str] = set()
    for context in contexts:
        info = context.get("document_info") or {}
        for invoice_key in collect_invoice_keys_from_document_info(info):
            if invoice_key in seen:
                continue
            seen.add(invoice_key)
            ordered.append(invoice_key)
    return ordered
def collect_invoice_keys_from_document_info(document_info: dict[str, Any]) -> list[str]:
    """Return the distinct invoice identifiers found in a document's fields.

    A field counts as an invoice identifier when its key (stripped, lower-cased,
    underscores removed) is one of the known invoice key names, or when its
    label (spaces removed) contains one of the invoice label tokens.  Values
    are stripped of all whitespace; empty values and non-dict entries are
    ignored.  Results keep first-seen order.
    """
    key_names = {"invoiceno", "invoicenumber", "number", "code"}
    label_tokens = ("发票号码", "票号", "发票代码", "号码")
    found: list[str] = []
    for entry in document_info.get("fields") or []:
        if not isinstance(entry, dict):
            continue
        raw_value = str(entry.get("value") or "").strip()
        if not raw_value:
            continue
        key_name = str(entry.get("key") or "").strip().lower().replace("_", "")
        label_text = str(entry.get("label") or "").replace(" ", "")
        is_invoice_field = key_name in key_names or any(
            token in label_text for token in label_tokens
        )
        if not is_invoice_field:
            continue
        # Collapse inner whitespace so "12 34" and "1234" compare equal.
        compact = re.sub(r"\s+", "", raw_value)
        if compact and compact not in found:
            found.append(compact)
    return found
def collect_attachment_cities(
    contexts: list[dict[str, Any]],
    policy: RuntimeTravelPolicy,
) -> list[str]:
    """Collect the known cities mentioned by the attachment contexts.

    For every context the OCR summary, OCR text, the item's ``item_location``
    and every document field value are joined into one text that is scanned
    with ``extract_known_cities_from_text``.

    Args:
        contexts: Attachment context dicts; ``document_info`` and ``item`` are
            optional.
        policy: Travel policy passed through to the city scan.

    Returns:
        Distinct city names in first-seen order.
    """
    cities: list[str] = []
    for context in contexts:
        document_info = context.get("document_info") or {}
        item = context.get("item")
        parts = [
            str(context.get("ocr_summary") or ""),
            str(context.get("ocr_text") or ""),
            # ``getattr`` + ``or ""`` keeps a missing or None location from
            # raising or leaking the literal text "None" into the scan.
            str(getattr(item, "item_location", "") or ""),
        ]
        parts.extend(
            str(field.get("value") or "")
            for field in document_info.get("fields") or []
            if isinstance(field, dict)
        )
        for city in extract_known_cities_from_text(" ".join(parts), policy):
            if city not in cities:
                cities.append(city)
    return cities
def extract_known_cities_from_text(text: str, policy: RuntimeTravelPolicy) -> list[str]:
    """List the cities from ``policy.city_tiers`` that occur in ``text``.

    Matching is plain substring containment on the stripped text.  The result
    is ordered by city-name length, longest first (ties keep the order of
    ``policy.city_tiers``), not by position in the text.  Blank or falsy text
    yields an empty list.
    """
    haystack = str(text or "").strip()
    if not haystack:
        return []
    longest_first = sorted(policy.city_tiers.keys(), key=len, reverse=True)
    # dict.fromkeys drops repeats while keeping the first occurrence's position.
    return list(dict.fromkeys(name for name in longest_first if name in haystack))
def resolve_first_document_field_value(
    document_info: dict[str, Any],
    *,
    keys: set[str],
    labels: set[str],
) -> str:
    """Return the first non-empty field value matching a key or a label.

    Args:
        document_info: Document dict whose ``fields`` entry lists field dicts.
        keys: Accepted field keys; compared lower-cased with underscores
            removed.
        labels: Fragments looked for inside the field label after its spaces
            are removed.

    Returns:
        The stripped value of the first matching field, or ``""`` when no
        field matches.
    """
    wanted_keys = {name.replace("_", "").lower() for name in keys}
    for entry in document_info.get("fields") or []:
        if not isinstance(entry, dict):
            continue
        text = str(entry.get("value") or "").strip()
        if not text:
            continue
        entry_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        if entry_key in wanted_keys:
            return text
        entry_label = str(entry.get("label") or "").replace(" ", "")
        if any(token in entry_label for token in labels):
            return text
    return ""

View File

@@ -11,11 +11,23 @@ from app.models.financial_record import ExpenseClaim, ExpenseClaimItem
from app.services.agent_asset_rule_library import AgentAssetRuleLibraryManager
from app.services.agent_asset_spreadsheet import RISK_RULES_LIBRARY
from app.services.budget import BudgetService
from app.services.expense_claim_platform_context_tools import (
collect_attachment_cities,
collect_invoice_keys_from_contexts,
collect_invoice_keys_from_document_info,
count_values,
extract_known_cities_from_text,
resolve_first_document_field_value,
)
from app.services.expense_rule_runtime import (
RuntimeTravelPolicy,
)
from app.services.expense_type_keywords import resolve_expense_type_code_from_text
from app.services.expense_claim_platform_route_risk import resolve_multi_city_related_item_ids
from app.services.expense_claim_platform_risk_flag import build_platform_risk_flag
from app.services.expense_claim_platform_text_risk import (
collect_vague_goods_description_evidence,
)
from app.services.risk_rule_manifest_classifier import is_budget_risk_manifest
from app.services.risk_rule_manifest_normalizer import normalize_risk_rule_manifest
from app.services.risk_rule_template_executor import RiskRuleTemplateExecutor
@@ -24,44 +36,6 @@ from app.services.risk_rule_template_executor import RiskRuleTemplateExecutor
class ExpenseClaimPlatformRiskMixin:
_DEFAULT_RISK_BUSINESS_STAGE = "reimbursement"
_SUPPORTED_RISK_BUSINESS_STAGES = {"expense_application", "reimbursement"}
_CLEAR_TRAVEL_DOCUMENT_TYPES = {
"flight_itinerary",
"train_ticket",
"ship_ticket",
"hotel_invoice",
"taxi_receipt",
"parking_toll_receipt",
}
_CLEAR_TRAVEL_SCENE_CODES = {"travel", "hotel", "transport"}
_GOODS_DESCRIPTION_FIELD_KEYS = {
"goodsname",
"servicename",
"itemname",
"project",
"productname",
"description",
"content",
"expensecontent",
"feeitem",
}
_GOODS_DESCRIPTION_LABEL_TOKENS = (
"商品",
"服务",
"货物",
"项目",
"品名",
"名称",
"费用内容",
"消费内容",
)
_VAGUE_KEYWORD_NEGATION_MARKERS = (
"不含",
"不包含",
"不包括",
"未包含",
"不涉及",
"不属于",
)
def evaluate_platform_risk_rules(
self,
@@ -539,7 +513,7 @@ class ExpenseClaimPlatformRiskMixin:
policy = self._get_expense_rule_catalog().travel_policy
if policy is None:
return None
declared_cities = self._extract_known_cities_from_text(
declared_cities = extract_known_cities_from_text(
" ".join(
[
str(claim.location or ""),
@@ -548,7 +522,7 @@ class ExpenseClaimPlatformRiskMixin:
),
policy,
)
evidence_cities = self._collect_attachment_cities(contexts, policy)
evidence_cities = collect_attachment_cities(contexts, policy)
if not declared_cities or not evidence_cities:
return None
if set(declared_cities) & set(evidence_cities):
@@ -574,9 +548,9 @@ class ExpenseClaimPlatformRiskMixin:
claim: ExpenseClaim,
contexts: list[dict[str, Any]],
) -> dict[str, Any] | None:
invoice_keys = self._collect_invoice_keys_from_contexts(contexts)
invoice_keys = collect_invoice_keys_from_contexts(contexts)
duplicate_keys = [
key for key, count in self._count_values(invoice_keys).items() if count > 1
key for key, count in count_values(invoice_keys).items() if count > 1
]
if duplicate_keys:
return self._build_platform_risk_flag(
@@ -604,7 +578,7 @@ class ExpenseClaimPlatformRiskMixin:
other_document_info = other_meta.get("document_info")
if not isinstance(other_document_info, dict):
continue
other_keys = self._collect_invoice_keys_from_document_info(other_document_info)
other_keys = collect_invoice_keys_from_document_info(other_document_info)
if set(invoice_keys) & set(other_keys):
matched_claim_ids.add(str(other_item.claim_id or ""))
@@ -635,7 +609,7 @@ class ExpenseClaimPlatformRiskMixin:
return None
mismatched_buyers: list[str] = []
for context in contexts:
buyer = self._resolve_first_document_field_value(
buyer = resolve_first_document_field_value(
context.get("document_info") or {},
keys={"buyer_name", "buyer", "purchaser_name", "claimant"},
labels={"购买方", "抬头", "买方", "购方"},
@@ -667,7 +641,7 @@ class ExpenseClaimPlatformRiskMixin:
for context in contexts:
text = " ".join(
[
self._resolve_first_document_field_value(
resolve_first_document_field_value(
context.get("document_info") or {},
keys={"date", "issue_date", "invoice_date"},
labels={"日期", "开票日期", "发生时间"},
@@ -723,99 +697,16 @@ class ExpenseClaimPlatformRiskMixin:
keywords: list[str],
fallback_message: str,
) -> dict[str, Any] | None:
matched_keywords: list[str] = []
matched_fields: list[dict[str, str]] = []
for context in contexts:
document_info = context.get("document_info") or {}
if self._is_clear_travel_document(document_info):
continue
field_values = self._collect_goods_description_field_values(document_info)
if field_values:
for value in field_values:
hits = self._collect_non_negated_keyword_hits(value, keywords)
for keyword in hits:
if keyword not in matched_keywords:
matched_keywords.append(keyword)
if hits:
matched_fields.append(
{
"item_index": str(context.get("index") or ""),
"value": value[:80],
}
)
continue
fallback_text = f"{context.get('ocr_summary') or ''}\n{context.get('ocr_text') or ''}"
hits = self._collect_non_negated_keyword_hits(fallback_text, keywords)
for keyword in hits:
if keyword not in matched_keywords:
matched_keywords.append(keyword)
if hits:
matched_fields.append(
{
"item_index": str(context.get("index") or ""),
"value": "OCR全文兜底",
}
)
if not matched_keywords:
evidence = collect_vague_goods_description_evidence(contexts, keywords)
if not evidence:
return None
return self._build_platform_risk_flag(
manifest,
message=fallback_message,
evidence={
"matched_keywords": matched_keywords,
"matched_fields": matched_fields[:5],
},
evidence=evidence,
)
@classmethod
def _is_clear_travel_document(cls, document_info: dict[str, Any]) -> bool:
document_type = str(document_info.get("document_type") or "").strip().lower()
scene_code = str(document_info.get("scene_code") or "").strip().lower()
return (
document_type in cls._CLEAR_TRAVEL_DOCUMENT_TYPES
or scene_code in cls._CLEAR_TRAVEL_SCENE_CODES
)
@classmethod
def _collect_goods_description_field_values(cls, document_info: dict[str, Any]) -> list[str]:
values: list[str] = []
for field in list(document_info.get("fields") or []):
if not isinstance(field, dict):
continue
field_key = str(field.get("key") or "").strip().lower().replace("_", "")
label = str(field.get("label") or "").replace(" ", "")
value = str(field.get("value") or "").strip()
if not value:
continue
if field_key in cls._GOODS_DESCRIPTION_FIELD_KEYS or any(
token in label for token in cls._GOODS_DESCRIPTION_LABEL_TOKENS
):
values.append(value)
return values
@classmethod
def _collect_non_negated_keyword_hits(cls, text: str, keywords: list[str]) -> list[str]:
normalized = str(text or "")
if not normalized:
return []
hits: list[str] = []
for keyword in keywords:
if not keyword:
continue
for match in re.finditer(re.escape(keyword), normalized):
window = normalized[max(0, match.start() - 12): match.end() + 12]
if any(marker in window for marker in cls._VAGUE_KEYWORD_NEGATION_MARKERS):
continue
hits.append(keyword)
break
return hits
def _evaluate_multi_city_reason_required_risk(
self,
manifest: dict[str, Any],
@@ -826,9 +717,9 @@ class ExpenseClaimPlatformRiskMixin:
policy = self._get_expense_rule_catalog().travel_policy
if policy is None:
return None
cities = self._collect_attachment_cities(contexts, policy)
cities = collect_attachment_cities(contexts, policy)
for item in list(claim.items or []):
for city in self._extract_known_cities_from_text(str(item.item_location or ""), policy):
for city in extract_known_cities_from_text(str(item.item_location or ""), policy):
if city not in cities:
cities.append(city)
if len(cities) <= 2:
@@ -836,13 +727,21 @@ class ExpenseClaimPlatformRiskMixin:
reason_corpus = self._build_travel_reason_corpus(claim)
if self._text_contains_keywords(reason_corpus, policy.route_exception_keywords):
return None
related_item_ids, extra_cities = resolve_multi_city_related_item_ids(
claim,
contexts,
policy,
)
evidence = {"cities": cities[:8]}
if extra_cities:
evidence["extra_cities"] = extra_cities[:8]
return self._with_related_item_ids(
self._build_platform_risk_flag(
manifest,
message=f"本次报销识别到多城市行程({''.join(cities[:5])}),但事由中未说明中转、多地拜访或改签原因。",
evidence={"cities": cities[:8]},
evidence=evidence,
),
self._context_item_ids(contexts),
related_item_ids or self._context_item_ids(contexts),
)
def _build_platform_risk_flag(
@@ -882,92 +781,3 @@ class ExpenseClaimPlatformRiskMixin:
if len(normalized_item_ids) == 1:
flag["item_id"] = normalized_item_ids[0]
return flag
@staticmethod
def _count_values(values: list[str]) -> dict[str, int]:
counts: dict[str, int] = {}
for value in values:
normalized = str(value or "").strip()
if not normalized:
continue
counts[normalized] = counts.get(normalized, 0) + 1
return counts
def _collect_invoice_keys_from_contexts(self, contexts: list[dict[str, Any]]) -> list[str]:
invoice_keys: list[str] = []
for context in contexts:
document_info = context.get("document_info") or {}
for key in self._collect_invoice_keys_from_document_info(document_info):
if key not in invoice_keys:
invoice_keys.append(key)
return invoice_keys
def _collect_invoice_keys_from_document_info(self, document_info: dict[str, Any]) -> list[str]:
keys: list[str] = []
for field in list(document_info.get("fields") or []):
if not isinstance(field, dict):
continue
field_key = str(field.get("key") or "").strip().lower().replace("_", "")
label = str(field.get("label") or "").replace(" ", "")
value = str(field.get("value") or "").strip()
if not value:
continue
if field_key in {"invoiceno", "invoicenumber", "number", "code"} or any(
token in label for token in ("发票号码", "票号", "发票代码", "号码")
):
normalized = re.sub(r"\s+", "", value)
if normalized and normalized not in keys:
keys.append(normalized)
return keys
def _collect_attachment_cities(
self,
contexts: list[dict[str, Any]],
policy: RuntimeTravelPolicy,
) -> list[str]:
cities: list[str] = []
for context in contexts:
document_info = context.get("document_info") or {}
parts = [
str(context.get("ocr_summary") or ""),
str(context.get("ocr_text") or ""),
str(context.get("item").item_location if context.get("item") is not None else ""),
]
for field in list(document_info.get("fields") or []):
if isinstance(field, dict):
parts.append(str(field.get("value") or ""))
for city in self._extract_known_cities_from_text(" ".join(parts), policy):
if city not in cities:
cities.append(city)
return cities
@staticmethod
def _extract_known_cities_from_text(text: str, policy: RuntimeTravelPolicy) -> list[str]:
normalized = str(text or "").strip()
if not normalized:
return []
cities: list[str] = []
for city in sorted(policy.city_tiers.keys(), key=lambda item: len(item), reverse=True):
if city in normalized and city not in cities:
cities.append(city)
return cities
@staticmethod
def _resolve_first_document_field_value(
document_info: dict[str, Any],
*,
keys: set[str],
labels: set[str],
) -> str:
normalized_keys = {key.replace("_", "").lower() for key in keys}
for field in list(document_info.get("fields") or []):
if not isinstance(field, dict):
continue
field_key = str(field.get("key") or "").strip().lower().replace("_", "")
label = str(field.get("label") or "").replace(" ", "")
value = str(field.get("value") or "").strip()
if not value:
continue
if field_key in normalized_keys or any(token in label for token in labels):
return value
return ""

View File

@@ -0,0 +1,244 @@
from __future__ import annotations
from typing import Any
from app.models.financial_record import ExpenseClaim
from app.services.expense_rule_runtime import RuntimeTravelPolicy
def resolve_multi_city_related_item_ids(
    claim: ExpenseClaim,
    contexts: list[dict[str, Any]],
    policy: RuntimeTravelPolicy,
) -> tuple[list[str], list[str]]:
    """Narrow a multi-city risk to the items whose route leaves the baseline.

    The baseline is the first segment's origin plus the expected travel city
    (falling back to the first segment's destination).  Segment destinations
    outside the baseline are the "extra" cities; if none exist, segment
    origins are considered as well.

    Returns:
        ``(item_ids, extra_cities)``.  When no route segment can be extracted,
        every context's item id is returned with no extra cities; when every
        route city is within the baseline, both lists are empty.
    """
    segments = _collect_travel_route_segments(contexts, policy)
    if not segments:
        return _context_item_ids(contexts), []

    opening = segments[0]
    first_origin = str(opening.get("origin") or "").strip()
    first_destination = str(opening.get("destination") or "").strip()
    expected_destination = _resolve_expected_travel_city(claim, contexts, policy)
    baseline = set(
        _unique_text_values([first_origin, expected_destination or first_destination])
    )

    def _outside_baseline(candidates: list[str]) -> list[str]:
        return [city for city in candidates if city and city not in baseline]

    destinations = _unique_text_values(
        [str(segment.get("destination") or "") for segment in segments]
    )
    extra_cities = _outside_baseline(destinations)
    if not extra_cities:
        # No unexpected destination: widen the check to origins as well.
        endpoints: list[str] = []
        for segment in segments:
            endpoints.append(str(segment.get("origin") or ""))
            endpoints.append(str(segment.get("destination") or ""))
        extra_cities = _outside_baseline(_unique_text_values(endpoints))
    if not extra_cities:
        return [], []

    affected_segments = []
    for segment in segments:
        touches_origin = str(segment.get("origin") or "") in extra_cities
        if touches_origin or str(segment.get("destination") or "") in extra_cities:
            affected_segments.append(segment)
    return _route_segment_item_ids(affected_segments), extra_cities
def _collect_travel_route_segments(
    contexts: list[dict[str, Any]],
    policy: RuntimeTravelPolicy,
) -> list[dict[str, Any]]:
    """Build one route segment per long-distance context with a usable route.

    Non-dict contexts, contexts that are not long-distance, and contexts from
    which no origin/destination pair can be extracted are skipped.  Each
    segment dict carries the context's ``item`` plus ``origin`` and
    ``destination``.
    """
    collected: list[dict[str, Any]] = []
    for context in contexts or []:
        if not isinstance(context, dict):
            continue
        if not _is_long_distance_context(context, policy):
            continue
        endpoints = _extract_route_segment(context, policy)
        if endpoints is not None:
            origin, destination = endpoints
            collected.append(
                {
                    "item": context.get("item"),
                    "origin": origin,
                    "destination": destination,
                }
            )
    return collected
def _resolve_expected_travel_city(
    claim: ExpenseClaim,
    contexts: list[dict[str, Any]],
    policy: RuntimeTravelPolicy,
) -> str:
    """Determine the city the trip is expected to go to.

    The claim's ``location`` wins when it names a known city.  Otherwise the
    first hotel context (document type ``hotel_invoice`` or scene code
    ``hotel``) that mentions a known city supplies it.

    Returns:
        The city name, or ``""`` when neither source yields one.
    """
    claim_city = _extract_first_known_city(str(claim.location or ""), policy)
    if claim_city:
        return claim_city
    for context in contexts or []:
        if not isinstance(context, dict):
            continue
        document_info = context.get("document_info")
        # A context without (or with a malformed) document_info cannot be
        # identified as a hotel document; skip it instead of failing on .get.
        if not isinstance(document_info, dict):
            continue
        document_type = str(document_info.get("document_type") or "").strip().lower()
        scene_code = str(document_info.get("scene_code") or "").strip().lower()
        if document_type != "hotel_invoice" and scene_code != "hotel":
            continue
        cities = _extract_context_cities(context, policy)
        if cities:
            return cities[0]
    return ""
def _extract_route_segment(
    context: dict[str, Any],
    policy: RuntimeTravelPolicy,
) -> tuple[str, str] | None:
    """Find an ``(origin, destination)`` city pair for one context.

    Candidate texts are tried in order: the document's route-like field, the
    item's ``item_location`` and ``item_reason``, then the OCR summary and OCR
    text.  For each candidate, every separator it contains is tried in turn:
    the text is split once at the separator's first occurrence and both halves
    must resolve to two different known cities.

    Returns:
        The first pair found, or ``None`` when no candidate yields one.
    """
    # The entries after "-" were empty strings in the reviewed source; an empty
    # separator is "in" every string and makes str.split raise
    # ValueError("empty separator").  The replacements below are common route
    # delimiters -- TODO confirm the intended set against the original file.
    separators = ("-", "—", "→", "至")
    document_info = context.get("document_info") or {}
    item = context.get("item")
    route_value = _resolve_document_field_value(
        document_info,
        keys={"route", "route_cities", "routecities", "travel_route", "trip_route"},
        labels={"路线", "行程", "起讫", "起终", "始发", "到达"},
    )
    candidates = [
        route_value,
        str(getattr(item, "item_location", "") or ""),
        str(getattr(item, "item_reason", "") or ""),
        str(context.get("ocr_summary") or ""),
        str(context.get("ocr_text") or ""),
    ]
    for candidate in candidates:
        normalized = str(candidate or "").strip()
        if not normalized:
            continue
        for separator in separators:
            # Guard against an empty separator so a bad entry can never crash.
            if not separator or separator not in normalized:
                continue
            origin_text, destination_text = [
                segment.strip()
                for segment in normalized.split(separator, 1)
            ]
            origin = _extract_first_known_city(origin_text, policy)
            destination = _extract_first_known_city(destination_text, policy)
            if origin and destination and origin != destination:
                return origin, destination
    return None
def _is_long_distance_context(
    context: dict[str, Any],
    policy: RuntimeTravelPolicy,
) -> bool:
    """Tell whether a context represents long-distance travel.

    True when the document type or the item's ``item_type`` is listed in
    ``policy.long_distance_document_types``, or when the document's scene code
    is ``travel``.  All three codes are compared stripped and lower-cased.
    """
    long_distance_types = set(policy.long_distance_document_types)
    info = context.get("document_info") or {}
    item = context.get("item")

    def _code(raw: Any) -> str:
        return str(raw or "").strip().lower()

    type_codes = (
        _code(info.get("document_type")),
        _code(getattr(item, "item_type", "")),
    )
    if any(code in long_distance_types for code in type_codes):
        return True
    return _code(info.get("scene_code")) == "travel"
def _extract_context_cities(
    context: dict[str, Any],
    policy: RuntimeTravelPolicy,
) -> list[str]:
    """Return the known cities mentioned anywhere in one context.

    The scanned text joins the OCR summary, OCR text, the item's
    ``item_location`` and ``item_reason``, and every document field value, in
    that order, separated by single spaces.
    """
    item = context.get("item")
    info = context.get("document_info") or {}
    field_texts = [
        str(entry.get("value") or "")
        for entry in info.get("fields") or []
        if isinstance(entry, dict)
    ]
    corpus = " ".join(
        [
            str(context.get("ocr_summary") or ""),
            str(context.get("ocr_text") or ""),
            str(getattr(item, "item_location", "") or ""),
            str(getattr(item, "item_reason", "") or ""),
            *field_texts,
        ]
    )
    return _extract_known_cities_from_text(corpus, policy)
def _extract_known_cities_from_text(text: str, policy: RuntimeTravelPolicy) -> list[str]:
    """List the cities from ``policy.city_tiers`` that occur in ``text``.

    Matching is substring containment on the stripped text.  Results are
    ordered by city-name length, longest first (ties keep the order of
    ``policy.city_tiers``), not by where the city appears in the text.
    """
    haystack = str(text or "").strip()
    if not haystack:
        return []
    longest_first = sorted(policy.city_tiers.keys(), key=len, reverse=True)
    # dict.fromkeys removes repeats while preserving first-seen order.
    return list(dict.fromkeys(name for name in longest_first if name in haystack))
def _extract_first_known_city(text: str, policy: RuntimeTravelPolicy) -> str:
    """Return the leading entry of ``_extract_known_cities_from_text``.

    "First" follows that helper's ordering (longest city name first), not the
    position in ``text``.  Returns ``""`` when no known city is present.
    """
    matches = _extract_known_cities_from_text(text, policy)
    if not matches:
        return ""
    return matches[0]
def _resolve_document_field_value(
    document_info: dict[str, Any],
    *,
    keys: set[str],
    labels: set[str],
) -> str:
    """Return the first non-empty field value matching a key or a label.

    Args:
        document_info: Document dict whose ``fields`` entry lists field dicts.
        keys: Accepted field keys; compared lower-cased with underscores
            removed.
        labels: Fragments looked for inside the field label after its spaces
            are removed.

    Returns:
        The stripped value of the first matching field, or ``""`` if none
        matches.
    """
    wanted_keys = {name.replace("_", "").lower() for name in keys}
    for entry in document_info.get("fields") or []:
        if not isinstance(entry, dict):
            continue
        text = str(entry.get("value") or "").strip()
        if not text:
            continue
        entry_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        if entry_key in wanted_keys:
            return text
        entry_label = str(entry.get("label") or "").replace(" ", "")
        if any(token in entry_label for token in labels):
            return text
    return ""
def _route_segment_item_ids(segments: list[dict[str, Any]]) -> list[str]:
    """Return the distinct item ids referenced by the given route segments.

    Each segment's ``item`` contributes ``str(item.id)`` stripped; non-dict
    segments and items with a missing or falsy ``id`` are ignored.  Order is
    first-seen.
    """
    ordered: dict[str, None] = {}
    for segment in segments or []:
        owner = segment.get("item") if isinstance(segment, dict) else None
        identifier = str(getattr(owner, "id", "") or "").strip()
        if identifier:
            # A dict doubles as an insertion-ordered set here.
            ordered.setdefault(identifier, None)
    return list(ordered)
def _context_item_ids(contexts: list[dict[str, Any]]) -> list[str]:
    """Return the distinct item ids of all contexts, in first-seen order.

    Each context's ``item`` contributes ``str(item.id)`` stripped; non-dict
    contexts and items with a missing or falsy ``id`` are ignored.
    """
    identifiers = (
        str(
            getattr(context.get("item") if isinstance(context, dict) else None, "id", "")
            or ""
        ).strip()
        for context in contexts or []
    )
    return list(dict.fromkeys(identifier for identifier in identifiers if identifier))
def _unique_text_values(values: list[str]) -> list[str]:
    """Strip every value and return the distinct non-empty ones in order.

    Falsy entries (``None``, ``""``) are treated as empty and dropped.
    """
    stripped = (str(raw or "").strip() for raw in values or [])
    # filter(None, ...) discards empty strings; dict.fromkeys de-duplicates
    # while keeping the first occurrence's position.
    return list(dict.fromkeys(filter(None, stripped)))

View File

@@ -0,0 +1,136 @@
from __future__ import annotations
import re
from typing import Any
# Document types treated as clearly travel-related; documents of these types
# are skipped by the vague-goods-description scan.
_CLEAR_TRAVEL_DOCUMENT_TYPES = {
"flight_itinerary",
"train_ticket",
"ship_ticket",
"hotel_invoice",
"taxi_receipt",
"parking_toll_receipt",
}
# Scene codes that likewise mark a document as clearly travel-related.
_CLEAR_TRAVEL_SCENE_CODES = {"travel", "hotel", "transport"}
# Field keys whose values count as a goods/service description. Field keys are
# compared after stripping, lower-casing and removing underscores.
_GOODS_DESCRIPTION_FIELD_KEYS = {
"goodsname",
"servicename",
"itemname",
"project",
"productname",
"description",
"content",
"expensecontent",
"feeitem",
}
# Label fragments that mark a field as a goods/service description; matched as
# substrings of the field label after its spaces are removed.
_GOODS_DESCRIPTION_LABEL_TOKENS = (
"商品",
"服务",
"货物",
"项目",
"品名",
"名称",
"费用内容",
"消费内容",
)
# Negation markers: a keyword occurrence is ignored when one of these appears
# in the window spanning 12 characters before and after the match.
_VAGUE_KEYWORD_NEGATION_MARKERS = (
"不含",
"不包含",
"不包括",
"未包含",
"不涉及",
"不属于",
)
def collect_vague_goods_description_evidence(
    contexts: list[dict[str, Any]],
    keywords: list[str],
) -> dict[str, Any] | None:
    """Collect evidence that goods descriptions contain vague keywords.

    Clearly travel-related documents are skipped.  For the remaining contexts
    the goods-description field values are checked for non-negated keyword
    hits; only when a document has no such fields is the OCR summary plus OCR
    text scanned instead.

    Returns:
        ``None`` when no keyword matched; otherwise a dict with
        ``matched_keywords`` (distinct, first-seen order) and
        ``matched_fields`` (at most five entries, each with the context's
        ``index`` as ``item_index`` and the matching value truncated to 80
        characters, or a fixed marker for OCR fallback hits).
    """
    seen_keywords: list[str] = []
    field_evidence: list[dict[str, str]] = []

    def _record(context: dict[str, Any], hits: list[str], shown_value: str) -> None:
        for keyword in hits:
            if keyword not in seen_keywords:
                seen_keywords.append(keyword)
        if hits:
            field_evidence.append(
                {
                    "item_index": str(context.get("index") or ""),
                    "value": shown_value,
                }
            )

    for context in contexts:
        info = context.get("document_info") or {}
        if _is_clear_travel_document(info):
            continue
        descriptions = _collect_goods_description_field_values(info)
        if not descriptions:
            # No structured description available: fall back to the raw OCR.
            ocr_corpus = f"{context.get('ocr_summary') or ''}\n{context.get('ocr_text') or ''}"
            _record(
                context,
                _collect_non_negated_keyword_hits(ocr_corpus, keywords),
                "OCR全文兜底",
            )
            continue
        for description in descriptions:
            _record(
                context,
                _collect_non_negated_keyword_hits(description, keywords),
                description[:80],
            )

    if not seen_keywords:
        return None
    return {
        "matched_keywords": seen_keywords,
        "matched_fields": field_evidence[:5],
    }
def _is_clear_travel_document(document_info: dict[str, Any]) -> bool:
    """Tell whether the document is clearly travel-related.

    True when its ``document_type`` is in ``_CLEAR_TRAVEL_DOCUMENT_TYPES`` or
    its ``scene_code`` is in ``_CLEAR_TRAVEL_SCENE_CODES``; both values are
    compared stripped and lower-cased.
    """

    def _code(field_name: str) -> str:
        return str(document_info.get(field_name) or "").strip().lower()

    if _code("document_type") in _CLEAR_TRAVEL_DOCUMENT_TYPES:
        return True
    return _code("scene_code") in _CLEAR_TRAVEL_SCENE_CODES
def _collect_goods_description_field_values(document_info: dict[str, Any]) -> list[str]:
    """Return the values of all goods/service description fields.

    A field qualifies when its key (stripped, lower-cased, underscores removed)
    is in ``_GOODS_DESCRIPTION_FIELD_KEYS`` or its label (spaces removed)
    contains one of ``_GOODS_DESCRIPTION_LABEL_TOKENS``.  Values are stripped;
    empty values and non-dict entries are skipped.  Repeated values are kept.
    """
    descriptions: list[str] = []
    for entry in document_info.get("fields") or []:
        if not isinstance(entry, dict):
            continue
        text = str(entry.get("value") or "").strip()
        if not text:
            continue
        entry_key = str(entry.get("key") or "").strip().lower().replace("_", "")
        entry_label = str(entry.get("label") or "").replace(" ", "")
        key_matches = entry_key in _GOODS_DESCRIPTION_FIELD_KEYS
        if key_matches or any(
            token in entry_label for token in _GOODS_DESCRIPTION_LABEL_TOKENS
        ):
            descriptions.append(text)
    return descriptions
def _collect_non_negated_keyword_hits(text: str, keywords: list[str]) -> list[str]:
    """Return the keywords that occur in ``text`` without a nearby negation.

    An occurrence is discarded when any of ``_VAGUE_KEYWORD_NEGATION_MARKERS``
    appears in the window from 12 characters before the match to 12 characters
    after it.  A keyword is reported once per entry in ``keywords`` as soon as
    one of its occurrences survives; empty keywords are skipped.
    """
    haystack = str(text or "")
    if not haystack:
        return []

    def _has_clean_occurrence(keyword: str) -> bool:
        for found in re.finditer(re.escape(keyword), haystack):
            window_start = max(0, found.start() - 12)
            nearby = haystack[window_start: found.end() + 12]
            if not any(marker in nearby for marker in _VAGUE_KEYWORD_NEGATION_MARKERS):
                return True
        return False

    return [
        keyword
        for keyword in keywords
        if keyword and _has_clean_occurrence(keyword)
    ]