refactor(server): user_agent/steward/ocr 等服务重构并适配关联任务

- user_agent 拆分 application/locations/knowledge/response/review 五个子模块,接入申请位置语义与关联草稿分支
- steward planner/runtime/slot/plan_builder 决策链路重构,travel_reimbursement_calculator/orchestrator_expense_query 适配
- ocr/document_preview/document_intelligence/receipt_folder 复用预览与资产缓存,expense_claim_draft_flow/application_handoff 适配
- pyproject.toml 新增依赖,paddleocr bootstrap 脚本与 server_start.sh 调整
- 更新差旅/交通/通信等财务规则表,同步 document_intelligence/ocr/receipt_folder/user_agent 等测试
This commit is contained in:
caoxiaozhu
2026-06-24 10:42:24 +08:00
parent 332f77389d
commit 0264a4b5b4
41 changed files with 1273 additions and 182 deletions

View File

@@ -19,6 +19,7 @@ dependencies = [
"python-dotenv>=1.0.1,<2.0.0", "python-dotenv>=1.0.1,<2.0.0",
"email-validator>=2.2.0,<3.0.0", "email-validator>=2.2.0,<3.0.0",
"python-multipart>=0.0.20,<1.0.0", "python-multipart>=0.0.20,<1.0.0",
"jieba>=0.42.1,<0.43.0",
"openpyxl>=3.1.5,<4.0.0", "openpyxl>=3.1.5,<4.0.0",
"lightrag-hku>=1.4.16,<1.5.0", "lightrag-hku>=1.4.16,<1.5.0",
"qdrant-client>=1.18.0,<2.0.0", "qdrant-client>=1.18.0,<2.0.0",

View File

@@ -14,7 +14,7 @@ if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
fi fi
apt-get update apt-get update
apt-get install -y --no-install-recommends libgl1 libglib2.0-0 poppler-utils poppler-data apt-get install -y --no-install-recommends libgl1 libglib2.0-0 poppler-utils poppler-data mupdf-tools
rm -rf "${OCR_VENV_DIR}" rm -rf "${OCR_VENV_DIR}"
"${PYTHON_BIN}" -m venv "${OCR_VENV_DIR}" "${PYTHON_BIN}" -m venv "${OCR_VENV_DIR}"

View File

@@ -13,7 +13,7 @@ if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
fi fi
apt-get update apt-get update
apt-get install -y --no-install-recommends libgl1 libglib2.0-0 poppler-utils poppler-data apt-get install -y --no-install-recommends libgl1 libglib2.0-0 poppler-utils poppler-data mupdf-tools
"${PYTHON_BIN}" -m venv "${OCR_VENV_DIR}" "${PYTHON_BIN}" -m venv "${OCR_VENV_DIR}"
"${OCR_VENV_DIR}/bin/pip" install --upgrade pip "${OCR_VENV_DIR}/bin/pip" install --upgrade pip

View File

@@ -272,7 +272,7 @@ run_bootstrap_python() {
} }
dependencies_ready() { dependencies_ready() {
"$PYTHON_BIN" -c "import alembic, dotenv, email_validator, fastapi, jwt, lightrag, multipart, openpyxl, psycopg, pydantic_settings, qdrant_client, sqlalchemy, uvicorn" >/dev/null 2>&1 "$PYTHON_BIN" -c "import alembic, dotenv, email_validator, fastapi, jieba, jwt, lightrag, multipart, openpyxl, psycopg, pydantic_settings, qdrant_client, sqlalchemy, uvicorn" >/dev/null 2>&1
} }
pip_ready() { pip_ready() {

View File

@@ -562,7 +562,7 @@ def _extract_document_fields(text: str, document_type: str = "") -> list[Documen
if date_value: if date_value:
append_field("date", "日期", date_value) append_field("date", "日期", date_value)
merchant = _extract_merchant(text) merchant = "中国铁路" if normalized_type == "train_ticket" else _extract_merchant(text)
if merchant: if merchant:
append_field("merchant_name", "商户", merchant) append_field("merchant_name", "商户", merchant)

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import base64 import base64
import binascii import binascii
import mimetypes import mimetypes
import os
import re import re
import shutil import shutil
import subprocess import subprocess
@@ -11,9 +12,19 @@ from pathlib import Path
class DocumentPreviewAssets: class DocumentPreviewAssets:
PDF_RENDERER_ID = "pdftoppm-png-r160-poppler-data" PDF_RENDERER_ID = "pdf-raster-cjk-safe-v3"
PDF_PREVIEW_MEDIA_TYPE = "image/png" PDF_PREVIEW_MEDIA_TYPE = "image/png"
PDF_PREVIEW_SUFFIX = ".png" PDF_PREVIEW_SUFFIX = ".png"
PDF_UNUSABLE_PREVIEW_ERRORS = (
"Missing language pack",
"Unknown font tag",
"No font in show",
)
POPPLER_DATA_DIR_CANDIDATES = (
"/usr/share/poppler",
"/usr/local/share/poppler",
"/opt/homebrew/share/poppler",
)
@staticmethod @staticmethod
def decode_data_url(payload: str) -> tuple[str, bytes] | None: def decode_data_url(payload: str) -> tuple[str, bytes] | None:
@@ -64,30 +75,117 @@ class DocumentPreviewAssets:
) -> Path: ) -> Path:
preview_path.parent.mkdir(parents=True, exist_ok=True) preview_path.parent.mkdir(parents=True, exist_ok=True)
with tempfile.TemporaryDirectory(prefix=".pdf-preview-", dir=str(preview_path.parent)) as temp_dir: with tempfile.TemporaryDirectory(prefix=".pdf-preview-", dir=str(preview_path.parent)) as temp_dir:
prefix = Path(temp_dir) / "page" pages = cls.render_pdf_pages(
pdf_path=pdf_path,
output_dir=Path(temp_dir),
timeout_seconds=timeout_seconds,
)
shutil.copyfile(pages[0], preview_path)
return preview_path
@classmethod
def render_pdf_pages(
cls,
*,
pdf_path: Path,
output_dir: Path,
timeout_seconds: int | float,
resolution: int = 160,
) -> list[Path]:
output_dir.mkdir(parents=True, exist_ok=True)
errors: list[str] = []
for renderer_name, command in cls._pdf_render_commands(
pdf_path=pdf_path,
output_dir=output_dir,
resolution=resolution,
):
executable = shutil.which(renderer_name)
if not executable:
errors.append(f"{renderer_name}: executable not found")
continue
cls._clear_rendered_pdf_pages(output_dir)
command[0] = executable
completed = subprocess.run( completed = subprocess.run(
[ command,
"pdftoppm",
"-png",
"-r",
"160",
str(pdf_path),
str(prefix),
],
capture_output=True, capture_output=True,
text=True, text=True,
timeout=timeout_seconds, timeout=timeout_seconds,
check=False, check=False,
env=cls._pdf_render_env(),
) )
if completed.returncode != 0:
detail = (completed.stderr or completed.stdout or "").strip() detail = (completed.stderr or completed.stdout or "").strip()
raise RuntimeError(detail or "pdftoppm failed to render PDF preview.") if completed.returncode != 0:
errors.append(f"{renderer_name}: {detail or 'renderer returned non-zero status'}")
continue
if cls.render_output_indicates_unusable_pdf_preview(detail):
errors.append(f"{renderer_name}: {detail or 'renderer produced unusable output'}")
continue
pages = sorted(Path(temp_dir).glob("page-*.png"), key=cls._extract_pdf_page_sort_key) pages = sorted(output_dir.glob("page-*.png"), key=cls._extract_pdf_page_sort_key)
if not pages: if pages:
raise RuntimeError("pdftoppm did not generate a preview image.") return pages
shutil.copyfile(pages[0], preview_path) errors.append(f"{renderer_name}: renderer did not generate PNG pages")
return preview_path
cls._clear_rendered_pdf_pages(output_dir)
detail = "".join(errors[-3:])
raise RuntimeError(detail or "no PDF renderer generated usable PNG pages")
@classmethod
def render_output_indicates_unusable_pdf_preview(cls, output: str) -> bool:
    """Report whether renderer output contains a known "unusable preview" marker.

    Args:
        output: Text captured from a PDF renderer; falsy values are treated
            as an empty string.

    Returns:
        True when any entry of ``PDF_UNUSABLE_PREVIEW_ERRORS`` occurs as a
        substring of the output, otherwise False.
    """
    captured = str(output or "")
    for marker in cls.PDF_UNUSABLE_PREVIEW_ERRORS:
        if marker in captured:
            return True
    return False
@classmethod
def _pdf_render_commands(
cls,
*,
pdf_path: Path,
output_dir: Path,
resolution: int,
) -> list[tuple[str, list[str]]]:
prefix = output_dir / "page"
page_pattern = output_dir / "page-%d.png"
return [
(
"pdftoppm",
["pdftoppm", "-png", "-r", str(resolution), str(pdf_path), str(prefix)],
),
(
"mutool",
["mutool", "draw", "-r", str(resolution), "-o", str(page_pattern), str(pdf_path)],
),
(
"gs",
[
"gs",
"-dSAFER",
"-dBATCH",
"-dNOPAUSE",
"-sDEVICE=png16m",
f"-r{resolution}",
f"-sOutputFile={page_pattern}",
str(pdf_path),
],
),
(
"pdftocairo",
["pdftocairo", "-png", "-r", str(resolution), str(pdf_path), str(prefix)],
),
]
@classmethod
def _pdf_render_env(cls) -> dict[str, str]:
env = os.environ.copy()
for candidate in cls.POPPLER_DATA_DIR_CANDIDATES:
if (Path(candidate) / "cMap").exists():
env.setdefault("POPPLER_DATADIR", candidate)
break
return env
@staticmethod
def _clear_rendered_pdf_pages(output_dir: Path) -> None:
for page in output_dir.glob("page-*.png"):
page.unlink(missing_ok=True)
@staticmethod @staticmethod
def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]: def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:

View File

@@ -9,7 +9,10 @@ from sqlalchemy import or_, select
from app.models.financial_record import ExpenseClaim from app.models.financial_record import ExpenseClaim
from app.services.expense_claim_risk_stage import with_risk_business_stage from app.services.expense_claim_risk_stage import with_risk_business_stage
from app.services.expense_claim_workflow_constants import APPLICATION_ARCHIVE_STAGE from app.services.expense_claim_workflow_constants import (
APPLICATION_ARCHIVE_STAGE,
APPLICATION_LINK_STATUS_STAGE,
)
APPLICATION_REIMBURSEMENT_TYPE_MAP = { APPLICATION_REIMBURSEMENT_TYPE_MAP = {
@@ -248,3 +251,151 @@ class ExpenseClaimApplicationHandoffMixin:
) )
return archived_applications return archived_applications
@staticmethod
def _reference_matches_deleted_reimbursement(
flag: dict[str, Any],
*,
reimbursement_claim_id: str,
reimbursement_claim_no: str,
) -> bool:
reference_ids = {
str(flag.get(key) or "").strip()
for key in (
"generated_draft_claim_id",
"generatedDraftClaimId",
"reimbursement_claim_id",
"reimbursementClaimId",
)
}
reference_nos = {
str(flag.get(key) or "").strip().upper()
for key in (
"generated_draft_claim_no",
"generatedDraftClaimNo",
"reimbursement_claim_no",
"reimbursementClaimNo",
)
}
return (
bool(reimbursement_claim_id and reimbursement_claim_id in reference_ids)
or bool(reimbursement_claim_no and reimbursement_claim_no.upper() in reference_nos)
)
@classmethod
def _remove_deleted_reimbursement_link_references(
cls,
risk_flags: list[Any],
*,
reimbursement_claim_id: str,
reimbursement_claim_no: str,
) -> tuple[list[Any], bool]:
next_flags: list[Any] = []
changed = False
stale_link_keys = {
"generated_draft_claim_id",
"generatedDraftClaimId",
"generated_draft_claim_no",
"generatedDraftClaimNo",
"reimbursement_claim_id",
"reimbursementClaimId",
"reimbursement_claim_no",
"reimbursementClaimNo",
"handoff_event_type",
"handoffEventType",
"handoff_message",
"handoffMessage",
}
for flag in list(risk_flags or []):
if not isinstance(flag, dict):
next_flags.append(flag)
continue
if not cls._reference_matches_deleted_reimbursement(
flag,
reimbursement_claim_id=reimbursement_claim_id,
reimbursement_claim_no=reimbursement_claim_no,
):
next_flags.append(flag)
continue
next_flag = dict(flag)
for key in stale_link_keys:
if key in next_flag:
next_flag.pop(key, None)
changed = True
next_flags.append(next_flag)
return next_flags, changed
def _sync_linked_applications_after_reimbursement_deleted(
    self,
    *,
    reimbursement_claim: ExpenseClaim,
    operator: str,
    current_user: Any,
) -> list[dict[str, str]]:
    """Reset applications linked to a reimbursement claim that is being deleted.

    For every application returned by ``_find_linked_application_claims`` the
    method removes link references to the deleted reimbursement from the
    risk flags, appends a trace flag describing the deletion, sets the
    application to status ``"approved"`` at ``APPLICATION_LINK_STATUS_STAGE``
    and writes an audit-log entry with before/after snapshots.

    Args:
        reimbursement_claim: The reimbursement claim being deleted.
        operator: Display name recorded on the trace flag and audit entry.
        current_user: Source of ``username`` and ``role_codes`` for the trace.

    Returns:
        One summary dict per synced application (claim id, claim number and
        the next approval stage).
    """
    deleted_id = str(reimbursement_claim.id or "").strip()
    deleted_no = str(reimbursement_claim.claim_no or "").strip()
    deleted_reference = deleted_no or deleted_id
    summaries: list[dict[str, str]] = []

    for application in self._find_linked_application_claims(reimbursement_claim):
        status_before = str(application.status or "").strip()
        stage_before = str(application.approval_stage or "").strip()
        # Snapshot before any mutation so the audit entry shows the real prior state.
        snapshot_before = self._serialize_claim(application)
        remaining_flags, links_removed = self._remove_deleted_reimbursement_link_references(
            list(application.risk_flags_json or []),
            reimbursement_claim_id=deleted_id,
            reimbursement_claim_no=deleted_no,
        )
        normalized_role_codes = [
            str(code).strip().lower()
            for code in getattr(current_user, "role_codes", [])
            if str(code).strip()
        ]
        trace_payload = {
            "source": "application_link_sync",
            "event_type": "expense_application_reimbursement_deleted",
            "sync_event_id": str(uuid.uuid4()),
            "severity": "info",
            "actionability": "system_trace",
            "label": "关联报销单已删除",
            "message": (
                f"关联报销单 {deleted_reference} 已删除,"
                "申请单已回到待关联状态。"
            ),
            "operator": operator,
            "operator_username": getattr(current_user, "username", ""),
            "operator_role_codes": normalized_role_codes,
            "application_claim_id": application.id,
            "application_claim_no": application.claim_no,
            "deleted_reimbursement_claim_id": deleted_id,
            "deleted_reimbursement_claim_no": deleted_no,
            "previous_status": status_before,
            "previous_approval_stage": stage_before,
            "next_status": "approved",
            "next_approval_stage": APPLICATION_LINK_STATUS_STAGE,
            "removed_link_references": links_removed,
            "created_at": datetime.now(UTC).isoformat(),
        }
        trace_flag = with_risk_business_stage(trace_payload, "expense_application")

        application.status = "approved"
        application.approval_stage = APPLICATION_LINK_STATUS_STAGE
        application.risk_flags_json = [*remaining_flags, trace_flag]

        summaries.append(
            {
                "application_claim_id": application.id,
                "application_claim_no": str(application.claim_no or "").strip(),
                "next_approval_stage": APPLICATION_LINK_STATUS_STAGE,
            }
        )
        self.audit_service.log_action(
            actor=operator,
            action="expense_application.unlink_deleted_reimbursement",
            resource_type="expense_claim",
            resource_id=application.id,
            before_json=snapshot_before,
            after_json=self._serialize_claim(application),
        )
    return summaries

View File

@@ -714,6 +714,17 @@ class ExpenseClaimAttachmentOperationsMixin:
timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds, timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds,
) )
except Exception: except Exception:
metadata.update(
{
"previewable": True,
"preview_kind": "pdf",
"preview_storage_key": self._attachment_storage.to_storage_key(file_path),
"preview_media_type": "application/pdf",
"preview_file_name": file_path.name,
"preview_rendered_with": "",
}
)
self._attachment_storage.write_meta(file_path, metadata)
return metadata return metadata
metadata.update( metadata.update(

View File

@@ -827,8 +827,8 @@ class ExpenseClaimDraftFlowMixin(ExpenseClaimApplicationLinkMixin, ExpenseClaimD
document_count = max(len(context_documents), len(attachment_names), self._resolve_attachment_count(context_json)) document_count = max(len(context_documents), len(attachment_names), self._resolve_attachment_count(context_json))
return { return {
"message": ( "message": (
f"检测到已有草稿 {association_candidate.claim_no}" f"检测到已有草稿 {association_candidate.claim_no}"
f"当前新上传了 {document_count} 张票据,请先选择关联到现有草稿,或单独建立新的报销单。" f"当前新上传了 {document_count} 张票据,请先选择关联到现有草稿,或单独新建一张报销单。"
), ),
"draft_only": False, "draft_only": False,
"status": "pending_association_decision", "status": "pending_association_decision",
@@ -859,7 +859,7 @@ class ExpenseClaimDraftFlowMixin(ExpenseClaimApplicationLinkMixin, ExpenseClaimD
if existing_draft_count >= MAX_DRAFT_CLAIMS_PER_USER: if existing_draft_count >= MAX_DRAFT_CLAIMS_PER_USER:
return { return {
"message": ( "message": (
f"当前已保存 {MAX_DRAFT_CLAIMS_PER_USER} 个草稿,请先完成已保存的草稿," f"当前已保存 {MAX_DRAFT_CLAIMS_PER_USER} 个草稿,请先完成已保存的草稿,"
"才能再次新建草稿。" "才能再次新建草稿。"
), ),
"draft_limit_reached": True, "draft_limit_reached": True,

View File

@@ -688,6 +688,13 @@ class ExpenseClaimItemActionMixin:
before_json = self._serialize_claim(claim) before_json = self._serialize_claim(claim)
resource_id = claim.id resource_id = claim.id
operator = self._access_policy.resolve_current_user_display_name(current_user)
if not self._is_expense_application_claim(claim):
self._sync_linked_applications_after_reimbursement_deleted(
reimbursement_claim=claim,
operator=operator,
current_user=current_user,
)
self._release_budget_for_delete(claim, current_user) self._release_budget_for_delete(claim, current_user)
self._delete_claim_analysis_records(resource_id) self._delete_claim_analysis_records(resource_id)
@@ -1008,4 +1015,3 @@ class ExpenseClaimService(ExpenseClaimStandardAdjustmentMixin, ExpenseClaimItemA
) )
return claim return claim

View File

@@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
from datetime import UTC, datetime from datetime import UTC, datetime
from threading import Lock
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
@@ -16,11 +17,23 @@ from app.schemas.notification_state import (
class NotificationStateService: class NotificationStateService:
_storage_ready_bind_ids: set[int] = set()
_storage_ready_lock = Lock()
def __init__(self, db: Session) -> None: def __init__(self, db: Session) -> None:
self.db = db self.db = db
def ensure_storage_ready(self) -> None: def ensure_storage_ready(self) -> None:
Base.metadata.create_all(bind=self.db.get_bind(), tables=[NotificationState.__table__]) bind = self.db.get_bind()
bind_id = id(bind)
if bind_id in self._storage_ready_bind_ids:
return
with self._storage_ready_lock:
if bind_id in self._storage_ready_bind_ids:
return
Base.metadata.create_all(bind=bind, tables=[NotificationState.__table__])
self._storage_ready_bind_ids.add(bind_id)
def list_states(self, current_user: CurrentUserContext) -> NotificationStateListRead: def list_states(self, current_user: CurrentUserContext) -> NotificationStateListRead:
self.ensure_storage_ready() self.ensure_storage_ready()

View File

@@ -16,11 +16,13 @@ from sqlalchemy.orm import Session
from app.core.config import SERVER_DIR, get_settings from app.core.config import SERVER_DIR, get_settings
from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead, OcrRecognizeLineRead from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead, OcrRecognizeLineRead
from app.services.document_preview import DocumentPreviewAssets
from app.services.document_intelligence import DocumentIntelligenceService from app.services.document_intelligence import DocumentIntelligenceService
WORKER_JSON_PREFIX = "__OCR_JSON__=" WORKER_JSON_PREFIX = "__OCR_JSON__="
SUPPORTED_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".pdf"} SUPPORTED_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".pdf"}
OCR_RESULT_CACHE_LIMIT = 32 OCR_RESULT_CACHE_LIMIT = 32
OCR_RESULT_CACHE_PIPELINE_VERSION = f"pdf-image-ocr:{DocumentPreviewAssets.PDF_RENDERER_ID}:no-pdf-direct-v2"
@dataclass(slots=True) @dataclass(slots=True)
@@ -142,16 +144,6 @@ class OcrService:
cleanup_paths=cleanup_paths, cleanup_paths=cleanup_paths,
text_layer=text_layer, text_layer=text_layer,
) )
if self._has_usable_pdf_text_layer(text_layer):
document = self._build_text_layer_document(
filename=normalized_name,
media_type=resolved_media_type,
text_layer=text_layer,
pdf_inputs=pdf_inputs,
)
documents.append(document)
self._write_cached_document(cache_key, document)
continue
prepared_inputs.extend(pdf_inputs) prepared_inputs.extend(pdf_inputs)
for item in pdf_inputs: for item in pdf_inputs:
cache_keys_by_source.setdefault(item.source_key, cache_key) cache_keys_by_source.setdefault(item.source_key, cache_key)
@@ -257,6 +249,7 @@ class OcrService:
digest = hashlib.sha256(content).hexdigest() digest = hashlib.sha256(content).hexdigest()
return "|".join( return "|".join(
[ [
OCR_RESULT_CACHE_PIPELINE_VERSION,
self.settings.ocr_language, self.settings.ocr_language,
self.settings.ocr_device, self.settings.ocr_device,
self.settings.ocr_text_detection_model, self.settings.ocr_text_detection_model,
@@ -406,11 +399,15 @@ class OcrService:
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
cleanup_paths.append(output_dir) cleanup_paths.append(output_dir)
image_paths = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir) image_paths, preview_usable = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir)
if not image_paths: if not image_paths:
raise RuntimeError("PDF 转图片后未生成可识别页面。") raise RuntimeError("PDF 转图片后未生成可识别页面。")
preview_data_url = self._build_preview_data_url(image_paths[0], media_type="image/png") preview_data_url = (
self._build_preview_data_url(image_paths[0], media_type="image/png")
if preview_usable
else ""
)
source_key = uuid4().hex source_key = uuid4().hex
descriptors: list[PreparedOcrInput] = [] descriptors: list[PreparedOcrInput] = []
for page_index, image_path in enumerate(image_paths): for page_index, image_path in enumerate(image_paths):
@@ -421,7 +418,7 @@ class OcrService:
filename=filename, filename=filename,
media_type=media_type, media_type=media_type,
page_index=page_index, page_index=page_index,
preview_kind="image" if page_index == 0 else "", preview_kind="image" if page_index == 0 and preview_data_url else "",
preview_data_url=preview_data_url if page_index == 0 else "", preview_data_url=preview_data_url if page_index == 0 else "",
text_layer=text_layer if page_index == 0 else "", text_layer=text_layer if page_index == 0 else "",
) )
@@ -450,27 +447,17 @@ class OcrService:
return self._normalize_extracted_text(completed.stdout) return self._normalize_extracted_text(completed.stdout)
def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]: def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
prefix = output_dir / "page" try:
completed = subprocess.run( pages = DocumentPreviewAssets.render_pdf_pages(
[ pdf_path=pdf_path,
"pdftoppm", output_dir=output_dir,
"-png", timeout_seconds=self.settings.ocr_timeout_seconds,
"-r",
"160",
str(pdf_path),
str(prefix),
],
capture_output=True,
text=True,
timeout=self.settings.ocr_timeout_seconds,
check=False,
) )
if completed.returncode != 0: except RuntimeError as exc:
detail = (completed.stderr or completed.stdout or "").strip() raise RuntimeError(f"PDF 转图片失败:{exc}") from exc
raise RuntimeError(f"PDF 转图片失败:{detail or 'pdftoppm 返回非 0 状态码。'}")
return sorted(output_dir.glob("page-*.png"), key=self._extract_pdf_page_sort_key) return pages, True
@staticmethod @staticmethod
def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]: def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]:
@@ -595,30 +582,6 @@ class OcrService:
return documents return documents
def _build_text_layer_document(
self,
*,
filename: str,
media_type: str,
text_layer: str,
pdf_inputs: list[PreparedOcrInput],
) -> OcrRecognizeDocumentRead:
first_input = pdf_inputs[0] if pdf_inputs else None
aggregated = AggregatedOcrDocument(
filename=filename,
media_type=media_type,
source_key=first_input.source_key if first_input is not None else uuid4().hex,
page_count=max(1, len(pdf_inputs)),
preview_kind=str(first_input.preview_kind if first_input is not None else ""),
preview_data_url=str(first_input.preview_data_url if first_input is not None else ""),
)
aggregated.text_layer_fragments.append(text_layer)
return self._finalize_document(aggregated)
@classmethod
def _has_usable_pdf_text_layer(cls, text_layer: str) -> bool:
return cls._meaningful_char_count(text_layer) >= 8
@staticmethod @staticmethod
def _collect_descriptor_text_layer(descriptors: list[PreparedOcrInput]) -> str: def _collect_descriptor_text_layer(descriptors: list[PreparedOcrInput]) -> str:
for descriptor in descriptors: for descriptor in descriptors:
@@ -685,13 +648,6 @@ class OcrService:
summary = self._summarize_text(full_text) summary = self._summarize_text(full_text)
preview_kind = aggregated.preview_kind preview_kind = aggregated.preview_kind
preview_data_url = aggregated.preview_data_url preview_data_url = aggregated.preview_data_url
if (
used_text_layer
and aggregated.media_type == "application/pdf"
and self._placeholder_ratio(ocr_text) >= 0.12
):
preview_kind = ""
preview_data_url = ""
insight = self.document_intelligence_service.build_document_insight( insight = self.document_intelligence_service.build_document_insight(
filename=aggregated.filename, filename=aggregated.filename,
summary=summary, summary=summary,

View File

@@ -214,7 +214,7 @@ class OntologyValidationMixin:
labels = [self._display_slot_label(item) for item in missing_slots[:4]] labels = [self._display_slot_label(item) for item in missing_slots[:4]]
if not labels: if not labels:
return "请补充更多上下文后再继续。" return "请补充更多上下文后再继续。"
return f"请补充{''.join(labels)},我继续帮解析和处理。" return f"请补充{''.join(labels)},我继续帮解析和处理。"
@staticmethod @staticmethod
def _compute_confidence( def _compute_confidence(

View File

@@ -570,7 +570,7 @@ class OrchestratorDatabaseQueryBuilder:
scoped_to_current_user = True scoped_to_current_user = True
else: else:
conditions.append(ExpenseClaim.id == "__no_visible_claim__") conditions.append(ExpenseClaim.id == "__no_visible_claim__")
scope_label = "的报销单" scope_label = "的报销单"
scoped_to_current_user = True scoped_to_current_user = True
elif explicit_employee_names: elif explicit_employee_names:
conditions.append(ExpenseClaim.employee_name.in_(explicit_employee_names)) conditions.append(ExpenseClaim.employee_name.in_(explicit_employee_names))
@@ -586,7 +586,7 @@ class OrchestratorDatabaseQueryBuilder:
scoped_to_current_user = True scoped_to_current_user = True
else: else:
conditions.append(ExpenseClaim.id == "__no_visible_claim__") conditions.append(ExpenseClaim.id == "__no_visible_claim__")
scope_label = "的报销单" scope_label = "的报销单"
scoped_to_current_user = True scoped_to_current_user = True
else: else:
scope_label = "全部报销单" scope_label = "全部报销单"
@@ -703,7 +703,7 @@ class OrchestratorDatabaseQueryBuilder:
subject_name = (employee.name if employee is not None else "") or normalized_user_id subject_name = (employee.name if employee is not None else "") or normalized_user_id
if subject_name: if subject_name:
return conditions, "的报销单" return conditions, "的报销单"
return conditions, "当前用户的报销单" return conditions, "当前用户的报销单"
def _employee_name_is_unique(self, employee: Employee) -> bool: def _employee_name_is_unique(self, employee: Employee) -> bool:

View File

@@ -37,10 +37,18 @@ TRAIN_ROUTE_PATTERN = re.compile(
r"([\u4e00-\u9fa5]{2,12})站?\s*(?:至|到|→|->|—||-)\s*" r"([\u4e00-\u9fa5]{2,12})站?\s*(?:至|到|→|->|—||-)\s*"
r"([\u4e00-\u9fa5]{2,12})站?" r"([\u4e00-\u9fa5]{2,12})站?"
) )
TRAIN_ROUTE_WITH_NO_PATTERN = re.compile(
r"([\u4e00-\u9fa5]{2,12})站?\s+[GCDZKTLYS]\d{1,5}\s+"
r"([\u4e00-\u9fa5]{2,12})站?",
re.IGNORECASE,
)
TRAIN_NO_PATTERN = re.compile(r"(?:车次|列车号)\s*[:]?\s*([GCDZKTLYS]\d{1,5})", re.IGNORECASE) TRAIN_NO_PATTERN = re.compile(r"(?:车次|列车号)\s*[:]?\s*([GCDZKTLYS]\d{1,5})", re.IGNORECASE)
TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Z0-9])([GCDZKTLYS]\d{1,5})(?![A-Z0-9])", re.IGNORECASE) TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(?<![A-Z0-9])([GCDZKTLYS]\d{1,5})(?![A-Z0-9])", re.IGNORECASE)
TRAIN_PASSENGER_PATTERN = re.compile(r"(?:乘车人|旅客姓名|姓名)\s*[:]?\s*([\u4e00-\u9fa5·]{2,20})") TRAIN_PASSENGER_PATTERN = re.compile(r"(?:乘车人|乘客|旅客姓名|姓名)\s*[:]?\s*([\u4e00-\u9fa5·]{2,20})")
TRAIN_ID_PATTERN = re.compile(r"(?:有效身份证件号码|身份证件号码|证件号码|身份证号)\s*[:]?\s*([0-9Xx*]{6,24})") TRAIN_PURCHASER_NAME_PATTERN = re.compile(
r"购买方名称\s*[:]?\s*([·\u4e00-\u9fa5]{2,20}?)(?=\s*(?:统一社会信用代码|纳税人识别号|$))"
)
TRAIN_ID_PATTERN = re.compile(r"(?:有效身份证件号码|身份证件号码|证件号码|身份证号|证件号)\s*[:]?\s*([0-9Xx*]{6,24})")
TRAIN_ID_FALLBACK_PATTERN = re.compile(r"(?<![0-9A-Za-z])([0-9]{6,17}[0-9Xx*]{2,8})(?![0-9A-Za-z])") TRAIN_ID_FALLBACK_PATTERN = re.compile(r"(?<![0-9A-Za-z])([0-9]{6,17}[0-9Xx*]{2,8})(?![0-9A-Za-z])")
TRAIN_ETICKET_PATTERN = re.compile(r"(?:电子客票号|客票号)\s*[:]?\s*([A-Z0-9]{6,32})", re.IGNORECASE) TRAIN_ETICKET_PATTERN = re.compile(r"(?:电子客票号|客票号)\s*[:]?\s*([A-Z0-9]{6,32})", re.IGNORECASE)
TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座|一等卧|二等卧|软卧|硬卧|软座|硬座|无座)") TRAIN_SEAT_CLASS_PATTERN = re.compile(r"(商务座|特等座|一等座|二等座|一等卧|二等卧|软卧|硬卧|软座|硬座|无座)")
@@ -50,6 +58,28 @@ TRAIN_COMBINED_SEAT_PATTERN = re.compile(r"([0-9]{1,2})车\s*([0-9]{1,3}[A-F])
TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE) TRAIN_LOOSE_SEAT_PATTERN = re.compile(r"(?<!\d)([0-9]{1,2})\s+([0-9]{1,3}[A-F])(?![A-Za-z0-9])", re.IGNORECASE)
TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)") TRAIN_FARE_PATTERN = re.compile(r"(?:票价|金额)\s*[::¥¥\s]*([0-9]+(?:[.,][0-9]{1,2})?)")
TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)") TRAIN_LOOSE_FARE_PATTERN = re.compile(r"(?<!\d)([0-9]{1,6}\.\d{1,2})(?!\d)")
# Structured-field keys whose values are treated as train station names.
TRAIN_STATION_FIELD_KEYS = {"departure_station", "arrival_station"}
# Display labels that mark the same station fields.
TRAIN_STATION_FIELD_LABELS = {"出发地点", "到达地点"}
# Substrings that disqualify a candidate station name.
# NOTE(review): the first two entries are empty strings as shown here. "" is a
# substring of every string, so a plain `token in value` check against this
# tuple rejects every value. Presumably these were special glyphs lost in
# transit — confirm against the real file.
TRAIN_INVALID_STATION_TOKENS = (
"",
"",
"扫码",
"无效",
"票价",
"金额",
"车厢",
"座位",
"乘客",
"证件",
"身份证",
"订单",
"单据",
"日期",
"渠道",
"官方",
"演示",
"不可报销",
)
class ReceiptFolderStorageMixin: class ReceiptFolderStorageMixin:
@@ -119,6 +149,29 @@ class ReceiptFolderStorageMixin:
"preview_media_type": preview_media_type, "preview_media_type": preview_media_type,
"preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type), "preview_rendered_with": DocumentPreviewAssets.renderer_id_for_source(media_type),
} }
if str(media_type or "").strip() == "application/pdf":
preview_path = receipt_dir / f"preview{DocumentPreviewAssets.PDF_PREVIEW_SUFFIX}"
try:
DocumentPreviewAssets.render_pdf_first_page(
pdf_path=source_path,
preview_path=preview_path,
timeout_seconds=get_settings().ocr_timeout_seconds,
)
except Exception:
return {
"previewable": True,
"preview_kind": "pdf",
"preview_file_name": source_path.name,
"preview_media_type": media_type,
"preview_rendered_with": "",
}
return {
"previewable": True,
"preview_kind": "image",
"preview_file_name": preview_path.name,
"preview_media_type": DocumentPreviewAssets.PDF_PREVIEW_MEDIA_TYPE,
"preview_rendered_with": DocumentPreviewAssets.PDF_RENDERER_ID,
}
if self._is_previewable(media_type): if self._is_previewable(media_type):
return { return {
"previewable": True, "previewable": True,
@@ -172,6 +225,16 @@ class ReceiptFolderStorageMixin:
timeout_seconds=get_settings().ocr_timeout_seconds, timeout_seconds=get_settings().ocr_timeout_seconds,
) )
except Exception: except Exception:
meta.update(
{
"previewable": True,
"preview_kind": "pdf",
"preview_file_name": source_path.name,
"preview_media_type": "application/pdf",
"preview_rendered_with": "",
}
)
self._write_meta(receipt_dir, meta)
return meta return meta
meta.update( meta.update(
@@ -543,6 +606,10 @@ class ReceiptFolderTrainTicketMixin:
value = str(field.get("value") or "").strip() value = str(field.get("value") or "").strip()
if not label or not value: if not label or not value:
continue continue
if key == "merchant_name" or label == "商户":
continue
if not cls._should_keep_train_ticket_field(key=key, label=label, value=value):
continue
if key == "trip_no" and label == "车次/航班": if key == "trip_no" and label == "车次/航班":
label = "车次" label = "车次"
if key == "route" and label == "行程": if key == "route" and label == "行程":
@@ -559,6 +626,8 @@ class ReceiptFolderTrainTicketMixin:
return return
normalized.append({"key": key, "label": label, "value": cleaned}) normalized.append({"key": key, "label": label, "value": cleaned})
add_field("merchant_name", "商户", "中国铁路")
invoice_date = cls._extract_train_invoice_date(text) invoice_date = cls._extract_train_invoice_date(text)
add_field("invoice_date", "开票日期", invoice_date) add_field("invoice_date", "开票日期", invoice_date)
@@ -690,6 +759,13 @@ class ReceiptFolderTrainTicketMixin:
@classmethod @classmethod
def _extract_train_route_points(cls, text: str) -> tuple[str, str]: def _extract_train_route_points(cls, text: str) -> tuple[str, str]:
raw_text = str(text or "") raw_text = str(text or "")
split_line_match = TRAIN_ROUTE_WITH_NO_PATTERN.search(raw_text)
if split_line_match:
departure = cls._clean_train_station(split_line_match.group(1))
arrival = cls._clean_train_station(split_line_match.group(2))
if cls._is_valid_train_station_value(departure) and cls._is_valid_train_station_value(arrival) and departure != arrival:
return departure, arrival
station_candidates: list[str] = [] station_candidates: list[str] = []
for line in raw_text.replace("\r", "\n").splitlines(): for line in raw_text.replace("\r", "\n").splitlines():
candidate = cls._clean_train_station(line) candidate = cls._clean_train_station(line)
@@ -697,7 +773,7 @@ class ReceiptFolderTrainTicketMixin:
continue continue
if not str(line or "").strip().endswith(""): if not str(line or "").strip().endswith(""):
continue continue
if any(token in candidate for token in ("发票", "客票", "铁路", "票价", "日期")): if not cls._is_valid_train_station_value(candidate):
continue continue
station_candidates.append(candidate) station_candidates.append(candidate)
if len(station_candidates) >= 2: if len(station_candidates) >= 2:
@@ -707,7 +783,7 @@ class ReceiptFolderTrainTicketMixin:
if match: if match:
departure = cls._clean_train_station(match.group(1)) departure = cls._clean_train_station(match.group(1))
arrival = cls._clean_train_station(match.group(2)) arrival = cls._clean_train_station(match.group(2))
if departure and arrival and departure != arrival: if cls._is_valid_train_station_value(departure) and cls._is_valid_train_station_value(arrival) and departure != arrival:
return departure, arrival return departure, arrival
return "", "" return "", ""
@@ -717,6 +793,25 @@ class ReceiptFolderTrainTicketMixin:
cleaned = re.sub(r"(?:火车站|高铁站|站)$", "", cleaned) cleaned = re.sub(r"(?:火车站|高铁站|站)$", "", cleaned)
return cleaned.strip() return cleaned.strip()
@classmethod
def _should_keep_train_ticket_field(cls, *, key: str, label: str, value: str) -> bool:
    """Decide whether an extracted train-ticket field should be kept.

    Station fields (matched by ``key`` against ``TRAIN_STATION_FIELD_KEYS`` or
    by ``label`` against ``TRAIN_STATION_FIELD_LABELS``) are kept only when
    ``_is_valid_train_station_value`` accepts the value.  The passenger field
    (key ``passenger_name`` or label ``乘车人``) is kept only when
    ``_clean_train_passenger_candidate`` returns a non-empty result.  Every
    other field is kept unconditionally.
    """
    is_station_field = key in TRAIN_STATION_FIELD_KEYS or label in TRAIN_STATION_FIELD_LABELS
    if is_station_field:
        return cls._is_valid_train_station_value(value)

    is_passenger_field = key == "passenger_name" or label == "乘车人"
    if not is_passenger_field:
        return True
    return bool(cls._clean_train_passenger_candidate(value))
@classmethod
def _is_valid_train_station_value(cls, value: str) -> bool:
    """Return True when ``value`` passes the station-name sanity checks.

    The value is first normalised with ``_clean_train_station``.  The cleaned
    text is accepted only if it is 2 to 12 characters long, contains none of
    the substrings in ``TRAIN_INVALID_STATION_TOKENS`` and has no ASCII
    letters or digits.
    """
    station = cls._clean_train_station(value)
    if len(station) < 2 or len(station) > 12:
        return False
    for token in TRAIN_INVALID_STATION_TOKENS:
        if token in station:
            return False
    # Any ASCII letter or digit disqualifies the candidate.
    return re.search(r"[A-Za-z0-9]", station) is None
@staticmethod @staticmethod
def _extract_first(pattern: re.Pattern[str], text: str) -> str: def _extract_first(pattern: re.Pattern[str], text: str) -> str:
match = pattern.search(str(text or "")) match = pattern.search(str(text or ""))
@@ -724,14 +819,19 @@ class ReceiptFolderTrainTicketMixin:
@classmethod @classmethod
def _extract_train_passenger_name(cls, text: str, *, id_number: str = "") -> str: def _extract_train_passenger_name(cls, text: str, *, id_number: str = "") -> str:
labeled = cls._extract_first(TRAIN_PASSENGER_PATTERN, text) lines = [line.strip() for line in str(text or "").replace("\r", "\n").splitlines() if line.strip()]
for line in lines:
labeled = cls._clean_train_passenger_candidate(cls._extract_first(TRAIN_PASSENGER_PATTERN, line))
if labeled: if labeled:
return labeled return labeled
lines = [line.strip() for line in str(text or "").replace("\r", "\n").splitlines() if line.strip()] if id_number:
for index, line in enumerate(lines): for index, line in enumerate(lines):
if id_number and id_number not in line: if id_number not in line:
continue continue
candidate = cls._clean_train_passenger_candidate(line.replace(id_number, " "))
if candidate:
return candidate
for offset in (1, -1, 2): for offset in (1, -1, 2):
target_index = index + offset target_index = index + offset
if target_index < 0 or target_index >= len(lines): if target_index < 0 or target_index >= len(lines):
@@ -740,8 +840,9 @@ class ReceiptFolderTrainTicketMixin:
if candidate: if candidate:
return candidate return candidate
for line in lines: for line in lines:
if "购买方名称" in line: purchase_match = TRAIN_PURCHASER_NAME_PATTERN.search(line)
candidate = cls._clean_train_passenger_candidate(line.split(":", 1)[-1].split("", 1)[-1]) if purchase_match:
candidate = cls._clean_train_passenger_candidate(purchase_match.group(1))
if candidate: if candidate:
return candidate return candidate
return "" return ""
@@ -764,6 +865,16 @@ class ReceiptFolderTrainTicketMixin:
"开票", "开票",
"日期", "日期",
"车厢", "车厢",
"席别",
"二等座",
"一等座",
"商务座",
"特等座",
"软座",
"硬座",
"无座",
"软卧",
"硬卧",
"座位", "座位",
"票价", "票价",
"金额", "金额",
@@ -771,6 +882,14 @@ class ReceiptFolderTrainTicketMixin:
"出发", "出发",
"到达", "到达",
"车次", "车次",
"公司",
"信用代码",
"纳税人",
"扫码",
"无效",
"二维码",
"座席",
"证件",
) )
): ):
return "" return ""

View File

@@ -266,7 +266,7 @@ class StewardModelPlanBuilder:
event_id="intent_agent_function_call", event_id="intent_agent_function_call",
stage="llm_function_call", stage="llm_function_call",
title="识别财务事项", title="识别财务事项",
content="我识别到这句话包含出差事项,但还需要确认要进入申请流程还是报销流程。", content="我识别到这句话包含出差事项,但还需要确认您是要进入申请流程还是报销流程。",
) )
] ]
raw_events = payload.get("thinking_events") raw_events = payload.get("thinking_events")
@@ -292,7 +292,7 @@ class StewardModelPlanBuilder:
event_id="intent_agent_pending_flow", event_id="intent_agent_pending_flow",
stage="flow_confirmation", stage="flow_confirmation",
title="等待确认流程方向", title="等待确认流程方向",
content=f"当前输入“{request.message}”缺少明确动作词,需要先由选择补办出差申请发起费用报销。", content=f"当前输入“{request.message}”缺少明确动作词,需要先由选择补办出差申请,还是发起费用报销。",
) )
) )
return events return events
@@ -302,7 +302,7 @@ class StewardModelPlanBuilder:
candidate_labels = [item.label for item in pending_flow_confirmation.candidate_flows if item.label] candidate_labels = [item.label for item in pending_flow_confirmation.candidate_flows if item.label]
if len(candidate_labels) >= 2: if len(candidate_labels) >= 2:
return ( return (
f"我识别到这是一次财务事项,但还不能确定要做的是" f"我识别到这是一次财务事项,但还不能确定要做的是"
f"**{candidate_labels[0]}**还是**{candidate_labels[1]}**。请先选择一个方向。" f"**{candidate_labels[0]}**还是**{candidate_labels[1]}**。请先选择一个方向。"
) )
return "我识别到这是一次财务事项,但还需要先确认具体流程方向。" return "我识别到这是一次财务事项,但还需要先确认具体流程方向。"

View File

@@ -335,7 +335,7 @@ class StewardPlannerFallbackMixin:
flow_id="travel_application", flow_id="travel_application",
label="先发起出差申请", label="先发起出差申请",
confidence=0.86, confidence=0.86,
reason="已先查询名下可关联的差旅申请单,暂未查到可关联单据,因此应先申请单据。", reason="已先查询名下可关联的差旅申请单,暂未查到可关联单据,因此应先申请单据。",
ontology_fields=application_fields, ontology_fields=application_fields,
missing_fields=self._resolve_missing_fields("expense_application", application_fields), missing_fields=self._resolve_missing_fields("expense_application", application_fields),
) )
@@ -345,7 +345,7 @@ class StewardPlannerFallbackMixin:
if gate.get("checked"): if gate.get("checked"):
candidate_count = int(gate.get("candidate_count") or 0) candidate_count = int(gate.get("candidate_count") or 0)
reimbursement_label = "关联已有申请单并发起报销" reimbursement_label = "关联已有申请单并发起报销"
reimbursement_reason = f"已先查到 {candidate_count} 个可关联申请单,选择后会先请关联具体单据。" reimbursement_reason = f"已先查到 {candidate_count} 个可关联申请单,选择后会先请关联具体单据。"
return [ return [
StewardCandidateFlow( StewardCandidateFlow(
flow_id="travel_application", flow_id="travel_application",
@@ -390,10 +390,10 @@ class StewardPlannerFallbackMixin:
@staticmethod @staticmethod
def _build_pending_flow_reason(gate: dict[str, Any]) -> str: def _build_pending_flow_reason(gate: dict[str, Any]) -> str:
if gate.get("checked") and int(gate.get("candidate_count") or 0) <= 0: if gate.get("checked") and int(gate.get("candidate_count") or 0) <= 0:
return "我已先查询名下可关联的差旅申请单,未查到可关联单据,所以当前应先申请单据。" return "我已先查询名下可关联的差旅申请单,未查到可关联单据,所以当前应先申请单据。"
if gate.get("checked"): if gate.get("checked"):
candidate_count = int(gate.get("candidate_count") or 0) candidate_count = int(gate.get("candidate_count") or 0)
return f"我已先查询名下的差旅申请单,查到 {candidate_count} 个可关联申请单,需要确认是否关联单据后发起报销。" return f"我已先查询名下的差旅申请单,查到 {candidate_count} 个可关联申请单,需要确认是否关联单据后发起报销。"
return "当前话术描述了出差事项,但没有明确说明要补办申请还是发起报销。" return "当前话术描述了出差事项,但没有明确说明要补办申请还是发起报销。"
@staticmethod @staticmethod
@@ -404,10 +404,10 @@ class StewardPlannerFallbackMixin:
candidate_count = int(gate.get("candidate_count") or 0) candidate_count = int(gate.get("candidate_count") or 0)
return ( return (
f"我已先查询可关联申请单,查到 {candidate_count} 个可关联申请单;" f"我已先查询可关联申请单,查到 {candidate_count} 个可关联申请单;"
"可以选择关联已有申请单发起报销,改为补办新的出差申请。" "可以选择关联已有申请单发起报销,也可以改为补办新的出差申请。"
) )
return ( return (
"我识别到这是一次出差事项,但还不能确定要做的是" "我识别到这是一次出差事项,但还不能确定要做的是"
"**补办出差申请**还是**发起费用报销**。请先选择一个方向。" "**补办出差申请**还是**发起费用报销**。请先选择一个方向。"
) )

View File

@@ -90,7 +90,7 @@ class StewardRuntimeDecisionAgent:
next_action="continue_selected_flow", next_action="continue_selected_flow",
target_task_id=selected_flow_id, target_task_id=selected_flow_id,
response_text=self._build_selected_flow_response_text(selected_flow_id), response_text=self._build_selected_flow_response_text(selected_flow_id),
rationale="已按选择的候选流程继续处理。", rationale="已按选择的候选流程继续处理。",
steward_state=next_state, steward_state=next_state,
model_call_traces=traces, model_call_traces=traces,
) )
@@ -268,7 +268,7 @@ class StewardRuntimeDecisionAgent:
next_action="submit_current_application", next_action="submit_current_application",
target_message_id=str(pending_application.get("message_id") or ""), target_message_id=str(pending_application.get("message_id") or ""),
target_task_id=str(pending_application.get("task_id") or ""), target_task_id=str(pending_application.get("task_id") or ""),
rationale="模型运行时决策暂不可用,我先按当前待提交申请单上下文处理的确认。", rationale="模型运行时决策暂不可用,我先按当前待提交申请单上下文处理的确认。",
model_call_traces=traces, model_call_traces=traces,
) )
if confirmation_text and pending_steward_action: if confirmation_text and pending_steward_action:
@@ -295,7 +295,7 @@ class StewardRuntimeDecisionAgent:
target_task_id=str(current_task.get("task_id") or ""), target_task_id=str(current_task.get("task_id") or ""),
field_key=field_key, field_key=field_key,
field_value=request.user_message, field_value=request.user_message,
rationale="模型运行时决策暂不可用,我先把的补充写入当前小财管家流程字段。", rationale="模型运行时决策暂不可用,我先把的补充写入当前小财管家流程字段。",
model_call_traces=traces, model_call_traces=traces,
) )
if field_key: if field_key:

View File

@@ -275,7 +275,7 @@ class StewardSlotDecisionAgent:
missing_fields=missing_fields, missing_fields=missing_fields,
question=self._build_fallback_question(field), question=self._build_fallback_question(field),
options=self._sanitize_options([], [field]), options=self._sanitize_options([], [field]),
rationale="模型字段决策暂不可用,我先按上游意图识别给出的本体缺口向确认。", rationale="模型字段决策暂不可用,我先按上游意图识别给出的本体缺口向确认。",
model_call_traces=traces, model_call_traces=traces,
) )
return StewardSlotDecisionResponse( return StewardSlotDecisionResponse(
@@ -285,7 +285,7 @@ class StewardSlotDecisionAgent:
missing_fields=[], missing_fields=[],
question="", question="",
options=[], options=[],
rationale="当前任务没有上游标记的关键字段缺口,可以先生成核对结果供确认。", rationale="当前任务没有上游标记的关键字段缺口,可以先生成核对结果供确认。",
model_call_traces=traces, model_call_traces=traces,
) )
@@ -293,7 +293,7 @@ class StewardSlotDecisionAgent:
def _build_fallback_question(field: str) -> str: def _build_fallback_question(field: str) -> str:
label = FIELD_CATALOG.get(field, {}).get("label") or field label = FIELD_CATALOG.get(field, {}).get("label") or field
if field == "transport_mode": if field == "transport_mode":
return "请问这次打算怎么出行?可以选择火车、飞机或轮船。" return "请问这次打算怎么出行?可以选择火车、飞机或轮船。"
return f"当前还缺少{label},请先补充后我再继续处理。" return f"当前还缺少{label},请先补充后我再继续处理。"
@staticmethod @staticmethod

View File

@@ -15,14 +15,16 @@ from app.schemas.reimbursement import (
TravelReimbursementCalculatorResponse, TravelReimbursementCalculatorResponse,
) )
from app.services.agent_assets import AgentAssetService from app.services.agent_assets import AgentAssetService
from app.services.application_location_semantics import validate_application_location_text
from app.services.expense_claims import ExpenseClaimService from app.services.expense_claims import ExpenseClaimService
from app.services.expense_rule_runtime import RuntimeTravelPolicy, ExpenseRuleRuntimeService from app.services.expense_rule_runtime import ExpenseRuleRuntimeService, RuntimeTravelPolicy
from app.services.travel_policy_grades import travel_policy_grade_key_candidates from app.services.travel_policy_grades import travel_policy_grade_key_candidates
from app.services.travel_reimbursement_regions import ( from app.services.travel_reimbursement_regions import (
AMBIGUOUS_PROVINCE_CITY_NAMES, AMBIGUOUS_PROVINCE_CITY_NAMES,
OTHER_REGION_LOCATION_KEYWORDS, OTHER_REGION_LOCATION_KEYWORDS,
OTHER_REGION_PROVINCE_KEYWORDS, OTHER_REGION_PROVINCE_KEYWORDS,
) )
from app.services.user_agent_application_locations import normalize_application_location
class TravelReimbursementCalculatorService: class TravelReimbursementCalculatorService:
@@ -35,9 +37,13 @@ class TravelReimbursementCalculatorService:
current_user: CurrentUserContext, current_user: CurrentUserContext,
) -> TravelReimbursementCalculatorResponse: ) -> TravelReimbursementCalculatorResponse:
days = max(1, int(payload.days)) days = max(1, int(payload.days))
location = str(payload.location or "").strip() raw_location = str(payload.location or "").strip()
if not location: if not raw_location:
raise ValueError("请先填写出差地点。") raise ValueError("请先填写出差地点。")
location = normalize_application_location(raw_location) or raw_location
location_error = validate_application_location_text(location)
if location_error:
raise ValueError(f"{location_error}请填写真实出差地点后再计算。")
policy = self._load_travel_policy() policy = self._load_travel_policy()
grade = self._resolve_grade(payload.grade, current_user) grade = self._resolve_grade(payload.grade, current_user)

View File

@@ -8,20 +8,25 @@ from sqlalchemy import or_, select
from app.api.deps import CurrentUserContext from app.api.deps import CurrentUserContext
from app.models.financial_record import ExpenseClaim from app.models.financial_record import ExpenseClaim
from app.schemas.reimbursement import TravelReimbursementCalculatorRequest
from app.schemas.user_agent import ( from app.schemas.user_agent import (
UserAgentDraftPayload, UserAgentDraftPayload,
UserAgentRequest, UserAgentRequest,
UserAgentResponse, UserAgentResponse,
UserAgentSuggestedAction, UserAgentSuggestedAction,
) )
from app.schemas.reimbursement import TravelReimbursementCalculatorRequest from app.services.application_location_semantics import (
from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy strip_route_location_prefix_with_jieba,
from app.services.expense_claim_risk_stage import with_risk_business_stage validate_application_location_text,
from app.services.travel_reimbursement_calculator import TravelReimbursementCalculatorService )
from app.services.application_system_estimate import apply_application_system_estimate_to_facts
from app.services.document_numbering import ( from app.services.document_numbering import (
build_document_number, build_document_number,
generate_unique_expense_claim_no, generate_unique_expense_claim_no,
) )
from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy
from app.services.expense_claim_risk_stage import with_risk_business_stage
from app.services.travel_reimbursement_calculator import TravelReimbursementCalculatorService
from app.services.user_agent_application_dates import ( from app.services.user_agent_application_dates import (
expand_application_time_with_days, expand_application_time_with_days,
resolve_application_date_range, resolve_application_date_range,
@@ -33,7 +38,6 @@ from app.services.user_agent_application_summary import (
build_application_summary_table, build_application_summary_table,
resolve_application_time_label, resolve_application_time_label,
) )
from app.services.application_system_estimate import apply_application_system_estimate_to_facts
APPLICATION_CONTEXT_VALUES = { APPLICATION_CONTEXT_VALUES = {
"application", "application",
@@ -182,6 +186,17 @@ class UserAgentApplicationSlotMixin:
if not str(facts.get(field) or "").strip() if not str(facts.get(field) or "").strip()
] ]
@staticmethod
def _resolve_application_validation_issues(facts: dict[str, str]) -> list[dict[str, str]]:
    """Collect validation problems found in the application ``facts``.

    Only the ``location`` fact is checked here, by passing it (or an empty
    string when absent) to ``validate_application_location_text``.  A truthy
    result is reported as one issue with the keys ``field`` and ``message``;
    otherwise an empty list is returned.
    """
    location_error = validate_application_location_text(facts.get("location", ""))
    if not location_error:
        return []
    return [{"field": "location", "message": location_error}]
def _resolve_application_missing_fields(self, facts: dict[str, str]) -> list[str]: def _resolve_application_missing_fields(self, facts: dict[str, str]) -> list[str]:
return [ return [
*self._resolve_application_missing_base_fields(facts), *self._resolve_application_missing_base_fields(facts),
@@ -391,6 +406,10 @@ class UserAgentApplicationSlotMixin:
if re.fullmatch(r"(?:去|到|前往)?[\u4e00-\u9fa5]{1,8}出差(?P<days>\d+|[一二两三四五六七八九十]{1,3})?天?", text): if re.fullmatch(r"(?:去|到|前往)?[\u4e00-\u9fa5]{1,8}出差(?P<days>\d+|[一二两三四五六七八九十]{1,3})?天?", text):
return "" return ""
tokenized = strip_route_location_prefix_with_jieba(text)
if tokenized != text:
text = tokenized
else:
text = re.sub(r"^.*?(?:出差|前往|去|到|赴)[\u4e00-\u9fa5]{1,8}(?:出差)?(?P<days>\d+|[一二两三四五六七八九十]{1,3})?天?[,\s]*", "", text) text = re.sub(r"^.*?(?:出差|前往|去|到|赴)[\u4e00-\u9fa5]{1,8}(?:出差)?(?P<days>\d+|[一二两三四五六七八九十]{1,3})?天?[,\s]*", "", text)
text = re.sub(r"^(?:出差|申请|费用申请|业务|本次|去|到|前往)\s*", "", text) text = re.sub(r"^(?:出差|申请|费用申请|业务|本次|去|到|前往)\s*", "", text)
text = text.strip(" :,。;;") text = text.strip(" :,。;;")
@@ -537,8 +556,16 @@ class UserAgentApplicationSlotMixin:
step: str, step: str,
facts: dict[str, str], facts: dict[str, str],
) -> list[UserAgentSuggestedAction]: ) -> list[UserAgentSuggestedAction]:
if step == "ask_missing": if step in {"ask_missing", "ask_invalid"}:
missing_fields = self._resolve_application_missing_fields(facts) missing_fields = (
self._resolve_application_missing_fields(facts)
if step == "ask_missing"
else [
issue.get("field", "")
for issue in self._resolve_application_validation_issues(facts)
if issue.get("field")
]
)
return [ return [
UserAgentSuggestedAction( UserAgentSuggestedAction(
label="一次性补充申请信息", label="一次性补充申请信息",
@@ -1209,7 +1236,22 @@ class UserAgentApplicationMixin(UserAgentApplicationSlotMixin, UserAgentApplicat
"我已按「费用申请 / 事前审批」来处理这条内容。", "我已按「费用申请 / 事前审批」来处理这条内容。",
"已识别信息:\n" + recognized_table, "已识别信息:\n" + recognized_table,
f"当前还需要补充:{missing_text}", f"当前还需要补充:{missing_text}",
"请一次性补齐上述字段,我会继续生成申请核对结果并让你确认是否提交。", "请一次性补齐上述字段,我会继续生成申请核对结果,并请您确认是否提交。",
]
)
if step == "ask_invalid":
issue_messages = [
item["message"]
for item in self._resolve_application_validation_issues(facts)
if str(item.get("message") or "").strip()
]
return "\n\n".join(
[
"我已识别到申请信息里有需要先修正的字段。",
"已识别信息:\n" + recognized_table,
*issue_messages,
"请把地点改为真实出差地点,业务事项放在事由中;修正后我再帮您提交申请。",
] ]
) )
@@ -1473,7 +1515,7 @@ class UserAgentApplicationMixin(UserAgentApplicationSlotMixin, UserAgentApplicat
pick("applicationType", "application_type") pick("applicationType", "application_type")
), ),
"time": pick("time", "timeRange", "time_range"), "time": pick("time", "timeRange", "time_range"),
"location": pick("location"), "location": normalize_application_location(pick("location")),
"reason": reason, "reason": reason,
"days": pick("days"), "days": pick("days"),
"transport_mode": pick("transportMode", "transport_mode"), "transport_mode": pick("transportMode", "transport_mode"),
@@ -1507,6 +1549,8 @@ class UserAgentApplicationMixin(UserAgentApplicationSlotMixin, UserAgentApplicat
payload: UserAgentRequest, payload: UserAgentRequest,
facts: dict[str, str], facts: dict[str, str],
) -> str: ) -> str:
if self._resolve_application_validation_issues(facts):
return "ask_invalid"
if self._is_application_save_draft_action(payload): if self._is_application_save_draft_action(payload):
return "draft" return "draft"
if self._resolve_application_missing_base_fields(facts): if self._resolve_application_missing_base_fields(facts):
@@ -1516,4 +1560,3 @@ class UserAgentApplicationMixin(UserAgentApplicationSlotMixin, UserAgentApplicat
if self._is_application_submit_confirmation(payload): if self._is_application_submit_confirmation(payload):
return "submitted" return "submitted"
return "preview" return "preview"

View File

@@ -2,7 +2,6 @@ from __future__ import annotations
import re import re
DIRECT_MUNICIPALITY_DISPLAY = { DIRECT_MUNICIPALITY_DISPLAY = {
"北京": "北京市", "北京": "北京市",
"北京市": "北京市", "北京市": "北京市",
@@ -79,7 +78,7 @@ CITY_TO_PROVINCE = {
} }
LOCATION_NOISE_PATTERN = re.compile( LOCATION_NOISE_PATTERN = re.compile(
r"(?:出差|驻场|现场|支撑|支持|部署|上线|实施|拜访|验收|会议|采购|培训|协助|处理|办理|参加|进行).*$" r"(?:出差|驻场|现场|支撑|支持|辅助|部署|上线|实施|拜访|验收|会议|采购|培训|协助|处理|办理|参加|进行).*$"
) )

View File

@@ -716,7 +716,7 @@ class UserAgentKnowledgeMixin(UserAgentKnowledgeHelpersMixin):
self._append_markdown_section( self._append_markdown_section(
answer_lines, answer_lines,
"说明", "说明",
["- 请补充费用类型、适用地区、职级或具体业务场景,我继续帮缩小范围。"], ["- 请补充费用类型、适用地区、职级或具体业务场景,我继续帮缩小范围。"],
) )
return "\n".join(answer_lines).strip() return "\n".join(answer_lines).strip()
@@ -729,7 +729,7 @@ class UserAgentKnowledgeMixin(UserAgentKnowledgeHelpersMixin):
self._append_markdown_section( self._append_markdown_section(
answer_lines, answer_lines,
"说明", "说明",
["- 以上只使用当前命中的知识库证据;没有在证据中出现的适用条件或金额,我不会替默认补齐。"], ["- 以上只使用当前命中的知识库证据;没有在证据中出现的适用条件或金额,我不会替默认补齐。"],
) )
return "\n".join(answer_lines).strip() return "\n".join(answer_lines).strip()

View File

@@ -61,7 +61,7 @@ class UserAgentResponseMixin:
if payload.ontology.intent == "draft": if payload.ontology.intent == "draft":
tool_message = str(payload.tool_payload.get("message") or "").strip() tool_message = str(payload.tool_payload.get("message") or "").strip()
if payload.tool_payload.get("draft_limit_reached"): if payload.tool_payload.get("draft_limit_reached"):
return tool_message or "当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。" return tool_message or "当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。"
if tool_message and ( if tool_message and (
str(payload.tool_payload.get("claim_id") or "").strip() str(payload.tool_payload.get("claim_id") or "").strip()
or str(payload.tool_payload.get("claim_no") or "").strip() or str(payload.tool_payload.get("claim_no") or "").strip()
@@ -88,12 +88,12 @@ class UserAgentResponseMixin:
elif attachment_names: elif attachment_names:
attachment_hint = ( attachment_hint = (
f" 我已带入 {len(attachment_names)} 份附件名称,但目前还不能直接读取附件内容," f" 我已带入 {len(attachment_names)} 份附件名称,但目前还不能直接读取附件内容,"
"需要补充关键信息。" "需要补充关键信息。"
) )
return ( return (
"可以帮发起报销。请补充费用类型、发生时间、金额、事由和相关对象," "可以帮发起报销。请补充费用类型、发生时间、金额、事由和相关对象,"
"或者直接上传票据附件,我继续帮判断能否报、缺什么材料,并整理待核对信息。" "或者直接上传票据附件,我继续帮判断能否报销、还缺哪些材料,并整理待核对信息。"
f"{attachment_hint}" f"{attachment_hint}"
) )
@@ -122,8 +122,8 @@ class UserAgentResponseMixin:
return ( return (
f"已识别到一笔{time_text}{expense_type}支出{amount_hint}" f"已识别到一笔{time_text}{expense_type}支出{amount_hint}"
"如果要继续整理报销核对信息,还需要补充客户单位、参与人员、费用明细和票据附件。" "如果要继续整理报销核对信息,还需要补充客户单位、参与人员、费用明细和票据附件。"
"也可以继续上传发票或图片,我会把这些信息带入后续对话。" "也可以继续上传发票或图片,我会把这些信息带入后续对话。"
) )
@@ -347,7 +347,7 @@ class UserAgentResponseMixin:
query_payload = self._build_query_payload(payload) query_payload = self._build_query_payload(payload)
scope_label = str(data.get("scope_label") or subject).strip() or subject scope_label = str(data.get("scope_label") or subject).strip() or subject
if query_payload is None: if query_payload is None:
return f"当前没有查到{scope_label}可以补充时间范围、单号或状态继续筛选。" return f"当前没有查到{scope_label}可以补充时间范围、单号或状态继续筛选。"
window_prefix = ( window_prefix = (
f"{query_payload.window_start_date}{query_payload.window_end_date}" f"{query_payload.window_start_date}{query_payload.window_end_date}"
@@ -367,10 +367,10 @@ class UserAgentResponseMixin:
f"另有 {query_payload.older_record_count} 笔超过 {query_payload.window_days} 日的单据," f"另有 {query_payload.older_record_count} 笔超过 {query_payload.window_days} 日的单据,"
"请前往个人报销中心查看。" "请前往个人报销中心查看。"
) )
return f"{window_prefix}没有查到{query_payload.scope_label}可以补充时间范围、单号或状态继续筛选。" return f"{window_prefix}没有查到{query_payload.scope_label}可以补充时间范围、单号或状态继续筛选。"
answer_parts = [ answer_parts = [
f"已按的筛选条件查询{query_payload.scope_label}", f"已按的筛选条件查询{query_payload.scope_label}",
f"下面先列出最近 {query_payload.preview_count} 条记录,点击任一单据即可查看详情。", f"下面先列出最近 {query_payload.preview_count} 条记录,点击任一单据即可查看详情。",
f"本次共命中 {query_payload.record_count} 笔,金额合计 {query_payload.total_amount:.2f} 元。", f"本次共命中 {query_payload.record_count} 笔,金额合计 {query_payload.total_amount:.2f} 元。",
] ]

View File

@@ -68,8 +68,8 @@ class UserAgentReviewCoreMixin:
if has_time: if has_time:
context_hint += ",并看到了业务发生时间" context_hint += ",并看到了业务发生时间"
return ( return (
f"{context_hint}。但还没有明确这笔单据属于哪类报销。" f"{context_hint}。但还没有明确这笔单据属于哪类报销。"
"请先在下面选择报销场景,我会按选择的场景继续识别时间、地点、事由、金额和所需票据," "请先在下面选择报销场景,我会按选择的场景继续识别时间、地点、事由、金额和所需票据,"
"避免系统先入为主把项目支持、部署等描述误判成差旅。" "避免系统先入为主把项目支持、部署等描述误判成差旅。"
) )

View File

@@ -164,7 +164,7 @@ class UserAgentReviewMessageMixin:
if payload.tool_payload.get("draft_limit_reached"): if payload.tool_payload.get("draft_limit_reached"):
return ( return (
str(payload.tool_payload.get("message") or "").strip() str(payload.tool_payload.get("message") or "").strip()
or "当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。" or "当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。"
) )
review_action = str(payload.context_json.get("review_action") or "").strip() review_action = str(payload.context_json.get("review_action") or "").strip()
@@ -254,11 +254,11 @@ class UserAgentReviewMessageMixin:
if claim_no: if claim_no:
return ( return (
f"已识别出本次上传的 {document_count} 张票据。" f"已识别出本次上传的 {document_count} 张票据。"
f"系统检测到已有草稿 {claim_no},请选择关联到该草稿,或单独建立一张新的报销单。" f"系统检测到已有草稿 {claim_no},请选择关联到该草稿,或单独新建一张报销单。"
) )
return ( return (
f"已识别出本次上传的 {document_count} 张票据。" f"已识别出本次上传的 {document_count} 张票据。"
"系统检测到已有可用草稿,请先选择关联到现有草稿,或单独建立一张新的报销单。" "系统检测到已有可用草稿,请先选择关联到现有草稿,或单独新建一张报销单。"
) )
blocked_reasons = self._resolve_submission_blocked_reasons(payload) blocked_reasons = self._resolve_submission_blocked_reasons(payload)

View File

@@ -668,7 +668,7 @@ class UserAgentReviewSlotMixin:
status="missing" if is_missing else "identified" if source in {"user_text", "user_form"} else "inferred", status="missing" if is_missing else "identified" if source in {"user_text", "user_form"} else "inferred",
hint=f"建议补充 {SLOT_LABELS.get(key, key)}" hint=f"建议补充 {SLOT_LABELS.get(key, key)}"
if is_missing and required if is_missing and required
else ("该字段来自系统辅助上下文,建议再核对一次。" if source in {"detail_context", "ocr"} else ""), else ("该字段来自系统辅助上下文,建议再核对一次。" if source in {"detail_context", "ocr"} else ""),
evidence=evidence, evidence=evidence,
) )

View File

@@ -888,6 +888,34 @@ def test_travel_reimbursement_calculator_rejects_unrecognized_location() -> None
) )
# Checks that the travel reimbursement calculator still resolves the city when
# the submitted location string also carries business content after the city
# name ("上海辅助国网仿生产服务器").
def test_travel_reimbursement_calculator_normalizes_location_mixed_with_business_content() -> None:
with build_session() as db:
# Seed an employee whose email equals the username of the caller built below.
db.add(
Employee(
employee_no="E9004",
name="混合地点员工",
email="mixed-location@example.com",
position="产品经理",
grade="P4",
)
)
db.commit()
# Run a 4-day calculation with the mixed location text as the only location input.
result = TravelReimbursementCalculatorService(db).calculate(
TravelReimbursementCalculatorRequest(days=4, location="上海辅助国网仿生产服务器"),
CurrentUserContext(
username="mixed-location@example.com",
name="混合地点员工",
role_codes=[],
is_admin=False,
),
)
# The response must report the normalised location and matched city, and a
# positive hotel amount.
assert result.location == "上海市"
assert result.matched_city == "上海"
assert result.hotel_amount > 0
def test_agent_run_service_lists_seeded_trace_data() -> None: def test_agent_run_service_lists_seeded_trace_data() -> None:
with build_session() as db: with build_session() as db:
service = AgentRunService(db) service = AgentRunService(db)

View File

@@ -84,6 +84,33 @@ def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice
assert any(field.label == "金额" and field.value == "354元" for field in insight.fields) assert any(field.label == "金额" and field.value == "354元" for field in insight.fields)
def test_document_intelligence_train_ticket_uses_railway_merchant_not_invoice_title() -> None:
    """Railway e-ticket OCR text must yield a train ticket whose merchant is 中国铁路.

    Feeds ``build_document_insight`` the OCR lines of a railway electronic
    ticket invoice and checks the detected document type plus the merchant,
    amount and departure-time fields.
    """
    ocr_lines = [
        "电子发票(铁路电子客票)",
        "发票号码:26429165800002785705 湖北",
        "开票日期:2026年05月18日",
        "武汉站 G458 上海虹桥站",
        "Wuhan Shanghaihongqiao",
        "2026年02月20日 07:55开 06车01B号 二等座",
        "票价:¥354.00",
        "4201061987****1615 曹笑竹",
        "电子客票号:6580061086021391007342026",
        "购买方名称:曹笑竹 统一社会信用代码:",
        "买票请到12306 发货请到95306",
        "中国铁路祝您旅途愉快",
    ]
    insight = build_document_insight(
        filename="2月20_武汉-上海.pdf",
        summary="电子发票(铁路电子客票);发票监;统一 制",
        text="\n".join(ocr_lines),
    )

    assert insight.document_type == "train_ticket"

    # Index the extracted fields by label so each expectation is a plain lookup.
    value_by_label = {item.label: item.value for item in insight.fields}
    expected_values = {
        "商户": "中国铁路",
        "金额": "354元",
        "列车出发时间": "2026-02-20 07:55",
    }
    for expected_label, expected_value in expected_values.items():
        assert value_by_label[expected_label] == expected_value
def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None: def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None:
insight = build_document_insight( insight = build_document_insight(
filename="2月20_武汉-上海.pdf", filename="2月20_武汉-上海.pdf",

View File

@@ -28,6 +28,7 @@ from app.schemas.reimbursement import (
) )
from app.services.agent_conversations import AgentConversationService from app.services.agent_conversations import AgentConversationService
from app.services.budget import BudgetService from app.services.budget import BudgetService
from app.services.document_preview import DocumentPreviewAssets
from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage
from app.services.expense_claim_budget_flow import ExpenseClaimBudgetFlowMixin from app.services.expense_claim_budget_flow import ExpenseClaimBudgetFlowMixin
from app.services.expense_claim_workflow_constants import ( from app.services.expense_claim_workflow_constants import (
@@ -3314,6 +3315,68 @@ def test_attachment_preview_resolves_legacy_filename_in_claim_item_directory(mon
assert filename == "legacy-ticket.pdf" assert filename == "legacy-ticket.pdf"
# Checks that when re-rendering a PDF preview raises, the preview endpoint
# serves the original PDF and the stored attachment metadata is rewritten to
# point at that PDF instead of the image preview.
def test_attachment_pdf_preview_falls_back_to_source_when_render_fonts_missing(monkeypatch, tmp_path) -> None:
current_user = CurrentUserContext(
username="emp-1",
name="张三",
role_codes=[],
is_admin=False,
)
# Point attachment storage at the pytest temporary directory.
monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path)
with build_session() as db:
claim = build_claim(expense_type="train", location="上海")
db.add(claim)
db.commit()
# Lay out <root>/<claim id>/<item id>/ with a fake PDF and an existing
# preview PNG containing placeholder bytes.
attachment_dir = tmp_path / claim.id / claim.items[0].id
attachment_dir.mkdir(parents=True)
file_path = attachment_dir / "2月20_武汉-上海.pdf"
preview_path = attachment_dir / "2月20_武汉-上海.preview.png"
file_path.write_bytes(b"%PDF-1.7 fake")
preview_path.write_bytes(b"broken-preview")
claim.items[0].invoice_id = f"{claim.id}/{claim.items[0].id}/{file_path.name}"
db.commit()
# Record metadata that describes the PNG as the current image preview.
storage = ExpenseClaimAttachmentStorage()
storage.write_meta(
file_path,
{
"file_name": file_path.name,
"storage_key": storage.to_storage_key(file_path),
"media_type": "application/pdf",
"previewable": True,
"preview_kind": "image",
"preview_storage_key": storage.to_storage_key(preview_path),
"preview_media_type": "image/png",
"preview_file_name": preview_path.name,
"preview_rendered_with": "pdftoppm-png-r160-poppler-data",
},
)
# Make every PDF render attempt fail with a missing-language-pack error.
def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
raise RuntimeError("Missing language pack for 'Adobe-GB1' mapping")
monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page)
resolved_path, media_type, filename = ExpenseClaimService(db).get_claim_item_attachment_preview_content(
claim_id=claim.id,
item_id=claim.items[0].id,
current_user=current_user,
)
# The service must hand back the source PDF itself.
assert resolved_path == file_path
assert media_type == "application/pdf"
assert filename == file_path.name
# The persisted metadata must now describe a PDF preview backed by the source
# file, with the renderer marker cleared.
refreshed_meta = storage.read_meta(file_path)
assert refreshed_meta["preview_kind"] == "pdf"
assert refreshed_meta["preview_storage_key"] == storage.to_storage_key(file_path)
assert refreshed_meta["preview_media_type"] == "application/pdf"
assert refreshed_meta["preview_file_name"] == file_path.name
assert refreshed_meta["preview_rendered_with"] == ""
def test_submit_claim_runs_ai_review_and_routes_to_direct_manager() -> None: def test_submit_claim_runs_ai_review_and_routes_to_direct_manager() -> None:
current_user = CurrentUserContext( current_user = CurrentUserContext(
username="emp-submit@example.com", username="emp-submit@example.com",
@@ -5199,6 +5262,103 @@ def test_admin_delete_claim_unlinks_receipt_folder_items(monkeypatch, tmp_path)
get_settings.cache_clear() get_settings.cache_clear()
def test_admin_delete_linked_reimbursement_resets_application_link_status() -> None:
    """Admin deletion of a linked reimbursement draft must unlink its application.

    Seeds an approved travel application whose approval flag references a
    generated reimbursement draft, deletes that draft as an admin, and checks
    that the application keeps its status and stage, no longer carries the
    ``generated_draft_*`` keys, and has gained an
    ``expense_application_reimbursement_deleted`` trace flag.
    """
    application_id = "application-delete-linked-reimbursement"
    application_no = "APP-DEL-LINKED-APPLICATION"
    reimbursement_id = "reimbursement-delete-linked-application"
    reimbursement_no = "RDELETE01"

    def flag_for(claim, event_type):
        # First dict-shaped risk flag carrying the requested event type.
        return next(
            entry
            for entry in claim.risk_flags_json
            if isinstance(entry, dict) and entry.get("event_type") == event_type
        )

    admin_user = CurrentUserContext(
        username="superadmin",
        name="系统管理员",
        role_codes=["admin"],
        is_admin=True,
    )

    with build_session() as db:
        approval_trace = {
            "source": "manual_approval",
            "event_type": "expense_application_approval",
            "operator": "向万红",
            "previous_approval_stage": DIRECT_MANAGER_APPROVAL_STAGE,
            "next_status": "approved",
            "next_approval_stage": APPLICATION_LINK_STATUS_STAGE,
            "generated_draft_claim_id": reimbursement_id,
            "generated_draft_claim_no": reimbursement_no,
            "created_at": "2026-06-21T22:45:00+00:00",
        }
        application_claim = ExpenseClaim(
            id=application_id,
            claim_no=application_no,
            employee_name="张三",
            department_name="交付部",
            project_code="PRJ-A",
            expense_type="travel_application",
            reason="支撑国网仿生产环境部署",
            location="上海",
            amount=Decimal("3000.00"),
            currency="CNY",
            invoice_count=0,
            occurred_at=datetime(2026, 6, 21, 22, 30, tzinfo=UTC),
            submitted_at=datetime(2026, 6, 21, 22, 35, tzinfo=UTC),
            status="approved",
            approval_stage=APPLICATION_LINK_STATUS_STAGE,
            risk_flags_json=[approval_trace],
        )

        handoff_trace = {
            "source": "application_handoff",
            "event_type": "expense_application_to_reimbursement_draft",
            "application_claim_id": application_id,
            "application_claim_no": application_no,
        }
        reimbursement_claim = ExpenseClaim(
            id=reimbursement_id,
            claim_no=reimbursement_no,
            employee_name="张三",
            department_name="交付部",
            project_code="PRJ-A",
            expense_type="travel",
            reason="支撑国网仿生产环境部署报销",
            location="上海",
            amount=Decimal("3000.00"),
            currency="CNY",
            invoice_count=1,
            occurred_at=datetime(2026, 6, 21, 22, 30, tzinfo=UTC),
            submitted_at=None,
            status="draft",
            approval_stage="待提交",
            risk_flags_json=[handoff_trace],
        )

        db.add_all([application_claim, reimbursement_claim])
        db.commit()

        deleted = ExpenseClaimService(db).delete_claim(reimbursement_claim.id, admin_user)

        # The reimbursement draft itself is gone.
        assert deleted is not None
        assert deleted.claim_no == reimbursement_no
        assert db.get(ExpenseClaim, reimbursement_claim.id) is None

        # The application stays approved and keeps its link-status stage.
        db.refresh(application_claim)
        assert application_claim.status == "approved"
        assert application_claim.approval_stage == APPLICATION_LINK_STATUS_STAGE

        # The approval flag no longer points at the deleted draft.
        approval_flag = flag_for(application_claim, "expense_application_approval")
        assert "generated_draft_claim_id" not in approval_flag
        assert "generated_draft_claim_no" not in approval_flag

        # A system trace records which reimbursement was removed.
        sync_flag = flag_for(application_claim, "expense_application_reimbursement_deleted")
        assert sync_flag["source"] == "application_link_sync"
        assert sync_flag["severity"] == "info"
        assert sync_flag["actionability"] == "system_trace"
        assert sync_flag["deleted_reimbursement_claim_id"] == reimbursement_id
        assert sync_flag["deleted_reimbursement_claim_no"] == reimbursement_no
        assert sync_flag["next_approval_stage"] == APPLICATION_LINK_STATUS_STAGE


def test_direct_manager_can_return_subordinate_claim_to_pending_submission() -> None: def test_direct_manager_can_return_subordinate_claim_to_pending_submission() -> None:
current_user = CurrentUserContext( current_user = CurrentUserContext(
username="manager-return@example.com", username="manager-return@example.com",

View File

@@ -85,6 +85,31 @@ def test_notification_state_service_persists_user_scoped_read_and_hidden_state()
assert other_saved.states[0].hidden_at is None assert other_saved.states[0].hidden_at is None
def test_notification_state_storage_ready_runs_once_per_database_bind(monkeypatch) -> None:
    """Repeated service calls must trigger ``Base.metadata.create_all`` only once.

    ``create_all`` is wrapped with a recorder; two reads and one patch through
    the same service are then expected to leave exactly one recorded call.
    """
    with build_session() as db:
        service = NotificationStateService(db)
        user = CurrentUserContext(username="alice", name="Alice", role_codes=[], is_admin=False)

        recorded_binds: list[object] = []
        real_create_all = Base.metadata.create_all

        def recording_create_all(*args, **kwargs):
            # Remember the bind that was requested, then delegate to the real implementation.
            recorded_binds.append(kwargs.get("bind"))
            return real_create_all(*args, **kwargs)

        monkeypatch.setattr(Base.metadata, "create_all", recording_create_all)

        for _ in range(2):
            service.list_states(user)

        read_patch = NotificationStatePatch(notification_id="workbench:todo:EXP-002", read=True)
        service.patch_states(NotificationStateBatchPatch(states=[read_patch]), user)

        assert len(recorded_binds) == 1


def test_notification_state_endpoint_reads_and_updates_current_user_state() -> None: def test_notification_state_endpoint_reads_and_updates_current_user_state() -> None:
client = build_client() client = build_client()
headers = {"x-auth-username": "alice", "x-auth-name": "Alice"} headers = {"x-auth-username": "alice", "x-auth-name": "Alice"}

View File

@@ -5,19 +5,23 @@ import subprocess
from pathlib import Path from pathlib import Path
from app.core.config import get_settings from app.core.config import get_settings
from app.services import document_preview
from app.services.ocr import OcrService from app.services.ocr import OcrService
def test_ocr_runtime_installers_include_poppler_cjk_data() -> None: def test_ocr_runtime_installers_include_cjk_safe_pdf_rendering_tools() -> None:
repo_root = Path(__file__).resolve().parents[2] repo_root = Path(__file__).resolve().parents[2]
dependency_sources = [ dependency_sources = [
repo_root / "docker-compose.yml", repo_root / "docker-compose.yml",
repo_root / "docker-compose.full.yml",
repo_root / "server" / "scripts" / "bootstrap_paddleocr_mobile.sh", repo_root / "server" / "scripts" / "bootstrap_paddleocr_mobile.sh",
repo_root / "server" / "scripts" / "bootstrap_paddleocr_gpu.sh", repo_root / "server" / "scripts" / "bootstrap_paddleocr_gpu.sh",
] ]
for path in dependency_sources: for path in dependency_sources:
assert "poppler-data" in path.read_text(encoding="utf-8") content = path.read_text(encoding="utf-8")
assert "poppler-data" in content
assert "mupdf-tools" in content
def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings( def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings(
@@ -163,6 +167,7 @@ def test_ocr_service_passes_configured_device_to_worker(
text: bool, text: bool,
timeout: int, timeout: int,
check: bool, check: bool,
env: dict[str, str] | None = None,
) -> subprocess.CompletedProcess[str]: ) -> subprocess.CompletedProcess[str]:
captured_commands.append(command) captured_commands.append(command)
return subprocess.CompletedProcess( return subprocess.CompletedProcess(
@@ -194,12 +199,12 @@ def test_ocr_service_converts_pdf_to_images_and_returns_image_preview(
monkeypatch, monkeypatch,
tmp_path: Path, tmp_path: Path,
) -> None: ) -> None:
def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]: def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
first = output_dir / "page-1.png" first = output_dir / "page-1.png"
second = output_dir / "page-2.png" second = output_dir / "page-2.png"
first.write_bytes(b"fake-page-1") first.write_bytes(b"fake-page-1")
second.write_bytes(b"fake-page-2") second.write_bytes(b"fake-page-2")
return [first, second] return [first, second], True
def fake_invoke_worker( def fake_invoke_worker(
self, self,
@@ -281,26 +286,143 @@ def test_ocr_service_converts_pdf_to_images_and_returns_image_preview(
assert recognized.lines[1].page_index == 1 assert recognized.lines[1].page_index == 1
def test_ocr_service_uses_pdf_text_layer_without_worker_runtime( def test_ocr_service_rejects_pdf_ocr_when_rendered_image_fonts_are_broken(
monkeypatch, monkeypatch,
tmp_path: Path, tmp_path: Path,
) -> None: ) -> None:
def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]: def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
page = output_dir / "page-1.png" raise RuntimeError("PDF 转图片失败:检测到中文字体映射缺失,未生成可 OCR 的图片。")
page.write_bytes(b"fake-rendered-page")
return [page]
def fail_resolve_python(self) -> str: def fake_invoke_worker(
raise AssertionError("PDF 文本层可用时不应强制解析 OCR worker。") self,
*,
def fail_invoke_worker(self, **kwargs) -> dict: python_bin: str,
raise AssertionError("PDF 文本层可用时不应调用 OCR worker。") worker_path: str,
input_paths: list[Path],
) -> dict:
raise AssertionError("PDF 转图片已确认丢中文时,不应继续调用 OCR worker。")
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
monkeypatch.setattr(OcrService, "_resolve_python_bin", fail_resolve_python) monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py") monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images) monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
monkeypatch.setattr(OcrService, "_invoke_worker", fail_invoke_worker) monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
get_settings.cache_clear()
try:
result = OcrService().recognize_files(
[
("2月20_武汉-上海.pdf", b"%PDF-1.7 fake", "application/pdf"),
]
)
finally:
get_settings.cache_clear()
failed = result.documents[0]
assert failed.line_count == 0
assert failed.preview_kind == ""
assert failed.preview_data_url == ""
assert failed.warnings == ["PDF 转图片失败:检测到中文字体映射缺失,未生成可 OCR 的图片。"]
def test_ocr_pdf_conversion_tries_next_renderer_when_poppler_font_mapping_fails(
    monkeypatch,
    tmp_path: Path,
) -> None:
    """A poppler font-mapping warning must make the converter try the next renderer.

    ``subprocess.run`` is replaced by a fake: ``pdftoppm`` "succeeds" but reports
    a missing ``Adobe-GB1`` language pack on stderr, while any other tool renders
    cleanly. The conversion is expected to invoke ``pdftoppm`` and then
    ``mutool``, and to report a usable preview.
    """
    output_dir = tmp_path / "pages"
    output_dir.mkdir()
    rendered_page = output_dir / "page-1.png"
    invoked_tools: list[str] = []

    def fake_run(
        command: list[str],
        *,
        capture_output: bool,
        text: bool,
        timeout: int,
        check: bool,
    ) -> subprocess.CompletedProcess[str]:
        tool_name = Path(command[0]).name
        invoked_tools.append(tool_name)
        if tool_name == "pdftoppm":
            # Poppler exits 0 but complains about the missing CJK mapping.
            page_bytes = b"broken-preview"
            stderr_text = "Syntax Error: Missing language pack for 'Adobe-GB1' mapping"
        else:
            page_bytes = b"rendered-with-chinese"
            stderr_text = ""
        rendered_page.write_bytes(page_bytes)
        return subprocess.CompletedProcess(
            args=command,
            returncode=0,
            stdout="",
            stderr=stderr_text,
        )

    def fake_which(name):
        # Only the two PDF renderers are reported as installed.
        return f"/usr/bin/{name}" if name in {"pdftoppm", "mutool"} else None

    monkeypatch.setattr(document_preview.shutil, "which", fake_which)
    monkeypatch.setattr(subprocess, "run", fake_run)

    pages, preview_usable = OcrService()._convert_pdf_to_images(
        pdf_path=tmp_path / "ticket.pdf",
        output_dir=output_dir,
    )

    assert pages == [output_dir / "page-1.png"]
    assert preview_usable is True
    assert invoked_tools == ["pdftoppm", "mutool"]


def test_ocr_service_invokes_worker_even_when_pdf_text_layer_is_usable(
monkeypatch,
tmp_path: Path,
) -> None:
calls = {"worker": 0}
def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
page = output_dir / "page-1.png"
page.write_bytes(b"fake-rendered-page")
return [page], True
def fake_invoke_worker(
self,
*,
python_bin: str,
worker_path: str,
input_paths: list[Path],
) -> dict:
calls["worker"] += 1
return {
"engine": "paddleocr_mobile",
"model": "PP-OCRv5_mobile",
"documents": [
{
"input_path": str(input_paths[0]),
"engine": "paddleocr_mobile",
"model": "PP-OCRv5_mobile",
"text": "电子发票(铁路电子客票) 武汉站 上海虹桥站 G458 票价 ¥354.00",
"summary": "铁路电子客票",
"avg_score": 0.95,
"line_count": 1,
"page_count": 1,
"warnings": [],
"lines": [
{
"text": "电子发票(铁路电子客票) 武汉站 上海虹桥站 G458 票价 ¥354.00",
"score": 0.95,
"box": [[1, 2], [10, 2], [10, 8], [1, 8]],
}
],
}
],
}
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python")
monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py")
monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images)
monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker)
monkeypatch.setattr( monkeypatch.setattr(
OcrService, OcrService,
"_extract_pdf_text_layer", "_extract_pdf_text_layer",
@@ -326,9 +448,9 @@ def test_ocr_service_uses_pdf_text_layer_without_worker_runtime(
recognized = result.documents[0] recognized = result.documents[0]
assert result.success_count == 1 assert result.success_count == 1
assert calls["worker"] == 1
assert recognized.document_type == "train_ticket" assert recognized.document_type == "train_ticket"
assert "电子发票(铁路电子客票)" in recognized.text assert "电子发票(铁路电子客票)" in recognized.text
assert "电子客票号:6580061086021391007342026" in recognized.text
assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields) assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields)
assert recognized.preview_kind == "image" assert recognized.preview_kind == "image"
assert recognized.preview_data_url.startswith("data:image/png;base64,") assert recognized.preview_data_url.startswith("data:image/png;base64,")
@@ -392,14 +514,22 @@ def test_ocr_service_reuses_cached_document_for_same_content(
assert second.documents[0].summary == first.documents[0].summary assert second.documents[0].summary == first.documents[0].summary
def test_ocr_cache_key_includes_pdf_render_pipeline_version() -> None:
    """The OCR cache key must embed the PDF render pipeline markers."""
    cache_key = OcrService()._build_cache_key(b"same-pdf-content")

    required_fragments = (
        "pdf-image-ocr:",
        document_preview.DocumentPreviewAssets.PDF_RENDERER_ID,
        "no-pdf-direct",
    )
    for fragment in required_fragments:
        assert fragment in cache_key


def test_ocr_service_prefers_pdf_text_layer_when_rendered_ocr_is_placeholder_heavy( def test_ocr_service_prefers_pdf_text_layer_when_rendered_ocr_is_placeholder_heavy(
monkeypatch, monkeypatch,
tmp_path: Path, tmp_path: Path,
) -> None: ) -> None:
def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]: def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]:
page = output_dir / "page-1.png" page = output_dir / "page-1.png"
page.write_bytes(b"fake-page") page.write_bytes(b"fake-page")
return [page] return [page], True
def fake_invoke_worker( def fake_invoke_worker(
self, self,

View File

@@ -4,7 +4,7 @@ import base64
from app.api.deps import CurrentUserContext from app.api.deps import CurrentUserContext
from app.core.config import get_settings from app.core.config import get_settings
from app.schemas.ocr import OcrRecognizeDocumentRead from app.schemas.ocr import OcrRecognizeDocumentRead, OcrRecognizeFieldRead
from app.services.document_preview import DocumentPreviewAssets from app.services.document_preview import DocumentPreviewAssets
from app.services.receipt_folder import ReceiptFolderService from app.services.receipt_folder import ReceiptFolderService
@@ -72,6 +72,55 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke
get_settings.cache_clear() get_settings.cache_clear()
def test_receipt_folder_pdf_save_eagerly_renders_image_preview(monkeypatch, tmp_path) -> None:
    """Saving a PDF receipt must render and register an image preview immediately.

    The PDF renderer is stubbed to write fixed bytes; the test then checks the
    preview file, the stored preview metadata and what ``resolve_preview``
    returns for the saved receipt.
    """
    pdf_name = "2月20_武汉-上海.pdf"
    pdf_media_type = "application/pdf"

    def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
        preview_path.write_bytes(b"rendered-preview")
        return preview_path

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page)

        service = ReceiptFolderService()
        recognized = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type=pdf_media_type,
            text="铁路电子客票 武汉 上海虹桥 354.00",
            summary="铁路电子客票,武汉至上海虹桥。",
        )
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.4 fake",
            media_type=pdf_media_type,
            current_user=current_user,
            document=recognized,
        )

        receipt_dir = next(service.root.glob("pytest/*"))
        preview_path = receipt_dir / "preview.png"
        meta = service._read_meta(receipt_dir)

        # The preview was rendered at save time and recorded in the metadata.
        assert receipt.preview_kind == "image"
        assert preview_path.read_bytes() == b"rendered-preview"
        assert meta["preview_file_name"] == "preview.png"
        assert meta["preview_media_type"] == "image/png"
        assert meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID

        resolved = service.resolve_preview(receipt.id, current_user)
        resolved_path, media_type, file_name = resolved
        assert resolved_path == preview_path
        assert media_type == "image/png"
        assert file_name == "preview.png"
    finally:
        get_settings.cache_clear()


def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None: def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear() get_settings.cache_clear()
@@ -123,6 +172,213 @@ def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch,
get_settings.cache_clear() get_settings.cache_clear()
def test_receipt_folder_pdf_preview_falls_back_to_source_when_render_fonts_missing(
    monkeypatch,
    tmp_path,
) -> None:
    """A failing re-render must make the preview fall back to the source PDF.

    The receipt is saved with an image preview, its metadata is rewritten to an
    older renderer id, and the renderer is then stubbed to raise a font-mapping
    error. ``resolve_preview`` is expected to return the original PDF and to
    rewrite the preview metadata accordingly.
    """
    pdf_name = "2月20_武汉-上海.pdf"
    pdf_media_type = "application/pdf"

    def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds):
        raise RuntimeError("Missing language pack for 'Adobe-GB1' mapping")

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        stale_preview = b"broken-preview"
        encoded_preview = base64.b64encode(stale_preview).decode('ascii')
        preview_data_url = f"data:image/png;base64,{encoded_preview}"

        service = ReceiptFolderService()
        recognized = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type=pdf_media_type,
            preview_kind="image",
            preview_data_url=preview_data_url,
        )
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.7 fake",
            media_type=pdf_media_type,
            current_user=current_user,
            document=recognized,
        )

        # Rewrite the stored renderer id to an older value before resolving the preview.
        receipt_dir = next(service.root.glob("pytest/*"))
        meta = service._read_meta(receipt_dir)
        meta["preview_rendered_with"] = "pdftoppm-png-r160-poppler-data"
        service._write_meta(receipt_dir, meta)

        # The renderer is only broken after the receipt has been saved.
        monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page)

        resolved = service.resolve_preview(receipt.id, current_user)
        resolved_path, media_type, file_name = resolved
        assert resolved_path == receipt_dir / "2月20_武汉-上海.pdf"
        assert media_type == "application/pdf"
        assert file_name == "2月20_武汉-上海.pdf"

        refreshed_meta = service._read_meta(receipt_dir)
        assert refreshed_meta["preview_kind"] == "pdf"
        assert refreshed_meta["preview_file_name"] == "2月20_武汉-上海.pdf"
        assert refreshed_meta["preview_media_type"] == "application/pdf"
        assert refreshed_meta["preview_rendered_with"] == ""
    finally:
        get_settings.cache_clear()


def test_receipt_folder_train_ticket_extracts_passenger_from_id_line_and_purchase_name(
    monkeypatch,
    tmp_path,
) -> None:
    """Train-ticket receipts must expose passenger and journey fields from OCR text.

    A railway e-ticket is saved with partial OCR fields; the receipt detail is
    then expected to list the merchant, passenger, stations, ID number, ticket
    number, dates, carriage and seat shown in the expectations table below.
    """
    pdf_name = "2月20_武汉-上海.pdf"
    pdf_media_type = "application/pdf"

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        service = ReceiptFolderService()

        ocr_text = (
            "电子发票(铁路电子客票)\n"
            "发票号码:26429165800002785705 湖北\n"
            "开票日期:2026年05月18日\n"
            "武汉站 G458 上海虹桥站\n"
            "Wuhan Shanghaihongqiao\n"
            "2026年02月20日 07:55开 06车01B号 二等座\n"
            "票价:¥354.00\n"
            "4201061987****1615 曹笑竹\n"
            "电子客票号:6580061086021391007342026\n"
            "购买方名称:曹笑竹 统一社会信用代码:\n"
            "买票请到12306 发货请到95306\n"
            "中国铁路祝您旅途愉快"
        )
        # (key, label, value) triples as delivered by the OCR stage.
        ocr_field_rows = (
            ("merchant_name", "商户", "电子发票(铁路"),
            ("amount", "金额", "354元"),
            ("date", "列车出发时间", "2026-02-20 07:55"),
            ("trip_no", "车次", "G458"),
            ("route", "行程", "武汉-上海"),
        )
        recognized = OcrRecognizeDocumentRead(
            filename=pdf_name,
            media_type=pdf_media_type,
            text=ocr_text,
            summary="电子发票(铁路电子客票);发票监;统一 制",
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
            document_fields=[
                OcrRecognizeFieldRead(key=key, label=label, value=value)
                for key, label, value in ocr_field_rows
            ],
        )
        receipt = service.save_receipt(
            filename=pdf_name,
            content=b"%PDF-1.4 fake",
            media_type=pdf_media_type,
            current_user=current_user,
            document=recognized,
        )

        assert receipt.merchant_name == "中国铁路"

        detail = service.get_receipt(receipt.id, current_user)
        fields = {field.label: field.value for field in detail.fields}
        expected_fields = {
            "商户": "中国铁路",
            "乘车人": "曹笑竹",
            "出发地点": "武汉",
            "到达地点": "上海虹桥",
            "身份证号": "4201061987****1615",
            "电子客票号": "6580061086021391007342026",
            "开票日期": "2026-05-18",
            "列车出发时间": "2026-02-20 07:55",
            "车厢": "06车",
            "座位号": "01B",
        }
        for label, expected_value in expected_fields.items():
            assert fields[label] == expected_value
    finally:
        get_settings.cache_clear()


def test_receipt_folder_train_ticket_repairs_invalid_generated_fields_from_ocr_text(
    monkeypatch,
    tmp_path,
) -> None:
    """Bogus generated ticket fields must be corrected from the raw OCR text.

    The incoming document carries station and passenger fields holding
    unrelated OCR fragments; the receipt detail is expected to show the values
    that appear in the OCR text instead.
    """
    image_name = "2月21日_上海-深圳.png"
    image_media_type = "image/png"

    monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
    get_settings.cache_clear()
    try:
        current_user = CurrentUserContext(
            username="pytest",
            name="Py Test",
            role_codes=[],
            is_admin=False,
        )
        service = ReceiptFolderService()

        ocr_text = (
            "行程单示意\n"
            "出票渠道:示例平台\n"
            "非官方车票\n"
            "不可报销\n"
            "仅供演示\n"
            "创建日期2026年02月15日\n"
            "订单号DEMO202602210001\n"
            "单据编号DEMO-IT-000001\n"
            "上海虹桥\n"
            "G999\n"
            "深圳北\n"
            "\n"
            "\n"
            "Shanghaihongqiao\n"
            "Shenzhenbei\n"
            "2026年02月21日\n"
            "08:30出发\n"
            "全程约7小时30分\n"
            "15:00到达\n"
            "DEMO\n"
            "乘客:示例旅客\n"
            "车厢05车\n"
            "席别:二等座\n"
            "-\n"
            "扫码无效\n"
            "证件号310101199001010000\n"
            "座位08A\n"
            "票价¥438.00\n"
            "仅为演示"
        )
        # (key, label, value) triples; the last three hold deliberately wrong values.
        ocr_field_rows = (
            ("amount", "金额", "438元"),
            ("date", "列车出发时间", "2026-02-21 08:30"),
            ("invoice_number", "票据号码", "DEMO202602210001"),
            ("trip_no", "车次", "G999"),
            ("route", "行程", "上海-深圳"),
            ("departure_station", "出发地点", "二等座"),
            ("arrival_station", "到达地点", "扫码无效"),
            ("passenger_name", "乘车人", "席别二等座"),
        )
        recognized = OcrRecognizeDocumentRead(
            filename=image_name,
            media_type=image_media_type,
            text=ocr_text,
            summary="行程单示意;出票渠道:示例平台;非官方车票",
            document_type="train_ticket",
            document_type_label="火车/高铁票",
            scene_code="travel",
            scene_label="差旅票据",
            document_fields=[
                OcrRecognizeFieldRead(key=key, label=label, value=value)
                for key, label, value in ocr_field_rows
            ],
        )
        receipt = service.save_receipt(
            filename=image_name,
            content=b"fake image",
            media_type=image_media_type,
            current_user=current_user,
            document=recognized,
        )

        detail = service.get_receipt(receipt.id, current_user)
        fields = {field.label: field.value for field in detail.fields}
        expected_fields = {
            "出发地点": "上海虹桥",
            "到达地点": "深圳北",
            "乘车人": "示例旅客",
            "身份证号": "310101199001010000",
            "席别": "二等座",
            "车厢": "05车",
            "座位号": "08A",
            "票价": "438.00元",
        }
        for label, expected_value in expected_fields.items():
            assert fields[label] == expected_value
    finally:
        get_settings.cache_clear()


def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None: def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None:
monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage"))
get_settings.cache_clear() get_settings.cache_clear()

View File

@@ -15,6 +15,7 @@ from app.models.financial_record import ExpenseClaim
from app.schemas.ontology import OntologyParseRequest from app.schemas.ontology import OntologyParseRequest
from app.schemas.user_agent import UserAgentCitation, UserAgentRequest, UserAgentReviewRiskBrief from app.schemas.user_agent import UserAgentCitation, UserAgentRequest, UserAgentReviewRiskBrief
from app.services.agent_assets import AgentAssetService from app.services.agent_assets import AgentAssetService
from app.services.application_location_semantics import resolve_jieba_tokens
from app.services.ontology import SemanticOntologyService from app.services.ontology import SemanticOntologyService
from app.services.user_agent import UserAgentService from app.services.user_agent import UserAgentService
from app.services.user_agent_documents import UserAgentDocumentService from app.services.user_agent_documents import UserAgentDocumentService
@@ -763,6 +764,67 @@ def test_user_agent_application_submit_blocks_overlapping_travel_dates() -> None
assert response.draft_payload is None assert response.draft_payload is None
def test_user_agent_application_submit_normalizes_location_mixed_with_business_content() -> None:
    """A preview location polluted with business wording must be stored normalized.

    The preview ``location`` field contains the city followed by part of the
    reason text; after confirming submission the created claim is expected to
    hold ``上海市`` as location and the untouched reason.
    """
    preview_fields = {
        "applicationType": "差旅费用申请",
        "time": "2026-02-20 至 2026-02-23",
        "location": "上海辅助国网仿生产服务器",
        "reason": "辅助国网仿生产服务器部署",
        "days": "4天",
        "transportMode": "火车",
        "amount": "2120元",
    }
    overrides = {
        "manager_name": "向万红",
        "application_preview": {"fields": preview_fields},
    }

    session_factory = build_session_factory()
    with session_factory() as db:
        response = build_application_user_agent_response(
            db,
            "确认提交",
            context_overrides=overrides,
        )

        claim = application_claim_query(db).one()
        assert claim.location == "上海市"
        assert claim.reason == "辅助国网仿生产服务器部署"
        assert "申请单据已生成" in response.answer
        assert response.draft_payload is not None


def test_user_agent_application_submit_splits_location_and_reason_from_raw_sentence() -> None:
    """Location and reason must be separated out of one unpunctuated user sentence.

    The only source of trip details is a single history message that runs the
    dates, destination, purpose and transport together; the created claim is
    expected to carry ``上海市`` as location and the purpose as reason.
    """
    raw_sentence = "2026-02-20 至 2026-02-23去上海辅助国网仿生产服务器部署火车"
    conversation = [{"role": "user", "content": raw_sentence}]
    overrides = {
        "manager_name": "向万红",
        "grade": "P5",
        "department_name": "技术部",
    }

    session_factory = build_session_factory()
    with session_factory() as db:
        response = build_application_user_agent_response(
            db,
            "确认提交",
            history=conversation,
            context_overrides=overrides,
        )

        claim = application_claim_query(db).one()
        assert claim.location == "上海市"
        assert claim.reason == "辅助国网仿生产服务器部署"
        assert "申请单据已生成" in response.answer


def test_application_sentence_jieba_tokenizer_recognizes_location_boundary() -> None:
    """The tokenizer must split the city off the business wording that follows it.

    Checks that the ``("上海", "ns")`` pair is present (``ns`` is presumably the
    place-name tag - confirm against the tokenizer) and that the word sequence
    matches the expected segmentation exactly.
    """
    expected_words = ["上海", "辅助", "国网", "仿生产", "服务器", "部署"]

    tokens = resolve_jieba_tokens("上海辅助国网仿生产服务器部署")

    assert ("上海", "ns") in tokens
    segmented_words = [word for word, _tag in tokens]
    assert segmented_words == expected_words


def test_user_agent_application_maps_preview_travel_type_label() -> None: def test_user_agent_application_maps_preview_travel_type_label() -> None:
session_factory = build_session_factory() session_factory = build_session_factory()
with session_factory() as db: with session_factory() as db:
@@ -2155,7 +2217,7 @@ def test_user_agent_returns_draft_limit_message_when_save_is_blocked() -> None:
context_json={"review_action": "save_draft"}, context_json={"review_action": "save_draft"},
tool_payload={ tool_payload={
"draft_limit_reached": True, "draft_limit_reached": True,
"message": "当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。", "message": "当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。",
"status": "blocked", "status": "blocked",
}, },
) )
@@ -2163,7 +2225,7 @@ def test_user_agent_returns_draft_limit_message_when_save_is_blocked() -> None:
assert ( assert (
response.answer response.answer
== "当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。" == "当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。"
) )