From 0264a4b5b4c4c4c81da2bdebed8602ff8283c0a5 Mon Sep 17 00:00:00 2001 From: caoxiaozhu Date: Wed, 24 Jun 2026 10:42:24 +0800 Subject: [PATCH] =?UTF-8?q?refactor(server):=20user=5Fagent/steward/ocr=20?= =?UTF-8?q?=E7=AD=89=E6=9C=8D=E5=8A=A1=E9=87=8D=E6=9E=84=E5=B9=B6=E9=80=82?= =?UTF-8?q?=E9=85=8D=E5=85=B3=E8=81=94=E4=BB=BB=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - user_agent 拆分 application/locations/knowledge/response/review 四个子模块,接入申请位置语义与关联草稿分支 - steward planner/runtime/slot/plan_builder 决策链路重构,travel_reimbursement_calculator/orchestrator_expense_query 适配 - ocr/document_preview/document_intelligence/receipt_folder 复用预览与资产缓存,expense_claim_draft_flow/application_handoff 适配 - pyproject.toml 新增依赖,paddleocr bootstrap 脚本与 server_start.sh 调整 - 更新差旅/交通/通信等财务规则表,同步 document_intelligence/ocr/receipt_folder/user_agent 等测试 --- server/pyproject.toml | 1 + .../rules/finance-rules/交通工具等级标准.xlsx | Bin 6072 -> 6071 bytes .../rules/finance-rules/交通费用预估表.xlsx | Bin 7197 -> 7196 bytes .../finance-rules/公司通信费报销规则.xlsx | Bin 5934 -> 5933 bytes server/rules/finance-rules/出差补助标准.xlsx | Bin 5931 -> 5930 bytes .../rules/finance-rules/地区淡旺季映射表.xlsx | Bin 11427 -> 11430 bytes .../rules/finance-rules/差旅住宿费标准.xlsx | Bin 14708 -> 14706 bytes .../rules/finance-rules/差旅职级映射表.xlsx | Bin 5783 -> 5782 bytes server/scripts/bootstrap_paddleocr_gpu.sh | 2 +- server/scripts/bootstrap_paddleocr_mobile.sh | 2 +- server/server_start.sh | 2 +- .../src/app/services/document_intelligence.py | 2 +- server/src/app/services/document_preview.py | 132 +++++++-- .../expense_claim_application_handoff.py | 153 ++++++++++- .../expense_claim_attachment_operations.py | 11 + .../app/services/expense_claim_draft_flow.py | 6 +- server/src/app/services/expense_claims.py | 8 +- .../src/app/services/notification_states.py | 15 +- server/src/app/services/ocr.py | 84 ++---- .../src/app/services/ontology_validation.py | 2 +- .../services/orchestrator_expense_query.py | 6 +- server/src/app/services/receipt_folder.py | 153 +++++++++-- .../services/steward_model_plan_builder.py | 6 +- .../app/services/steward_planner_fallback.py | 12 +- .../steward_runtime_decision_agent.py | 6 +- .../services/steward_slot_decision_agent.py | 6 +- .../travel_reimbursement_calculator.py | 12 +- .../app/services/user_agent_application.py | 65 ++++- .../user_agent_application_locations.py | 3 +- .../src/app/services/user_agent_knowledge.py | 4 +- .../src/app/services/user_agent_response.py | 18 +- .../app/services/user_agent_review_core.py | 4 +- .../services/user_agent_review_messages.py | 6 +- .../app/services/user_agent_review_slots.py | 2 +- server/tests/test_agent_asset_service.py | 28 ++ server/tests/test_document_intelligence.py | 27 ++ server/tests/test_expense_claim_service.py | 160 +++++++++++ server/tests/test_notification_states.py | 25 ++ server/tests/test_ocr_service.py | 168 ++++++++++-- server/tests/test_receipt_folder_service.py | 258 +++++++++++++++++- server/tests/test_user_agent_service.py | 66 ++++- 41 files changed, 1273 insertions(+), 182 deletions(-) diff --git a/server/pyproject.toml b/server/pyproject.toml index 4ffe96f..a3864bc 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "python-dotenv>=1.0.1,<2.0.0", "email-validator>=2.2.0,<3.0.0", "python-multipart>=0.0.20,<1.0.0", + "jieba>=0.42.1,<0.43.0", "openpyxl>=3.1.5,<4.0.0", "lightrag-hku>=1.4.16,<1.5.0", "qdrant-client>=1.18.0,<2.0.0", diff --git a/server/rules/finance-rules/交通工具等级标准.xlsx b/server/rules/finance-rules/交通工具等级标准.xlsx index 382aedd944c6da57132b7300e3a80832a4bd96b7..e268e3cc2d4a945a2a71c294c56902398e701a38 100644 GIT binary patch delta 576 zcmdm?zg?dJR!IHsEQCPkptwbXKq2 zB0X^d^$8PZusjm8yeD>XOJLHKcW*OKUu%4D%KqQ9=XMz_#9o#6T}FVDAGMxIdI+4}1Dz8BYx&GSMIy?%W7^PZY_w~j1xJynt$ zn5c3%_R3|~hbAVbHQLvn-;8J7w7RhAQSgG?GS9ec+sk+V_VHF+9TT$i&YPByMIq+1 zx1apF_`lYdxuwxJCws2F62^&xuKaEqe{LZ1rSzaXAH$WyY@`?^+8~`J0h}K?+5O dk?>@HVHtijD<&6+OEZd3?h#jGOBMyG2LSQt%6|X= delta 546 zcmdn4zeArlz?+#xgn@y9gWnR930?t1^|ZG0@|eII}S*tqtJmB(9UJ6Cywpzci58|PG4 z9-bxJ*~H@Ut0Qb_jmjN~cePu(j-}f;1pJDX6<~QT8ljRZqB)}_GNR|r8k4HPw*DJu znHDozZ+W5NI8nSHm9u`aZ@t->lTLgqV^{2Z|9Rh>orQ~;;^gJ$J%76Snre`ra(33@ zrF|<-98cS&F{f=^TmI2sHosRlq?!6nDB;?XHvP%cJ@?;iTO$|~BDvm3vFwlEwC$UA zRL+jE|Fz!fweRly|A~7zW=YlkjsDDu5)LslW9BFeurX}@$Y{pK3=EOYS{!+-V8(1- zLuL?T@_B)0V1XTiEsS6Rd*Mf50TmH%h(M93H6k=8Zxb~GD|;_0&3Jb*x0n)`XCNlc zC^y+hOx+F??oZq!v^bd=7#evQ7=Q>48W{V5+=?9i;*!do)MCAg+?)V!MkWzvc+5@S z2sG&MVZRv}7^L8aGB7kS8VgVM6PB5rAkG8U TmM<>NC^5NPT#YS76r>IS8z?+#xgn@y9gTY$h#zfu&oIol@r)PuqtBKd->JR!IHsEQCPkptwbXKq2 zB0X^d^$8PZusjm8yeD>XOJLHKcW*OKUu%4D%KqQ9=XMz_#9o#6T}FVDAGMxIdI+4}1Dz8BYx&GSMIy?%W7^PZY_w~j1xJynt$ zn5c3%_R3|~hbAVbHQLvn-;8J7w7RhAQSgG?GS9ec+sk+V_VHF+9TT$i&YPByMIq+1 zx1apF_`lYdxuwyTZzybkD*>ltx7=(cV4+9O1 zcP1A~DJg)|^svk^Uc|}3P~*BwDjjWAPQ;XeR6=bG^5bu9BDbQp}j!f j^vNrwHNozk-R9IH&B(xTof+M|HIqL|tFfJs1la@tnN`gB delta 615 zcmbPZG1r1Oz?+#xgn@y9gWtH^h>E(z>ibwXwj5Cr*sk$$Th}w^&cNOE{?Xg`Sj_uA{{FFX?G-DJx5{>| z@&-ZOnWi_+sjfUcOSZF##p72;*wPx6I}-0|w{#s#w{Zyg6)P*i@?11RB~?UoMoVNw z&zm(SRe^2&H_kFGX0+b&Lc?*QctL$C=VIS_voj~1_*TZQ*!TYPzBxM!7c<4l%g=lM zbn`XUAV1~oti?)^_x(_wIgl%lcjs^zuC4%FeXHD zy^&(sAHQkaH|?mL9b^A%z0+&o-TD6$_i)UTs{0%LnG+>QVrIt7Q5M+zp3#Dh85lI1 zwK%SFf*DLghRh(wWHZrcU;%ltCPuKp67ff1fldi;h`@D8YfgBuPL`H3lLzs@2D33R z%u#1x5C#S|3^Xv_om?QL1Xk0-GRJrkCj&!`I|G9ROaYK?Xq210QA!7*0#U+(Fsl|E~xj6yej7%cT@Q97s?*6*#08pbC69WS`vO$bmq6`f2MX5Q(`g$M| zP2*&LX=z5`$=T9!V7q#Nycv^MNNa*!H@nTLMH-kyt}~HX*IT!k|3J^JG9^T diff --git a/server/rules/finance-rules/公司通信费报销规则.xlsx b/server/rules/finance-rules/公司通信费报销规则.xlsx index 17a754fdd3c97335b7ce81fb0b502a026ad89c50..9994d9138d8ae22b4098ea4ce6f48159000c667c 100644 GIT binary patch delta 588 zcmZ3dw^olgz?+#xgn@y9gTY$h#zfu&oIomuPq@G2)x>LR^;4=}Z%MXwiPcl^Q&2SI zJfiEhXYHb&0ZCWh-Jd*dE%U+N%%5Kw%t!8OQjVv}b?%h40(ycUb6{^4)d&Jeg6yt5vFPq0BYo>E^2*eEs;aGH&0C7^SBsMmME&OI%#-PQ6@I zAoW{ppXlw+Z`L!-3Z19$LZsld-pSLyzU7slPdWQ~*#qCe`)?|AD^$&9Z$J5U@qeu? z_ey@hlRwt&ut`YeZ;H))h6A2Zb%V9YAZqpn>tug1JbtL{dLy? zpfWKg1_o|qZH!g|3=Hu_sX4{^dLS~un~_O`86NGE6UC$%l_%GV$$<@>2js1oyh}_I h?B3aJPA$@m3=G$qAz=bDw}G*DGLyI(+f9%$3;;#S&7J@N delta 583 zcmZ3hw@!~Yz?+#xgn@y9gJCw$^@+R(IDu3Q-?;}tuP0tptG`nHy3T<=aQbOAKLtfY z&Lg@-dvv2FyLATcuJ@1L#>Zmb_wo0SjcZL_u$DE>bAGP5Vc`y`oY#vS_WSjniDPiF znG|X$&1v5DTQB11V`G^E+BPeh9cMl_jYwK5sy(A+Wqi*`*Oi5VzxpqnW9oI#jVzK` zc!F!kMxTSn?4_TjD7M#Uh^>16d(HZ^ygMrzwl6Vk^_KD6u{boJW%c~ZSA8uZ@+NrkwYTE8{@);ubyH*fe>HnX6#vJ}jG3b>z{bEZN1cH|n1NyQPewg9W?+bH*5Y`^0%i#F z8Zv_zlidZLfdvc%n;5|YTZJBh1!f6*Lj+!kSR+DnvW}>kJSc3@tN;c*1T-+-o!ls@ z1Xi;$$JBZ=3j;$wKLdjVOaYK?Xq20LNK_qc-5XJ9M)t`ZVlp-$Q@6Xn?m7TeA;!eO zz>Tbf(OQ6kA-*Uzr&wPPL>GKnz5V|;Rgm^7oxB^@>0Jtb z!{k~vC9FCB))Ir9&C10A44nN>e6`#b$sVp1-}~^RB)Pd4E1 zHni5()jRb;aCT&Z;0i^5U3HCap>_2-!CzjUZ?lX%p}4d4)$e^Tt{a=@g&cbQ`0(dF zHSca6S>}4GBsVZo<#6nk%dQViOiXLEuRXsR&$?-KVbi1F1-WIOao4t&@BZ!Mt++ZS zWaphXEg_3S%x7;u`E~JstuJ#+tH1LtunR930?t1^|ZG0@|eII}S*tqtJmB(9UJ6Cywpzci58|PG4 z9-bxJ*~H@Ut0Qb_jmjN~cePu(j-}f;1pJDX6<~QT8ljRZqB)}_GNR|r8k4HPw*DJu znHDozZ+W5NI8nSHm9u`aZ@t->lTLgqV^{2Z|9Rh>orQ~;;^gJ$J%76Snre`ra(33@ zrF|<-98cS&F{f=^TmI2sHosRlq?!6nDB;?XHvP%cJ@?;iTO$|~BDvm3vFwlEwC$UA zRL+jE|Fz!fweRly|A~7zW=YlkjsDDu5)LslW9BFeurX}@$Y{pK3=EOYS{$!gzzhjq zLuL?TvX8(suz;yx6(d++lh7luz;t16h`>`3YeZ;H))Y092Zb%vXoflJ3=G1+fQNww z#=Dd2M3um5mb}@tZU+kk!yW+!1__t~Al=X?H+jFPI@r3GqS7`zybKH#Ir_yVl{u-! zdKI}j0p5&EBFymkjM?t~y6XT?qZktd12?ikj1H3>#H7Fm#fnKYYEG^clLPZ+iAggp k6PkQbNJbmou-R=+Ez*n(4A+^_UEV$UpO_lkb&$Ob0J3G=_y7O^ diff --git a/server/rules/finance-rules/地区淡旺季映射表.xlsx b/server/rules/finance-rules/地区淡旺季映射表.xlsx index a73c2ca5d46d97cf69f8ac7c0ee2bf515d7e5c7e..d9623f1bcc0d644007f68b8f20ab1b444e333c24 100644 GIT binary patch delta 634 zcmZ1+xh#@5z?+#xgn@y9gTY$h#zfu&^*}17S7!O)cR<0@j0_BdKsqHqIiM)NpjbaS zzbI9&A~$F1L|^~I20U&5Pfe+Qz2$aQ$F4YsIcA)lo(g$^k8%aiwhJvT`#*16&Ru~R zi(~H}AKwViwo91HGy}W`A!Qs2bl?Hx7=y z{`79qqqbhIircyI%bprP7V=y^!E|ewlk1cx$;NRa=W3%5$8Owr=AYhzI38h^xy*St zOe!I`RK?V(t8MjJ^^a72wcAyv^yPlbwd`Vp|0e6+_k=qJA6{Rdy^jGU zls3O&v}0oihSO$kjwmTGqeID%8N`^pNA($4V5wR&BUnIN;}KXuK+_u{5T|9$2@m7R zi?qzZ%5G~(Gd`I7T}uhflhKxDQskIy#UZ5~;LXS+!VHh0m?!QLTAa)b42`@D3_t`2 z4UBb@>$GLS+UIIZ*Kbl~V2CeD%_-K`1Ci*)EKATl8wreEYbFK;ez+kF3=NF4)fgBm za`emdi?WmQ^Rq#*jjq}F_2XS@ftr2;4Uj_7tf&Fj9A7lqTt^Y?mS`PmCN|B<@|vQP MdvyfZg0w*D0i$ZwJOBUy delta 580 zcmZ1$xj2$Hz?+#xgn@y9gWVZ^@_rb?IUjqeCGcqs;0_l|eUGE>gjgQ5= z@8jNn^-)4b%ZUgQMn`Wu69e;v2+`Ufcjst zvH~p6MI%&FMKou$L`L+ySz}Ta*w%mJEYo5}>n$%d94Cqwq;f9ytv5S!(ur?n?23Kw zKku8fvv4s}oV@(J=TA3ZQw{P{&dyrAv~R_U<7vAz=CrMA%Rl2W&JjLeg(bkx}VwLt0sOefw} diff --git a/server/rules/finance-rules/差旅住宿费标准.xlsx b/server/rules/finance-rules/差旅住宿费标准.xlsx index 2bb1594397202db1bb3134fcc9a078aaa83823ef..b8df7baaeaba5da26428f0063dcf561cf0f514ec 100644 GIT binary patch delta 601 zcmexT^r?t9z?+#xgn@y9gW*ix^@+R(>VZ_uwuegBUjqeCGcqs;0_l|eMS9}P<~lef33Wt2sfc#D?A95$d;4CQ49{@8+9$I~sEuGjxq z*UzGKUc2DPt&D31oWc7gCU;+Yu;`2Y!kMgWjZ#HdidvMk9~?7c75|?zV{)s^qSzJt z-f!MFXJ_GJrZ{=|dC#8)-*&sE8?-2^S%^9FUDGVRPtO;wd>FlP-)buc4F-;f!^m^{Jg8CamwxPcKY@W}z>|y3 z&A`gmm`gL>oqWPv3Cw$IF3lvzG1-PgN;|-tkx7IZ9z8Kn+#|F&nHd-wc^Mdh2o4$; zeJA@{$bhxySx7TBPVTahiUC=4^o?s!7%&zsm>3xN;p!O}8W^*T85k;Z^vm;$vXk=j yvq7m?eShby@wMJ| z`hfWB|6e?hU1CZ6)p33!zr@O;Tcl#2Pc}$A+~?xRpi=CzJa{JO{KK{KbC=chvUAkm zs(HcebheV2&okOrN-a@POxap{ZRC`P#$}riNFCslt?~+(DjE}c=*6Y^yv3U)Bxi|d zNwgJ3oR7J;qG|i`$>f4_x+u}@tOItZU@gFU-Msa zqXg3CSB!RS%)nsUtj(dL4rWB@8#03!llzRGfd$Hq8yLX?zf2y11@4=ALj-iqtvTUA zJh{-^46JOGxisVb$;ZrJR!IHsEQCPkptwbXKq2 zB0X^d^$8PZusjm8yeD>XOJLHKcW*OKUu%4D%KqQ9=XMz_#9o#6T}FVDAGMxIdI+4}1Dz8BYx&GSMIy?%W7^PZY_w~j1xJynt$ zn5c3%_R3|~hbAVbHQLvn-;8J7w7RhAQSgG?GS9ec+sk+V_VHF+9TT$i&YPByMIq+1 zx1apF_`lYdxuw@@MdHZVTQ-t znR930?t1^|ZG0@|eII}S*tqtJmB(9UJ6Cywpzci58|PG4 z9-bxJ*~H@Ut0Qb_jmjN~cePu(j-}f;1pJDX6<~QT8ljRZqB)}_GNR|r8k4HPw*DJu znHDozZ+W5NI8nSHm9u`aZ@t->lTLgqV^{2Z|9Rh>orQ~;;^gJ$J%76Snre`ra(33@ zrF|<-98cS&F{f=^TmI2sHosRlq?!6nDB;?XHvP%cJ@?;iTO$|~BDvm3vFwlEwC$UA zRL+jE|Fz!fweRly|A~7zW=YlkjsDDu5)LslW9BFeurX}@$Y{pK3=EOYS{#8aU`8E} zAv1_Ec@zILu)u7A21c-eoX{h%0JE?+L?BSa8WEb4r;348W@Wv7mCV&bxZ*2ST=c$s1%rYQdHW$Ux0z3 zB1gYGzbHE?KR;WqA~z?%n~_O`86K}O#;+gmS_?GpHzNas6x>(_h6Y9JGL!Aa Wc);fQiAghlpPVJ8#^wyt%>V$*;nkM_ diff --git a/server/scripts/bootstrap_paddleocr_gpu.sh b/server/scripts/bootstrap_paddleocr_gpu.sh index 7780c57..0ccec97 100644 --- a/server/scripts/bootstrap_paddleocr_gpu.sh +++ b/server/scripts/bootstrap_paddleocr_gpu.sh @@ -14,7 +14,7 @@ if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then fi apt-get update -apt-get install -y --no-install-recommends libgl1 libglib2.0-0 poppler-utils poppler-data +apt-get install -y --no-install-recommends libgl1 libglib2.0-0 poppler-utils poppler-data mupdf-tools rm -rf "${OCR_VENV_DIR}" "${PYTHON_BIN}" -m venv "${OCR_VENV_DIR}" diff --git a/server/scripts/bootstrap_paddleocr_mobile.sh b/server/scripts/bootstrap_paddleocr_mobile.sh index 90ab90c..64be2da 100644 --- a/server/scripts/bootstrap_paddleocr_mobile.sh +++ b/server/scripts/bootstrap_paddleocr_mobile.sh @@ -13,7 +13,7 @@ if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then fi apt-get update -apt-get install -y --no-install-recommends libgl1 libglib2.0-0 poppler-utils poppler-data +apt-get install -y --no-install-recommends libgl1 libglib2.0-0 poppler-utils poppler-data mupdf-tools "${PYTHON_BIN}" -m venv "${OCR_VENV_DIR}" "${OCR_VENV_DIR}/bin/pip" install --upgrade pip diff --git a/server/server_start.sh b/server/server_start.sh index 991c938..e497cf2 100755 --- a/server/server_start.sh +++ b/server/server_start.sh @@ -272,7 +272,7 @@ run_bootstrap_python() { } dependencies_ready() { - "$PYTHON_BIN" -c "import alembic, dotenv, email_validator, fastapi, jwt, lightrag, multipart, openpyxl, psycopg, pydantic_settings, qdrant_client, sqlalchemy, uvicorn" >/dev/null 2>&1 + "$PYTHON_BIN" -c "import alembic, dotenv, email_validator, fastapi, jieba, jwt, lightrag, multipart, openpyxl, psycopg, pydantic_settings, qdrant_client, sqlalchemy, uvicorn" >/dev/null 2>&1 } pip_ready() { diff --git a/server/src/app/services/document_intelligence.py b/server/src/app/services/document_intelligence.py index 0250ce5..5306b12 100644 --- a/server/src/app/services/document_intelligence.py +++ b/server/src/app/services/document_intelligence.py @@ -562,7 +562,7 @@ def _extract_document_fields(text: str, document_type: str = "") -> list[Documen if date_value: append_field("date", "日期", date_value) - merchant = _extract_merchant(text) + merchant = "中国铁路" if normalized_type == "train_ticket" else _extract_merchant(text) if merchant: append_field("merchant_name", "商户", merchant) diff --git a/server/src/app/services/document_preview.py b/server/src/app/services/document_preview.py index d4589ef..e9ab01b 100644 --- a/server/src/app/services/document_preview.py +++ b/server/src/app/services/document_preview.py @@ -3,6 +3,7 @@ from __future__ import annotations import base64 import binascii import mimetypes +import os import re import shutil import subprocess @@ -11,9 +12,19 @@ from pathlib import Path class DocumentPreviewAssets: - PDF_RENDERER_ID = "pdftoppm-png-r160-poppler-data" + PDF_RENDERER_ID = "pdf-raster-cjk-safe-v3" PDF_PREVIEW_MEDIA_TYPE = "image/png" PDF_PREVIEW_SUFFIX = ".png" + PDF_UNUSABLE_PREVIEW_ERRORS = ( + "Missing language pack", + "Unknown font tag", + "No font in show", + ) + POPPLER_DATA_DIR_CANDIDATES = ( + "/usr/share/poppler", + "/usr/local/share/poppler", + "/opt/homebrew/share/poppler", + ) @staticmethod def decode_data_url(payload: str) -> tuple[str, bytes] | None: @@ -64,30 +75,117 @@ class DocumentPreviewAssets: ) -> Path: preview_path.parent.mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory(prefix=".pdf-preview-", dir=str(preview_path.parent)) as temp_dir: - prefix = Path(temp_dir) / "page" + pages = cls.render_pdf_pages( + pdf_path=pdf_path, + output_dir=Path(temp_dir), + timeout_seconds=timeout_seconds, + ) + shutil.copyfile(pages[0], preview_path) + return preview_path + + @classmethod + def render_pdf_pages( + cls, + *, + pdf_path: Path, + output_dir: Path, + timeout_seconds: int | float, + resolution: int = 160, + ) -> list[Path]: + output_dir.mkdir(parents=True, exist_ok=True) + errors: list[str] = [] + for renderer_name, command in cls._pdf_render_commands( + pdf_path=pdf_path, + output_dir=output_dir, + resolution=resolution, + ): + executable = shutil.which(renderer_name) + if not executable: + errors.append(f"{renderer_name}: executable not found") + continue + + cls._clear_rendered_pdf_pages(output_dir) + command[0] = executable completed = subprocess.run( - [ - "pdftoppm", - "-png", - "-r", - "160", - str(pdf_path), - str(prefix), - ], + command, capture_output=True, text=True, timeout=timeout_seconds, check=False, + env=cls._pdf_render_env(), ) + detail = (completed.stderr or completed.stdout or "").strip() if completed.returncode != 0: - detail = (completed.stderr or completed.stdout or "").strip() - raise RuntimeError(detail or "pdftoppm failed to render PDF preview.") + errors.append(f"{renderer_name}: {detail or 'renderer returned non-zero status'}") + continue + if cls.render_output_indicates_unusable_pdf_preview(detail): + errors.append(f"{renderer_name}: {detail or 'renderer produced unusable output'}") + continue - pages = sorted(Path(temp_dir).glob("page-*.png"), key=cls._extract_pdf_page_sort_key) - if not pages: - raise RuntimeError("pdftoppm did not generate a preview image.") - shutil.copyfile(pages[0], preview_path) - return preview_path + pages = sorted(output_dir.glob("page-*.png"), key=cls._extract_pdf_page_sort_key) + if pages: + return pages + errors.append(f"{renderer_name}: renderer did not generate PNG pages") + + cls._clear_rendered_pdf_pages(output_dir) + detail = ";".join(errors[-3:]) + raise RuntimeError(detail or "no PDF renderer generated usable PNG pages") + + @classmethod + def render_output_indicates_unusable_pdf_preview(cls, output: str) -> bool: + return any(token in str(output or "") for token in cls.PDF_UNUSABLE_PREVIEW_ERRORS) + + @classmethod + def _pdf_render_commands( + cls, + *, + pdf_path: Path, + output_dir: Path, + resolution: int, + ) -> list[tuple[str, list[str]]]: + prefix = output_dir / "page" + page_pattern = output_dir / "page-%d.png" + return [ + ( + "pdftoppm", + ["pdftoppm", "-png", "-r", str(resolution), str(pdf_path), str(prefix)], + ), + ( + "mutool", + ["mutool", "draw", "-r", str(resolution), "-o", str(page_pattern), str(pdf_path)], + ), + ( + "gs", + [ + "gs", + "-dSAFER", + "-dBATCH", + "-dNOPAUSE", + "-sDEVICE=png16m", + f"-r{resolution}", + f"-sOutputFile={page_pattern}", + str(pdf_path), + ], + ), + ( + "pdftocairo", + ["pdftocairo", "-png", "-r", str(resolution), str(pdf_path), str(prefix)], + ), + ] + + @classmethod + def _pdf_render_env(cls) -> dict[str, str]: + env = os.environ.copy() + for candidate in cls.POPPLER_DATA_DIR_CANDIDATES: + if (Path(candidate) / "cMap").exists(): + env.setdefault("POPPLER_DATADIR", candidate) + break + return env + + @staticmethod + def _clear_rendered_pdf_pages(output_dir: Path) -> None: + for page in output_dir.glob("page-*.png"): + page.unlink(missing_ok=True) @staticmethod def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]: diff --git a/server/src/app/services/expense_claim_application_handoff.py b/server/src/app/services/expense_claim_application_handoff.py index 9d435dd..d023c88 100644 --- a/server/src/app/services/expense_claim_application_handoff.py +++ b/server/src/app/services/expense_claim_application_handoff.py @@ -9,7 +9,10 @@ from sqlalchemy import or_, select from app.models.financial_record import ExpenseClaim from app.services.expense_claim_risk_stage import with_risk_business_stage -from app.services.expense_claim_workflow_constants import APPLICATION_ARCHIVE_STAGE +from app.services.expense_claim_workflow_constants import ( + APPLICATION_ARCHIVE_STAGE, + APPLICATION_LINK_STATUS_STAGE, +) APPLICATION_REIMBURSEMENT_TYPE_MAP = { @@ -248,3 +251,151 @@ class ExpenseClaimApplicationHandoffMixin: ) return archived_applications + + @staticmethod + def _reference_matches_deleted_reimbursement( + flag: dict[str, Any], + *, + reimbursement_claim_id: str, + reimbursement_claim_no: str, + ) -> bool: + reference_ids = { + str(flag.get(key) or "").strip() + for key in ( + "generated_draft_claim_id", + "generatedDraftClaimId", + "reimbursement_claim_id", + "reimbursementClaimId", + ) + } + reference_nos = { + str(flag.get(key) or "").strip().upper() + for key in ( + "generated_draft_claim_no", + "generatedDraftClaimNo", + "reimbursement_claim_no", + "reimbursementClaimNo", + ) + } + return ( + bool(reimbursement_claim_id and reimbursement_claim_id in reference_ids) + or bool(reimbursement_claim_no and reimbursement_claim_no.upper() in reference_nos) + ) + + @classmethod + def _remove_deleted_reimbursement_link_references( + cls, + risk_flags: list[Any], + *, + reimbursement_claim_id: str, + reimbursement_claim_no: str, + ) -> tuple[list[Any], bool]: + next_flags: list[Any] = [] + changed = False + stale_link_keys = { + "generated_draft_claim_id", + "generatedDraftClaimId", + "generated_draft_claim_no", + "generatedDraftClaimNo", + "reimbursement_claim_id", + "reimbursementClaimId", + "reimbursement_claim_no", + "reimbursementClaimNo", + "handoff_event_type", + "handoffEventType", + "handoff_message", + "handoffMessage", + } + for flag in list(risk_flags or []): + if not isinstance(flag, dict): + next_flags.append(flag) + continue + if not cls._reference_matches_deleted_reimbursement( + flag, + reimbursement_claim_id=reimbursement_claim_id, + reimbursement_claim_no=reimbursement_claim_no, + ): + next_flags.append(flag) + continue + + next_flag = dict(flag) + for key in stale_link_keys: + if key in next_flag: + next_flag.pop(key, None) + changed = True + next_flags.append(next_flag) + return next_flags, changed + + def _sync_linked_applications_after_reimbursement_deleted( + self, + *, + reimbursement_claim: ExpenseClaim, + operator: str, + current_user: Any, + ) -> list[dict[str, str]]: + reimbursement_claim_id = str(reimbursement_claim.id or "").strip() + reimbursement_claim_no = str(reimbursement_claim.claim_no or "").strip() + synced_applications: list[dict[str, str]] = [] + + for application_claim in self._find_linked_application_claims(reimbursement_claim): + previous_status = str(application_claim.status or "").strip() + previous_stage = str(application_claim.approval_stage or "").strip() + before_json = self._serialize_claim(application_claim) + next_flags, removed_link_references = self._remove_deleted_reimbursement_link_references( + list(application_claim.risk_flags_json or []), + reimbursement_claim_id=reimbursement_claim_id, + reimbursement_claim_no=reimbursement_claim_no, + ) + sync_flag = with_risk_business_stage( + { + "source": "application_link_sync", + "event_type": "expense_application_reimbursement_deleted", + "sync_event_id": str(uuid.uuid4()), + "severity": "info", + "actionability": "system_trace", + "label": "关联报销单已删除", + "message": ( + f"关联报销单 {reimbursement_claim_no or reimbursement_claim_id} 已删除," + "申请单已回到待关联状态。" + ), + "operator": operator, + "operator_username": getattr(current_user, "username", ""), + "operator_role_codes": [ + str(item).strip().lower() + for item in getattr(current_user, "role_codes", []) + if str(item).strip() + ], + "application_claim_id": application_claim.id, + "application_claim_no": application_claim.claim_no, + "deleted_reimbursement_claim_id": reimbursement_claim_id, + "deleted_reimbursement_claim_no": reimbursement_claim_no, + "previous_status": previous_status, + "previous_approval_stage": previous_stage, + "next_status": "approved", + "next_approval_stage": APPLICATION_LINK_STATUS_STAGE, + "removed_link_references": removed_link_references, + "created_at": datetime.now(UTC).isoformat(), + }, + "expense_application", + ) + + application_claim.status = "approved" + application_claim.approval_stage = APPLICATION_LINK_STATUS_STAGE + application_claim.risk_flags_json = [*next_flags, sync_flag] + synced_applications.append( + { + "application_claim_id": application_claim.id, + "application_claim_no": str(application_claim.claim_no or "").strip(), + "next_approval_stage": APPLICATION_LINK_STATUS_STAGE, + } + ) + self.audit_service.log_action( + actor=operator, + action="expense_application.unlink_deleted_reimbursement", + resource_type="expense_claim", + resource_id=application_claim.id, + before_json=before_json, + after_json=self._serialize_claim(application_claim), + ) + + return synced_applications diff --git a/server/src/app/services/expense_claim_attachment_operations.py b/server/src/app/services/expense_claim_attachment_operations.py index 6f7a185..1603dc5 100644 --- a/server/src/app/services/expense_claim_attachment_operations.py +++ b/server/src/app/services/expense_claim_attachment_operations.py @@ -714,6 +714,17 @@ class ExpenseClaimAttachmentOperationsMixin: timeout_seconds=OcrService(self.db).settings.ocr_timeout_seconds, ) except Exception: + metadata.update( + { + "previewable": True, + "preview_kind": "pdf", + "preview_storage_key": self._attachment_storage.to_storage_key(file_path), + "preview_media_type": "application/pdf", + "preview_file_name": file_path.name, + "preview_rendered_with": "", + } + ) + self._attachment_storage.write_meta(file_path, metadata) return metadata metadata.update( diff --git a/server/src/app/services/expense_claim_draft_flow.py b/server/src/app/services/expense_claim_draft_flow.py index f456860..bf8748a 100644 --- a/server/src/app/services/expense_claim_draft_flow.py +++ b/server/src/app/services/expense_claim_draft_flow.py @@ -827,8 +827,8 @@ class ExpenseClaimDraftFlowMixin(ExpenseClaimApplicationLinkMixin, ExpenseClaimD document_count = max(len(context_documents), len(attachment_names), self._resolve_attachment_count(context_json)) return { "message": ( - f"检测到你已有草稿 {association_candidate.claim_no}," - f"当前新上传了 {document_count} 张票据,请先选择关联到现有草稿,或单独建立新的报销单。" + f"检测到您已有草稿 {association_candidate.claim_no}," + f"当前新上传了 {document_count} 张票据,请先选择关联到现有草稿,或单独新建一张报销单。" ), "draft_only": False, "status": "pending_association_decision", @@ -859,7 +859,7 @@ class ExpenseClaimDraftFlowMixin(ExpenseClaimApplicationLinkMixin, ExpenseClaimD if existing_draft_count >= MAX_DRAFT_CLAIMS_PER_USER: return { "message": ( - f"你当前已保存 {MAX_DRAFT_CLAIMS_PER_USER} 个草稿,请先完成已保存的草稿," + f"您当前已保存 {MAX_DRAFT_CLAIMS_PER_USER} 个草稿,请先完成已保存的草稿," "才能再次新建草稿。" ), "draft_limit_reached": True, diff --git a/server/src/app/services/expense_claims.py b/server/src/app/services/expense_claims.py index e74ded3..f7cd885 100644 --- a/server/src/app/services/expense_claims.py +++ b/server/src/app/services/expense_claims.py @@ -688,6 +688,13 @@ class ExpenseClaimItemActionMixin: before_json = self._serialize_claim(claim) resource_id = claim.id + operator = self._access_policy.resolve_current_user_display_name(current_user) + if not self._is_expense_application_claim(claim): + self._sync_linked_applications_after_reimbursement_deleted( + reimbursement_claim=claim, + operator=operator, + current_user=current_user, + ) self._release_budget_for_delete(claim, current_user) self._delete_claim_analysis_records(resource_id) @@ -1008,4 +1015,3 @@ class ExpenseClaimService(ExpenseClaimStandardAdjustmentMixin, ExpenseClaimItemA ) return claim - diff --git a/server/src/app/services/notification_states.py b/server/src/app/services/notification_states.py index 8487f88..b1d13f3 100644 --- a/server/src/app/services/notification_states.py +++ b/server/src/app/services/notification_states.py @@ -1,6 +1,7 @@ from __future__ import annotations from datetime import UTC, datetime +from threading import Lock from sqlalchemy import select from sqlalchemy.orm import Session @@ -16,11 +17,23 @@ from app.schemas.notification_state import ( class NotificationStateService: + _storage_ready_bind_ids: set[int] = set() + _storage_ready_lock = Lock() + def __init__(self, db: Session) -> None: self.db = db def ensure_storage_ready(self) -> None: - Base.metadata.create_all(bind=self.db.get_bind(), tables=[NotificationState.__table__]) + bind = self.db.get_bind() + bind_id = id(bind) + if bind_id in self._storage_ready_bind_ids: + return + + with self._storage_ready_lock: + if bind_id in self._storage_ready_bind_ids: + return + Base.metadata.create_all(bind=bind, tables=[NotificationState.__table__]) + self._storage_ready_bind_ids.add(bind_id) def list_states(self, current_user: CurrentUserContext) -> NotificationStateListRead: self.ensure_storage_ready() diff --git a/server/src/app/services/ocr.py b/server/src/app/services/ocr.py index 0ededfa..704e3f6 100644 --- a/server/src/app/services/ocr.py +++ b/server/src/app/services/ocr.py @@ -16,11 +16,13 @@ from sqlalchemy.orm import Session from app.core.config import SERVER_DIR, get_settings from app.schemas.ocr import OcrRecognizeBatchRead, OcrRecognizeDocumentRead, OcrRecognizeFieldRead, OcrRecognizeLineRead +from app.services.document_preview import DocumentPreviewAssets from app.services.document_intelligence import DocumentIntelligenceService WORKER_JSON_PREFIX = "__OCR_JSON__=" SUPPORTED_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".pdf"} OCR_RESULT_CACHE_LIMIT = 32 +OCR_RESULT_CACHE_PIPELINE_VERSION = f"pdf-image-ocr:{DocumentPreviewAssets.PDF_RENDERER_ID}:no-pdf-direct-v2" @dataclass(slots=True) @@ -142,16 +144,6 @@ class OcrService: cleanup_paths=cleanup_paths, text_layer=text_layer, ) - if self._has_usable_pdf_text_layer(text_layer): - document = self._build_text_layer_document( - filename=normalized_name, - media_type=resolved_media_type, - text_layer=text_layer, - pdf_inputs=pdf_inputs, - ) - documents.append(document) - self._write_cached_document(cache_key, document) - continue prepared_inputs.extend(pdf_inputs) for item in pdf_inputs: cache_keys_by_source.setdefault(item.source_key, cache_key) @@ -257,6 +249,7 @@ class OcrService: digest = hashlib.sha256(content).hexdigest() return "|".join( [ + OCR_RESULT_CACHE_PIPELINE_VERSION, self.settings.ocr_language, self.settings.ocr_device, self.settings.ocr_text_detection_model, @@ -406,11 +399,15 @@ class OcrService: output_dir.mkdir(parents=True, exist_ok=True) cleanup_paths.append(output_dir) - image_paths = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir) + image_paths, preview_usable = self._convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir) if not image_paths: raise RuntimeError("PDF 转图片后未生成可识别页面。") - preview_data_url = self._build_preview_data_url(image_paths[0], media_type="image/png") + preview_data_url = ( + self._build_preview_data_url(image_paths[0], media_type="image/png") + if preview_usable + else "" + ) source_key = uuid4().hex descriptors: list[PreparedOcrInput] = [] for page_index, image_path in enumerate(image_paths): @@ -421,7 +418,7 @@ class OcrService: filename=filename, media_type=media_type, page_index=page_index, - preview_kind="image" if page_index == 0 else "", + preview_kind="image" if page_index == 0 and preview_data_url else "", preview_data_url=preview_data_url if page_index == 0 else "", text_layer=text_layer if page_index == 0 else "", ) @@ -450,27 +447,17 @@ class OcrService: return self._normalize_extracted_text(completed.stdout) - def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]: - prefix = output_dir / "page" - completed = subprocess.run( - [ - "pdftoppm", - "-png", - "-r", - "160", - str(pdf_path), - str(prefix), - ], - capture_output=True, - text=True, - timeout=self.settings.ocr_timeout_seconds, - check=False, - ) - if completed.returncode != 0: - detail = (completed.stderr or completed.stdout or "").strip() - raise RuntimeError(f"PDF 转图片失败:{detail or 'pdftoppm 返回非 0 状态码。'}") + def _convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]: + try: + pages = DocumentPreviewAssets.render_pdf_pages( + pdf_path=pdf_path, + output_dir=output_dir, + timeout_seconds=self.settings.ocr_timeout_seconds, + ) + except RuntimeError as exc: + raise RuntimeError(f"PDF 转图片失败:{exc}") from exc - return sorted(output_dir.glob("page-*.png"), key=self._extract_pdf_page_sort_key) + return pages, True @staticmethod def _extract_pdf_page_sort_key(path: Path) -> tuple[int, str]: @@ -595,30 +582,6 @@ class OcrService: return documents - def _build_text_layer_document( - self, - *, - filename: str, - media_type: str, - text_layer: str, - pdf_inputs: list[PreparedOcrInput], - ) -> OcrRecognizeDocumentRead: - first_input = pdf_inputs[0] if pdf_inputs else None - aggregated = AggregatedOcrDocument( - filename=filename, - media_type=media_type, - source_key=first_input.source_key if first_input is not None else uuid4().hex, - page_count=max(1, len(pdf_inputs)), - preview_kind=str(first_input.preview_kind if first_input is not None else ""), - preview_data_url=str(first_input.preview_data_url if first_input is not None else ""), - ) - aggregated.text_layer_fragments.append(text_layer) - return self._finalize_document(aggregated) - - @classmethod - def _has_usable_pdf_text_layer(cls, text_layer: str) -> bool: - return cls._meaningful_char_count(text_layer) >= 8 - @staticmethod def _collect_descriptor_text_layer(descriptors: list[PreparedOcrInput]) -> str: for descriptor in descriptors: @@ -685,13 +648,6 @@ class OcrService: summary = self._summarize_text(full_text) preview_kind = aggregated.preview_kind preview_data_url = aggregated.preview_data_url - if ( - used_text_layer - and aggregated.media_type == "application/pdf" - and self._placeholder_ratio(ocr_text) >= 0.12 - ): - preview_kind = "" - preview_data_url = "" insight = self.document_intelligence_service.build_document_insight( filename=aggregated.filename, summary=summary, diff --git a/server/src/app/services/ontology_validation.py b/server/src/app/services/ontology_validation.py index 4e8ef01..867b0c3 100644 --- a/server/src/app/services/ontology_validation.py +++ b/server/src/app/services/ontology_validation.py @@ -214,7 +214,7 @@ class OntologyValidationMixin: labels = [self._display_slot_label(item) for item in missing_slots[:4]] if not labels: return "请补充更多上下文后再继续。" - return f"请补充{'、'.join(labels)},我再继续帮你解析和处理。" + return f"请补充{'、'.join(labels)},我会继续帮您解析和处理。" @staticmethod def _compute_confidence( diff --git a/server/src/app/services/orchestrator_expense_query.py b/server/src/app/services/orchestrator_expense_query.py index f4f5579..4fc58e6 100644 --- a/server/src/app/services/orchestrator_expense_query.py +++ b/server/src/app/services/orchestrator_expense_query.py @@ -570,7 +570,7 @@ class OrchestratorDatabaseQueryBuilder: scoped_to_current_user = True else: conditions.append(ExpenseClaim.id == "__no_visible_claim__") - scope_label = "你的报销单" + scope_label = "您的报销单" scoped_to_current_user = True elif explicit_employee_names: conditions.append(ExpenseClaim.employee_name.in_(explicit_employee_names)) @@ -586,7 +586,7 @@ class OrchestratorDatabaseQueryBuilder: scoped_to_current_user = True else: conditions.append(ExpenseClaim.id == "__no_visible_claim__") - scope_label = "你的报销单" + scope_label = "您的报销单" scoped_to_current_user = True else: scope_label = "全部报销单" @@ -703,7 +703,7 @@ class OrchestratorDatabaseQueryBuilder: subject_name = (employee.name if employee is not None else "") or normalized_user_id if subject_name: - return conditions, "你的报销单" + return conditions, "您的报销单" return conditions, "当前用户的报销单" def _employee_name_is_unique(self, employee: Employee) -> bool: diff --git a/server/src/app/services/receipt_folder.py b/server/src/app/services/receipt_folder.py index 5fa9387..b43f9b0 100644 --- a/server/src/app/services/receipt_folder.py +++ b/server/src/app/services/receipt_folder.py @@ -37,10 +37,18 @@ TRAIN_ROUTE_PATTERN = re.compile( r"([\u4e00-\u9fa5]{2,12})站?\s*(?:至|到|→|->|—|–|-)\s*" r"([\u4e00-\u9fa5]{2,12})站?" ) +TRAIN_ROUTE_WITH_NO_PATTERN = re.compile( + r"([\u4e00-\u9fa5]{2,12})站?\s+[GCDZKTLYS]\d{1,5}\s+" + r"([\u4e00-\u9fa5]{2,12})站?", + re.IGNORECASE, +) TRAIN_NO_PATTERN = re.compile(r"(?:车次|列车号)\s*[::]?\s*([GCDZKTLYS]\d{1,5})", re.IGNORECASE) TRAIN_STANDALONE_NO_PATTERN = re.compile(r"(? tuple[str, str]: raw_text = str(text or "") + split_line_match = TRAIN_ROUTE_WITH_NO_PATTERN.search(raw_text) + if split_line_match: + departure = cls._clean_train_station(split_line_match.group(1)) + arrival = cls._clean_train_station(split_line_match.group(2)) + if cls._is_valid_train_station_value(departure) and cls._is_valid_train_station_value(arrival) and departure != arrival: + return departure, arrival + station_candidates: list[str] = [] for line in raw_text.replace("\r", "\n").splitlines(): candidate = cls._clean_train_station(line) @@ -697,7 +773,7 @@ class ReceiptFolderTrainTicketMixin: continue if not str(line or "").strip().endswith("站"): continue - if any(token in candidate for token in ("发票", "客票", "铁路", "票价", "日期")): + if not cls._is_valid_train_station_value(candidate): continue station_candidates.append(candidate) if len(station_candidates) >= 2: @@ -707,7 +783,7 @@ class ReceiptFolderTrainTicketMixin: if match: departure = cls._clean_train_station(match.group(1)) arrival = cls._clean_train_station(match.group(2)) - if departure and arrival and departure != arrival: + if cls._is_valid_train_station_value(departure) and cls._is_valid_train_station_value(arrival) and departure != arrival: return departure, arrival return "", "" @@ -717,6 +793,25 @@ class ReceiptFolderTrainTicketMixin: cleaned = re.sub(r"(?:火车站|高铁站|站)$", "", cleaned) return cleaned.strip() + @classmethod + def _should_keep_train_ticket_field(cls, *, key: str, label: str, value: str) -> bool: + if key in TRAIN_STATION_FIELD_KEYS or label in TRAIN_STATION_FIELD_LABELS: + return cls._is_valid_train_station_value(value) + if key == "passenger_name" or label == "乘车人": + return bool(cls._clean_train_passenger_candidate(value)) + return True + + @classmethod + def _is_valid_train_station_value(cls, value: str) -> bool: + cleaned = cls._clean_train_station(value) + if not 2 <= len(cleaned) <= 12: + return False + if any(token in cleaned for token in TRAIN_INVALID_STATION_TOKENS): + return False + if re.search(r"[A-Za-z0-9]", cleaned): + return False + return True + @staticmethod def _extract_first(pattern: re.Pattern[str], text: str) -> str: match = pattern.search(str(text or "")) @@ -724,24 +819,30 @@ class ReceiptFolderTrainTicketMixin: @classmethod def _extract_train_passenger_name(cls, text: str, *, id_number: str = "") -> str: - labeled = cls._extract_first(TRAIN_PASSENGER_PATTERN, text) - if labeled: - return labeled - lines = [line.strip() for line in str(text or "").replace("\r", "\n").splitlines() if line.strip()] - for index, line in enumerate(lines): - if id_number and id_number not in line: - continue - for offset in (1, -1, 2): - target_index = index + offset - if target_index < 0 or target_index >= len(lines): + for line in lines: + labeled = cls._clean_train_passenger_candidate(cls._extract_first(TRAIN_PASSENGER_PATTERN, line)) + if labeled: + return labeled + + if id_number: + for index, line in enumerate(lines): + if id_number not in line: continue - candidate = cls._clean_train_passenger_candidate(lines[target_index]) + candidate = cls._clean_train_passenger_candidate(line.replace(id_number, " ")) if candidate: return candidate + for offset in (1, -1, 2): + target_index = index + offset + if target_index < 0 or target_index >= len(lines): + continue + candidate = cls._clean_train_passenger_candidate(lines[target_index]) + if candidate: + return candidate for line in lines: - if "购买方名称" in line: - candidate = cls._clean_train_passenger_candidate(line.split(":", 1)[-1].split(":", 1)[-1]) + purchase_match = TRAIN_PURCHASER_NAME_PATTERN.search(line) + if purchase_match: + candidate = cls._clean_train_passenger_candidate(purchase_match.group(1)) if candidate: return candidate return "" @@ -764,6 +865,16 @@ class ReceiptFolderTrainTicketMixin: "开票", "日期", "车厢", + "席别", + "二等座", + "一等座", + "商务座", + "特等座", + "软座", + "硬座", + "无座", + "软卧", + "硬卧", "座位", "票价", "金额", @@ -771,6 +882,14 @@ class ReceiptFolderTrainTicketMixin: "出发", "到达", "车次", + "公司", + "信用代码", + "纳税人", + "扫码", + "无效", + "二维码", + "座席", + "证件", ) ): return "" diff --git a/server/src/app/services/steward_model_plan_builder.py b/server/src/app/services/steward_model_plan_builder.py index e6236cb..da95c35 100644 --- a/server/src/app/services/steward_model_plan_builder.py +++ b/server/src/app/services/steward_model_plan_builder.py @@ -266,7 +266,7 @@ class StewardModelPlanBuilder: event_id="intent_agent_function_call", stage="llm_function_call", title="识别财务事项", - content="我识别到这句话包含出差事项,但还需要确认你要进入申请流程还是报销流程。", + content="我识别到这句话包含出差事项,但还需要确认您是要进入申请流程还是报销流程。", ) ] raw_events = payload.get("thinking_events") @@ -292,7 +292,7 @@ class StewardModelPlanBuilder: event_id="intent_agent_pending_flow", stage="flow_confirmation", title="等待确认流程方向", - content=f"当前输入“{request.message}”缺少明确动作词,需要先由你选择补办出差申请或发起费用报销。", + content=f"当前输入“{request.message}”缺少明确的动作词,需要先由您选择是补办出差申请,还是发起费用报销。", ) ) return events @@ -302,7 +302,7 @@ class StewardModelPlanBuilder: candidate_labels = [item.label for item in pending_flow_confirmation.candidate_flows if item.label] if len(candidate_labels) >= 2: return ( - f"我识别到这是一次财务事项,但还不能确定你要做的是" + f"我识别到这是一次财务事项,但还不能确定您要做的是" f"**{candidate_labels[0]}**还是**{candidate_labels[1]}**。请先选择一个方向。" ) return "我识别到这是一次财务事项,但还需要先确认具体流程方向。" diff --git a/server/src/app/services/steward_planner_fallback.py b/server/src/app/services/steward_planner_fallback.py index 6b29302..ab057af 100644 --- a/server/src/app/services/steward_planner_fallback.py +++ b/server/src/app/services/steward_planner_fallback.py @@ -335,7 +335,7 @@ class StewardPlannerFallbackMixin: flow_id="travel_application", label="先发起出差申请", confidence=0.86, - reason="已先查询你名下可关联的差旅申请单,暂未查到可关联单据,因此应先申请单据。", + reason="已先查询您名下可关联的差旅申请单,暂未查到可关联单据,因此应先申请单据。", ontology_fields=application_fields, missing_fields=self._resolve_missing_fields("expense_application", application_fields), ) @@ -345,7 +345,7 @@ class StewardPlannerFallbackMixin: if gate.get("checked"): candidate_count = int(gate.get("candidate_count") or 0) reimbursement_label = "关联已有申请单并发起报销" - reimbursement_reason = f"已先查到 {candidate_count} 个可关联申请单,选择后会先请你关联具体单据。" + reimbursement_reason = f"已先查到 {candidate_count} 个可关联申请单,选择后会先请您关联具体单据。" return [ StewardCandidateFlow( flow_id="travel_application", @@ -390,10 +390,10 @@ class StewardPlannerFallbackMixin: @staticmethod def _build_pending_flow_reason(gate: dict[str, Any]) -> str: if gate.get("checked") and int(gate.get("candidate_count") or 0) <= 0: - return "我已经先查询你名下可关联的差旅申请单,未查到可关联单据,所以当前应先申请单据。" + return "我已先查询您名下可关联的差旅申请单,未查到可关联单据,所以当前应先申请单据。" if gate.get("checked"): candidate_count = int(gate.get("candidate_count") or 0) - return f"我已经先查询你名下的差旅申请单,查到 {candidate_count} 个可关联申请单,需要你确认是否关联单据后发起报销。" + return f"我已先查询您名下的差旅申请单,查到 {candidate_count} 个可关联申请单,需要您确认是否关联单据后发起报销。" return "当前话术描述了出差事项,但没有明确说明要补办申请还是发起报销。" @staticmethod @@ -404,10 +404,10 @@ class StewardPlannerFallbackMixin: candidate_count = int(gate.get("candidate_count") or 0) return ( f"我已先查询可关联申请单,查到 {candidate_count} 个可关联申请单;" - "你可以选择关联已有申请单发起报销,或改为补办新的出差申请。" + "您可以选择关联已有申请单发起报销,也可以改为补办新的出差申请。" ) return ( - "我识别到这是一次出差事项,但还不能确定你要做的是" + "我识别到这是一次出差事项,但还不能确定您要做的是" "**补办出差申请**还是**发起费用报销**。请先选择一个方向。" ) diff --git a/server/src/app/services/steward_runtime_decision_agent.py b/server/src/app/services/steward_runtime_decision_agent.py index b6bf95c..a95e7bd 100644 --- a/server/src/app/services/steward_runtime_decision_agent.py +++ b/server/src/app/services/steward_runtime_decision_agent.py @@ -90,7 +90,7 @@ class StewardRuntimeDecisionAgent: next_action="continue_selected_flow", target_task_id=selected_flow_id, response_text=self._build_selected_flow_response_text(selected_flow_id), - rationale="已按你选择的候选流程继续处理。", + rationale="已按您选择的候选流程继续处理。", steward_state=next_state, model_call_traces=traces, ) @@ -268,7 +268,7 @@ class StewardRuntimeDecisionAgent: next_action="submit_current_application", target_message_id=str(pending_application.get("message_id") or ""), target_task_id=str(pending_application.get("task_id") or ""), - rationale="模型运行时决策暂不可用,我先按当前待提交申请单上下文处理你的确认。", + rationale="模型运行时决策暂不可用,我先按当前待提交申请单的上下文处理您的确认。", model_call_traces=traces, ) if confirmation_text and pending_steward_action: @@ -295,7 +295,7 @@ class StewardRuntimeDecisionAgent: target_task_id=str(current_task.get("task_id") or ""), field_key=field_key, field_value=request.user_message, - rationale="模型运行时决策暂不可用,我先把你的补充写入当前小财管家流程字段。", + rationale="模型运行时决策暂不可用,我先把您的补充写入当前小财管家流程字段。", model_call_traces=traces, ) if field_key: diff --git a/server/src/app/services/steward_slot_decision_agent.py b/server/src/app/services/steward_slot_decision_agent.py index bee1b48..88032f5 100644 --- a/server/src/app/services/steward_slot_decision_agent.py +++ b/server/src/app/services/steward_slot_decision_agent.py @@ -275,7 +275,7 @@ class StewardSlotDecisionAgent: missing_fields=missing_fields, question=self._build_fallback_question(field), options=self._sanitize_options([], [field]), - rationale="模型字段决策暂不可用,我先按上游意图识别给出的本体缺口向你确认。", + rationale="模型字段决策暂不可用,我先按上游意图识别给出的本体缺口向您确认。", model_call_traces=traces, ) return StewardSlotDecisionResponse( @@ -285,7 +285,7 @@ class StewardSlotDecisionAgent: missing_fields=[], question="", options=[], - rationale="当前任务没有上游标记的关键字段缺口,可以先生成核对结果供你确认。", + rationale="当前任务没有上游标记的关键字段缺口,可以先生成核对结果供您确认。", model_call_traces=traces, ) @@ -293,7 +293,7 @@ class StewardSlotDecisionAgent: def _build_fallback_question(field: str) -> str: label = FIELD_CATALOG.get(field, {}).get("label") or field if field == "transport_mode": - return "请问你这次打算怎么出行?可以选择火车、飞机或轮船。" + return "请问您这次打算怎么出行?可以选择火车、飞机或轮船。" return f"当前还缺少{label},请先补充后我再继续处理。" @staticmethod diff --git a/server/src/app/services/travel_reimbursement_calculator.py b/server/src/app/services/travel_reimbursement_calculator.py index 3d339c0..edeca3b 100644 --- a/server/src/app/services/travel_reimbursement_calculator.py +++ b/server/src/app/services/travel_reimbursement_calculator.py @@ -15,14 +15,16 @@ from app.schemas.reimbursement import ( TravelReimbursementCalculatorResponse, ) from app.services.agent_assets import AgentAssetService +from app.services.application_location_semantics import validate_application_location_text from app.services.expense_claims import ExpenseClaimService -from app.services.expense_rule_runtime import RuntimeTravelPolicy, ExpenseRuleRuntimeService +from app.services.expense_rule_runtime import ExpenseRuleRuntimeService, RuntimeTravelPolicy from app.services.travel_policy_grades import travel_policy_grade_key_candidates from app.services.travel_reimbursement_regions import ( AMBIGUOUS_PROVINCE_CITY_NAMES, OTHER_REGION_LOCATION_KEYWORDS, OTHER_REGION_PROVINCE_KEYWORDS, ) +from app.services.user_agent_application_locations import normalize_application_location class TravelReimbursementCalculatorService: @@ -35,9 +37,13 @@ class TravelReimbursementCalculatorService: current_user: CurrentUserContext, ) -> TravelReimbursementCalculatorResponse: days = max(1, int(payload.days)) - location = str(payload.location or "").strip() - if not location: + raw_location = str(payload.location or "").strip() + if not raw_location: raise ValueError("请先填写出差地点。") + location = normalize_application_location(raw_location) or raw_location + location_error = validate_application_location_text(location) + if location_error: + raise ValueError(f"{location_error}请填写真实出差地点后再计算。") policy = self._load_travel_policy() grade = self._resolve_grade(payload.grade, current_user) diff --git a/server/src/app/services/user_agent_application.py b/server/src/app/services/user_agent_application.py index 84a48ed..0f3b611 100644 --- a/server/src/app/services/user_agent_application.py +++ b/server/src/app/services/user_agent_application.py @@ -8,20 +8,25 @@ from sqlalchemy import or_, select from app.api.deps import CurrentUserContext from app.models.financial_record import ExpenseClaim +from app.schemas.reimbursement import TravelReimbursementCalculatorRequest from app.schemas.user_agent import ( UserAgentDraftPayload, UserAgentRequest, UserAgentResponse, UserAgentSuggestedAction, ) -from app.schemas.reimbursement import TravelReimbursementCalculatorRequest -from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy -from app.services.expense_claim_risk_stage import with_risk_business_stage -from app.services.travel_reimbursement_calculator import TravelReimbursementCalculatorService +from app.services.application_location_semantics import ( + strip_route_location_prefix_with_jieba, + validate_application_location_text, +) +from app.services.application_system_estimate import apply_application_system_estimate_to_facts from app.services.document_numbering import ( build_document_number, generate_unique_expense_claim_no, ) +from app.services.expense_claim_access_policy import ExpenseClaimAccessPolicy +from app.services.expense_claim_risk_stage import with_risk_business_stage +from app.services.travel_reimbursement_calculator import TravelReimbursementCalculatorService from app.services.user_agent_application_dates import ( expand_application_time_with_days, resolve_application_date_range, @@ -33,7 +38,6 @@ from app.services.user_agent_application_summary import ( build_application_summary_table, resolve_application_time_label, ) -from app.services.application_system_estimate import apply_application_system_estimate_to_facts APPLICATION_CONTEXT_VALUES = { "application", @@ -182,6 +186,17 @@ class UserAgentApplicationSlotMixin: if not str(facts.get(field) or "").strip() ] + @staticmethod + def _resolve_application_validation_issues(facts: dict[str, str]) -> list[dict[str, str]]: + issues: list[dict[str, str]] = [] + location_error = validate_application_location_text(facts.get("location", "")) + if location_error: + issues.append({ + "field": "location", + "message": location_error, + }) + return issues + def _resolve_application_missing_fields(self, facts: dict[str, str]) -> list[str]: return [ *self._resolve_application_missing_base_fields(facts), @@ -391,7 +406,11 @@ class UserAgentApplicationSlotMixin: if re.fullmatch(r"(?:去|到|前往)?[\u4e00-\u9fa5]{1,8}出差(?P\d+|[一二两三四五六七八九十]{1,3})?天?", text): return "" - text = re.sub(r"^.*?(?:出差|前往|去|到|赴)[\u4e00-\u9fa5]{1,8}(?:出差)?(?P\d+|[一二两三四五六七八九十]{1,3})?天?[,,\s]*", "", text) + tokenized = strip_route_location_prefix_with_jieba(text) + if tokenized != text: + text = tokenized + else: + text = re.sub(r"^.*?(?:出差|前往|去|到|赴)[\u4e00-\u9fa5]{1,8}(?:出差)?(?P\d+|[一二两三四五六七八九十]{1,3})?天?[,,\s]*", "", text) text = re.sub(r"^(?:出差|申请|费用申请|业务|本次|去|到|前往)\s*", "", text) text = text.strip(" ::,,。;;") if not text: @@ -537,8 +556,16 @@ class UserAgentApplicationSlotMixin: step: str, facts: dict[str, str], ) -> list[UserAgentSuggestedAction]: - if step == "ask_missing": - missing_fields = self._resolve_application_missing_fields(facts) + if step in {"ask_missing", "ask_invalid"}: + missing_fields = ( + self._resolve_application_missing_fields(facts) + if step == "ask_missing" + else [ + issue.get("field", "") + for issue in self._resolve_application_validation_issues(facts) + if issue.get("field") + ] + ) return [ UserAgentSuggestedAction( label="一次性补充申请信息", @@ -1209,7 +1236,22 @@ class UserAgentApplicationMixin(UserAgentApplicationSlotMixin, UserAgentApplicat "我已按「费用申请 / 事前审批」来处理这条内容。", "已识别信息:\n" + recognized_table, f"当前还需要补充:{missing_text}。", - "请一次性补齐上述字段,我会继续生成申请核对结果并让你确认是否提交。", + "请一次性补齐上述字段,我会继续生成申请核对结果,并请您确认是否提交。", + ] + ) + + if step == "ask_invalid": + issue_messages = [ + item["message"] + for item in self._resolve_application_validation_issues(facts) + if str(item.get("message") or "").strip() + ] + return "\n\n".join( + [ + "我已识别到申请信息里有需要先修正的字段。", + "已识别信息:\n" + recognized_table, + *issue_messages, + "请把地点改为真实出差地点,业务事项放在事由中;修正后我再帮您提交申请。", ] ) @@ -1473,7 +1515,7 @@ class UserAgentApplicationMixin(UserAgentApplicationSlotMixin, UserAgentApplicat pick("applicationType", "application_type") ), "time": pick("time", "timeRange", "time_range"), - "location": pick("location"), + "location": normalize_application_location(pick("location")), "reason": reason, "days": pick("days"), "transport_mode": pick("transportMode", "transport_mode"), @@ -1507,6 +1549,8 @@ class UserAgentApplicationMixin(UserAgentApplicationSlotMixin, UserAgentApplicat payload: UserAgentRequest, facts: dict[str, str], ) -> str: + if self._resolve_application_validation_issues(facts): + return "ask_invalid" if self._is_application_save_draft_action(payload): return "draft" if self._resolve_application_missing_base_fields(facts): @@ -1516,4 +1560,3 @@ class UserAgentApplicationMixin(UserAgentApplicationSlotMixin, UserAgentApplicat if self._is_application_submit_confirmation(payload): return "submitted" return "preview" - diff --git a/server/src/app/services/user_agent_application_locations.py b/server/src/app/services/user_agent_application_locations.py index 1f6b10d..ce791e9 100644 --- a/server/src/app/services/user_agent_application_locations.py +++ b/server/src/app/services/user_agent_application_locations.py @@ -2,7 +2,6 @@ from __future__ import annotations import re - DIRECT_MUNICIPALITY_DISPLAY = { "北京": "北京市", "北京市": "北京市", @@ -79,7 +78,7 @@ CITY_TO_PROVINCE = { } LOCATION_NOISE_PATTERN = re.compile( - r"(?:出差|驻场|现场|支撑|支持|部署|上线|实施|拜访|验收|会议|采购|培训|协助|处理|办理|参加|进行).*$" + r"(?:出差|驻场|现场|支撑|支持|辅助|部署|上线|实施|拜访|验收|会议|采购|培训|协助|处理|办理|参加|进行).*$" ) diff --git a/server/src/app/services/user_agent_knowledge.py b/server/src/app/services/user_agent_knowledge.py index 71043c2..275d4be 100644 --- a/server/src/app/services/user_agent_knowledge.py +++ b/server/src/app/services/user_agent_knowledge.py @@ -716,7 +716,7 @@ class UserAgentKnowledgeMixin(UserAgentKnowledgeHelpersMixin): self._append_markdown_section( answer_lines, "说明", - ["- 请补充费用类型、适用地区、职级或具体业务场景,我再继续帮你缩小范围。"], + ["- 请补充费用类型、适用地区、职级或具体业务场景,我会继续帮您缩小范围。"], ) return "\n".join(answer_lines).strip() @@ -729,7 +729,7 @@ class UserAgentKnowledgeMixin(UserAgentKnowledgeHelpersMixin): self._append_markdown_section( answer_lines, "说明", - ["- 以上只使用当前命中的知识库证据;没有在证据中出现的适用条件或金额,我不会替你默认补齐。"], + ["- 以上只使用当前命中的知识库证据;没有在证据中出现的适用条件或金额,我不会替您默认补齐。"], ) return "\n".join(answer_lines).strip() diff --git a/server/src/app/services/user_agent_response.py b/server/src/app/services/user_agent_response.py index db8b204..78e2985 100644 --- a/server/src/app/services/user_agent_response.py +++ b/server/src/app/services/user_agent_response.py @@ -61,7 +61,7 @@ class UserAgentResponseMixin: if payload.ontology.intent == "draft": tool_message = str(payload.tool_payload.get("message") or "").strip() if payload.tool_payload.get("draft_limit_reached"): - return tool_message or "你当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。" + return tool_message or "您当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。" if tool_message and ( str(payload.tool_payload.get("claim_id") or "").strip() or str(payload.tool_payload.get("claim_no") or "").strip() @@ -88,12 +88,12 @@ class UserAgentResponseMixin: elif attachment_names: attachment_hint = ( f" 我已带入 {len(attachment_names)} 份附件名称,但目前还不能直接读取附件内容," - "仍需要你补充关键信息。" + "还需要您补充关键信息。" ) return ( - "可以帮你发起报销。请补充费用类型、发生时间、金额、事由和相关对象," - "或者直接上传票据附件,我再继续帮你判断能否报、缺什么材料,并整理待核对信息。" + "可以帮您发起报销。请补充费用类型、发生时间、金额、事由和相关对象," + "或者直接上传票据附件,我会继续帮您判断能否报销、还缺哪些材料,并整理出待核对信息。" f"{attachment_hint}" ) @@ -122,8 +122,8 @@ class UserAgentResponseMixin: return ( f"已识别到一笔{time_text}的{expense_type}支出{amount_hint}。" - "如果要继续整理报销核对信息,还需要补充客户单位、参与人员、费用明细和票据附件。" - "你也可以继续上传发票或图片,我会把这些信息带入后续对话。" + "如果需要继续整理报销核对信息,还需要补充客户单位、参与人员、费用明细和票据附件。" + "您也可以继续上传发票或图片,我会把这些信息带入后续对话。" ) @@ -347,7 +347,7 @@ class UserAgentResponseMixin: query_payload = self._build_query_payload(payload) scope_label = str(data.get("scope_label") or subject).strip() or subject if query_payload is None: - return f"当前没有查到{scope_label}。你可以补充时间范围、单号或状态继续筛选。" + return f"当前没有查到{scope_label}。您可以补充时间范围、单号或状态继续筛选。" window_prefix = ( f"{query_payload.window_start_date} 至 {query_payload.window_end_date}" @@ -367,10 +367,10 @@ class UserAgentResponseMixin: f"另有 {query_payload.older_record_count} 笔超过 {query_payload.window_days} 日的单据," "请前往个人报销中心查看。" ) - return f"{window_prefix}没有查到{query_payload.scope_label}。你可以补充时间范围、单号或状态继续筛选。" + return f"{window_prefix}没有查到{query_payload.scope_label}。您可以补充时间范围、单号或状态继续筛选。" answer_parts = [ - f"已按你的筛选条件查询{query_payload.scope_label}。", + f"已按您的筛选条件查询{query_payload.scope_label}。", f"下面先列出最近 {query_payload.preview_count} 条记录,点击任一单据即可查看详情。", f"本次共命中 {query_payload.record_count} 笔,金额合计 {query_payload.total_amount:.2f} 元。", ] diff --git a/server/src/app/services/user_agent_review_core.py b/server/src/app/services/user_agent_review_core.py index 74808a4..0c66d85 100644 --- a/server/src/app/services/user_agent_review_core.py +++ b/server/src/app/services/user_agent_review_core.py @@ -68,8 +68,8 @@ class UserAgentReviewCoreMixin: if has_time: context_hint += ",并看到了业务发生时间" return ( - f"{context_hint}。但你还没有明确这笔单据属于哪类报销。" - "请先在下面选择报销场景,我会按你选择的场景再继续识别时间、地点、事由、金额和所需票据," + f"{context_hint}。但您还没有明确这笔单据属于哪类报销。" + "请先在下面选择报销场景,我会按您选择的场景继续识别时间、地点、事由、金额和所需票据," "避免系统先入为主把项目支持、部署等描述误判成差旅。" ) diff --git a/server/src/app/services/user_agent_review_messages.py b/server/src/app/services/user_agent_review_messages.py index bc1407a..82ddbeb 100644 --- a/server/src/app/services/user_agent_review_messages.py +++ b/server/src/app/services/user_agent_review_messages.py @@ -164,7 +164,7 @@ class UserAgentReviewMessageMixin: if payload.tool_payload.get("draft_limit_reached"): return ( str(payload.tool_payload.get("message") or "").strip() - or "你当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。" + or "您当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。" ) review_action = str(payload.context_json.get("review_action") or "").strip() @@ -254,11 +254,11 @@ class UserAgentReviewMessageMixin: if claim_no: return ( f"已识别出本次上传的 {document_count} 张票据。" - f"系统检测到你已有草稿 {claim_no},请选择关联到该草稿,或单独建立一张新的报销单。" + f"系统检测到您已有草稿 {claim_no},请选择关联到该草稿,或单独新建一张报销单。" ) return ( f"已识别出本次上传的 {document_count} 张票据。" - "系统检测到你已有可用草稿,请先选择关联到现有草稿,或单独建立一张新的报销单。" + "系统检测到您已有可用草稿,请先选择关联到现有草稿,或单独新建一张报销单。" ) blocked_reasons = self._resolve_submission_blocked_reasons(payload) diff --git a/server/src/app/services/user_agent_review_slots.py b/server/src/app/services/user_agent_review_slots.py index 3c58a28..928045d 100644 --- a/server/src/app/services/user_agent_review_slots.py +++ b/server/src/app/services/user_agent_review_slots.py @@ -668,7 +668,7 @@ class UserAgentReviewSlotMixin: status="missing" if is_missing else "identified" if source in {"user_text", "user_form"} else "inferred", hint=f"建议补充 {SLOT_LABELS.get(key, key)}。" if is_missing and required - else ("该字段来自系统辅助上下文,建议你再核对一次。" if source in {"detail_context", "ocr"} else ""), + else ("该字段来自系统辅助上下文,建议您再核对一次。" if source in {"detail_context", "ocr"} else ""), evidence=evidence, ) diff --git a/server/tests/test_agent_asset_service.py b/server/tests/test_agent_asset_service.py index 53b8225..2730b70 100644 --- a/server/tests/test_agent_asset_service.py +++ b/server/tests/test_agent_asset_service.py @@ -888,6 +888,34 @@ def test_travel_reimbursement_calculator_rejects_unrecognized_location() -> None ) +def test_travel_reimbursement_calculator_normalizes_location_mixed_with_business_content() -> None: + with build_session() as db: + db.add( + Employee( + employee_no="E9004", + name="混合地点员工", + email="mixed-location@example.com", + position="产品经理", + grade="P4", + ) + ) + db.commit() + + result = TravelReimbursementCalculatorService(db).calculate( + TravelReimbursementCalculatorRequest(days=4, location="上海辅助国网仿生产服务器"), + CurrentUserContext( + username="mixed-location@example.com", + name="混合地点员工", + role_codes=[], + is_admin=False, + ), + ) + + assert result.location == "上海市" + assert result.matched_city == "上海" + assert result.hotel_amount > 0 + + def test_agent_run_service_lists_seeded_trace_data() -> None: with build_session() as db: service = AgentRunService(db) diff --git a/server/tests/test_document_intelligence.py b/server/tests/test_document_intelligence.py index a94f0a5..01e9697 100644 --- a/server/tests/test_document_intelligence.py +++ b/server/tests/test_document_intelligence.py @@ -84,6 +84,33 @@ def test_document_intelligence_prefers_train_ticket_for_railway_e_ticket_invoice assert any(field.label == "金额" and field.value == "354元" for field in insight.fields) +def test_document_intelligence_train_ticket_uses_railway_merchant_not_invoice_title() -> None: + insight = build_document_insight( + filename="2月20_武汉-上海.pdf", + summary="电子发票(铁路电子客票);发票监;统一 制", + text=( + "电子发票(铁路电子客票)\n" + "发票号码:26429165800002785705 湖北\n" + "开票日期:2026年05月18日\n" + "武汉站 G458 上海虹桥站\n" + "Wuhan Shanghaihongqiao\n" + "2026年02月20日 07:55开 06车01B号 二等座\n" + "票价:¥354.00\n" + "4201061987****1615 曹笑竹\n" + "电子客票号:6580061086021391007342026\n" + "购买方名称:曹笑竹 统一社会信用代码:\n" + "买票请到12306 发货请到95306\n" + "中国铁路祝您旅途愉快" + ), + ) + + assert insight.document_type == "train_ticket" + fields = {field.label: field.value for field in insight.fields} + assert fields["商户"] == "中国铁路" + assert fields["金额"] == "354元" + assert fields["列车出发时间"] == "2026-02-20 07:55" + + def test_document_intelligence_recovers_train_ticket_from_english_station_ocr_text() -> None: insight = build_document_insight( filename="2月20_武汉-上海.pdf", diff --git a/server/tests/test_expense_claim_service.py b/server/tests/test_expense_claim_service.py index e911bcf..5c18d1a 100644 --- a/server/tests/test_expense_claim_service.py +++ b/server/tests/test_expense_claim_service.py @@ -28,6 +28,7 @@ from app.schemas.reimbursement import ( ) from app.services.agent_conversations import AgentConversationService from app.services.budget import BudgetService +from app.services.document_preview import DocumentPreviewAssets from app.services.expense_claim_attachment_storage import ExpenseClaimAttachmentStorage from app.services.expense_claim_budget_flow import ExpenseClaimBudgetFlowMixin from app.services.expense_claim_workflow_constants import ( @@ -3314,6 +3315,68 @@ def test_attachment_preview_resolves_legacy_filename_in_claim_item_directory(mon assert filename == "legacy-ticket.pdf" +def test_attachment_pdf_preview_falls_back_to_source_when_render_fonts_missing(monkeypatch, tmp_path) -> None: + current_user = CurrentUserContext( + username="emp-1", + name="张三", + role_codes=[], + is_admin=False, + ) + + monkeypatch.setattr(ExpenseClaimAttachmentStorage, "root", lambda self: tmp_path) + + with build_session() as db: + claim = build_claim(expense_type="train", location="上海") + db.add(claim) + db.commit() + + attachment_dir = tmp_path / claim.id / claim.items[0].id + attachment_dir.mkdir(parents=True) + file_path = attachment_dir / "2月20_武汉-上海.pdf" + preview_path = attachment_dir / "2月20_武汉-上海.preview.png" + file_path.write_bytes(b"%PDF-1.7 fake") + preview_path.write_bytes(b"broken-preview") + claim.items[0].invoice_id = f"{claim.id}/{claim.items[0].id}/{file_path.name}" + db.commit() + + storage = ExpenseClaimAttachmentStorage() + storage.write_meta( + file_path, + { + "file_name": file_path.name, + "storage_key": storage.to_storage_key(file_path), + "media_type": "application/pdf", + "previewable": True, + "preview_kind": "image", + "preview_storage_key": storage.to_storage_key(preview_path), + "preview_media_type": "image/png", + "preview_file_name": preview_path.name, + "preview_rendered_with": "pdftoppm-png-r160-poppler-data", + }, + ) + + def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds): + raise RuntimeError("Missing language pack for 'Adobe-GB1' mapping") + + monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page) + + resolved_path, media_type, filename = ExpenseClaimService(db).get_claim_item_attachment_preview_content( + claim_id=claim.id, + item_id=claim.items[0].id, + current_user=current_user, + ) + + assert resolved_path == file_path + assert media_type == "application/pdf" + assert filename == file_path.name + refreshed_meta = storage.read_meta(file_path) + assert refreshed_meta["preview_kind"] == "pdf" + assert refreshed_meta["preview_storage_key"] == storage.to_storage_key(file_path) + assert refreshed_meta["preview_media_type"] == "application/pdf" + assert refreshed_meta["preview_file_name"] == file_path.name + assert refreshed_meta["preview_rendered_with"] == "" + + def test_submit_claim_runs_ai_review_and_routes_to_direct_manager() -> None: current_user = CurrentUserContext( username="emp-submit@example.com", @@ -5199,6 +5262,103 @@ def test_admin_delete_claim_unlinks_receipt_folder_items(monkeypatch, tmp_path) get_settings.cache_clear() +def test_admin_delete_linked_reimbursement_resets_application_link_status() -> None: + admin_user = CurrentUserContext( + username="superadmin", + name="系统管理员", + role_codes=["admin"], + is_admin=True, + ) + + with build_session() as db: + application_claim = ExpenseClaim( + id="application-delete-linked-reimbursement", + claim_no="APP-DEL-LINKED-APPLICATION", + employee_name="张三", + department_name="交付部", + project_code="PRJ-A", + expense_type="travel_application", + reason="支撑国网仿生产环境部署", + location="上海", + amount=Decimal("3000.00"), + currency="CNY", + invoice_count=0, + occurred_at=datetime(2026, 6, 21, 22, 30, tzinfo=UTC), + submitted_at=datetime(2026, 6, 21, 22, 35, tzinfo=UTC), + status="approved", + approval_stage=APPLICATION_LINK_STATUS_STAGE, + risk_flags_json=[ + { + "source": "manual_approval", + "event_type": "expense_application_approval", + "operator": "向万红", + "previous_approval_stage": DIRECT_MANAGER_APPROVAL_STAGE, + "next_status": "approved", + "next_approval_stage": APPLICATION_LINK_STATUS_STAGE, + "generated_draft_claim_id": "reimbursement-delete-linked-application", + "generated_draft_claim_no": "RDELETE01", + "created_at": "2026-06-21T22:45:00+00:00", + } + ], + ) + reimbursement_claim = ExpenseClaim( + id="reimbursement-delete-linked-application", + claim_no="RDELETE01", + employee_name="张三", + department_name="交付部", + project_code="PRJ-A", + expense_type="travel", + reason="支撑国网仿生产环境部署报销", + location="上海", + amount=Decimal("3000.00"), + currency="CNY", + invoice_count=1, + occurred_at=datetime(2026, 6, 21, 22, 30, tzinfo=UTC), + submitted_at=None, + status="draft", + approval_stage="待提交", + risk_flags_json=[ + { + "source": "application_handoff", + "event_type": "expense_application_to_reimbursement_draft", + "application_claim_id": application_claim.id, + "application_claim_no": application_claim.claim_no, + } + ], + ) + db.add_all([application_claim, reimbursement_claim]) + db.commit() + + deleted = ExpenseClaimService(db).delete_claim(reimbursement_claim.id, admin_user) + + assert deleted is not None + assert deleted.claim_no == "RDELETE01" + assert db.get(ExpenseClaim, reimbursement_claim.id) is None + db.refresh(application_claim) + assert application_claim.status == "approved" + assert application_claim.approval_stage == APPLICATION_LINK_STATUS_STAGE + + approval_flag = next( + flag + for flag in application_claim.risk_flags_json + if isinstance(flag, dict) and flag.get("event_type") == "expense_application_approval" + ) + assert "generated_draft_claim_id" not in approval_flag + assert "generated_draft_claim_no" not in approval_flag + + sync_flag = next( + flag + for flag in application_claim.risk_flags_json + if isinstance(flag, dict) and flag.get("event_type") == "expense_application_reimbursement_deleted" + ) + assert sync_flag["source"] == "application_link_sync" + assert sync_flag["severity"] == "info" + assert sync_flag["actionability"] == "system_trace" + assert sync_flag["deleted_reimbursement_claim_id"] == "reimbursement-delete-linked-application" + assert sync_flag["deleted_reimbursement_claim_no"] == "RDELETE01" + assert sync_flag["next_approval_stage"] == APPLICATION_LINK_STATUS_STAGE + + def test_direct_manager_can_return_subordinate_claim_to_pending_submission() -> None: current_user = CurrentUserContext( username="manager-return@example.com", diff --git a/server/tests/test_notification_states.py b/server/tests/test_notification_states.py index 39d525c..4ce91b8 100644 --- a/server/tests/test_notification_states.py +++ b/server/tests/test_notification_states.py @@ -85,6 +85,31 @@ def test_notification_state_service_persists_user_scoped_read_and_hidden_state() assert other_saved.states[0].hidden_at is None +def test_notification_state_storage_ready_runs_once_per_database_bind(monkeypatch) -> None: + with build_session() as db: + service = NotificationStateService(db) + user = CurrentUserContext(username="alice", name="Alice", role_codes=[], is_admin=False) + calls: list[object] = [] + original_create_all = Base.metadata.create_all + + def track_create_all(*args, **kwargs): + calls.append(kwargs.get("bind")) + return original_create_all(*args, **kwargs) + + monkeypatch.setattr(Base.metadata, "create_all", track_create_all) + + service.list_states(user) + service.list_states(user) + service.patch_states( + NotificationStateBatchPatch( + states=[NotificationStatePatch(notification_id="workbench:todo:EXP-002", read=True)] + ), + user, + ) + + assert len(calls) == 1 + + def test_notification_state_endpoint_reads_and_updates_current_user_state() -> None: client = build_client() headers = {"x-auth-username": "alice", "x-auth-name": "Alice"} diff --git a/server/tests/test_ocr_service.py b/server/tests/test_ocr_service.py index 2e86a40..a3d2222 100644 --- a/server/tests/test_ocr_service.py +++ b/server/tests/test_ocr_service.py @@ -5,19 +5,23 @@ import subprocess from pathlib import Path from app.core.config import get_settings +from app.services import document_preview from app.services.ocr import OcrService -def test_ocr_runtime_installers_include_poppler_cjk_data() -> None: +def test_ocr_runtime_installers_include_cjk_safe_pdf_rendering_tools() -> None: repo_root = Path(__file__).resolve().parents[2] dependency_sources = [ repo_root / "docker-compose.yml", + repo_root / "docker-compose.full.yml", repo_root / "server" / "scripts" / "bootstrap_paddleocr_mobile.sh", repo_root / "server" / "scripts" / "bootstrap_paddleocr_gpu.sh", ] for path in dependency_sources: - assert "poppler-data" in path.read_text(encoding="utf-8") + content = path.read_text(encoding="utf-8") + assert "poppler-data" in content + assert "mupdf-tools" in content def test_ocr_service_uses_worker_runtime_and_keeps_unsupported_files_as_warnings( @@ -163,6 +167,7 @@ def test_ocr_service_passes_configured_device_to_worker( text: bool, timeout: int, check: bool, + env: dict[str, str] | None = None, ) -> subprocess.CompletedProcess[str]: captured_commands.append(command) return subprocess.CompletedProcess( @@ -194,12 +199,12 @@ def test_ocr_service_converts_pdf_to_images_and_returns_image_preview( monkeypatch, tmp_path: Path, ) -> None: - def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]: + def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]: first = output_dir / "page-1.png" second = output_dir / "page-2.png" first.write_bytes(b"fake-page-1") second.write_bytes(b"fake-page-2") - return [first, second] + return [first, second], True def fake_invoke_worker( self, @@ -281,26 +286,143 @@ def test_ocr_service_converts_pdf_to_images_and_returns_image_preview( assert recognized.lines[1].page_index == 1 -def test_ocr_service_uses_pdf_text_layer_without_worker_runtime( +def test_ocr_service_rejects_pdf_ocr_when_rendered_image_fonts_are_broken( monkeypatch, tmp_path: Path, ) -> None: - def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]: - page = output_dir / "page-1.png" - page.write_bytes(b"fake-rendered-page") - return [page] + def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]: + raise RuntimeError("PDF 转图片失败:检测到中文字体映射缺失,未生成可 OCR 的图片。") - def fail_resolve_python(self) -> str: - raise AssertionError("PDF 文本层可用时不应强制解析 OCR worker。") - - def fail_invoke_worker(self, **kwargs) -> dict: - raise AssertionError("PDF 文本层可用时不应调用 OCR worker。") + def fake_invoke_worker( + self, + *, + python_bin: str, + worker_path: str, + input_paths: list[Path], + ) -> dict: + raise AssertionError("PDF 转图片已确认丢中文时,不应继续调用 OCR worker。") monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) - monkeypatch.setattr(OcrService, "_resolve_python_bin", fail_resolve_python) + monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python") monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py") monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images) - monkeypatch.setattr(OcrService, "_invoke_worker", fail_invoke_worker) + monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker) + get_settings.cache_clear() + try: + result = OcrService().recognize_files( + [ + ("2月20_武汉-上海.pdf", b"%PDF-1.7 fake", "application/pdf"), + ] + ) + finally: + get_settings.cache_clear() + + failed = result.documents[0] + assert failed.line_count == 0 + assert failed.preview_kind == "" + assert failed.preview_data_url == "" + assert failed.warnings == ["PDF 转图片失败:检测到中文字体映射缺失,未生成可 OCR 的图片。"] + + +def test_ocr_pdf_conversion_tries_next_renderer_when_poppler_font_mapping_fails( + monkeypatch, + tmp_path: Path, +) -> None: + output_dir = tmp_path / "pages" + output_dir.mkdir() + calls: list[str] = [] + + def fake_run( + command: list[str], + *, + capture_output: bool, + text: bool, + timeout: int, + check: bool, + ) -> subprocess.CompletedProcess[str]: + calls.append(Path(command[0]).name) + if Path(command[0]).name == "pdftoppm": + (output_dir / "page-1.png").write_bytes(b"broken-preview") + return subprocess.CompletedProcess( + args=command, + returncode=0, + stdout="", + stderr="Syntax Error: Missing language pack for 'Adobe-GB1' mapping", + ) + (output_dir / "page-1.png").write_bytes(b"rendered-with-chinese") + return subprocess.CompletedProcess( + args=command, + returncode=0, + stdout="", + stderr="", + ) + + monkeypatch.setattr( + document_preview.shutil, + "which", + lambda name: f"/usr/bin/{name}" if name in {"pdftoppm", "mutool"} else None, + ) + monkeypatch.setattr(subprocess, "run", fake_run) + + pages, preview_usable = OcrService()._convert_pdf_to_images( + pdf_path=tmp_path / "ticket.pdf", + output_dir=output_dir, + ) + + assert pages == [output_dir / "page-1.png"] + assert preview_usable is True + assert calls == ["pdftoppm", "mutool"] + + +def test_ocr_service_invokes_worker_even_when_pdf_text_layer_is_usable( + monkeypatch, + tmp_path: Path, +) -> None: + calls = {"worker": 0} + + def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]: + page = output_dir / "page-1.png" + page.write_bytes(b"fake-rendered-page") + return [page], True + + def fake_invoke_worker( + self, + *, + python_bin: str, + worker_path: str, + input_paths: list[Path], + ) -> dict: + calls["worker"] += 1 + return { + "engine": "paddleocr_mobile", + "model": "PP-OCRv5_mobile", + "documents": [ + { + "input_path": str(input_paths[0]), + "engine": "paddleocr_mobile", + "model": "PP-OCRv5_mobile", + "text": "电子发票(铁路电子客票) 武汉站 上海虹桥站 G458 票价 ¥354.00", + "summary": "铁路电子客票", + "avg_score": 0.95, + "line_count": 1, + "page_count": 1, + "warnings": [], + "lines": [ + { + "text": "电子发票(铁路电子客票) 武汉站 上海虹桥站 G458 票价 ¥354.00", + "score": 0.95, + "box": [[1, 2], [10, 2], [10, 8], [1, 8]], + } + ], + } + ], + } + + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + monkeypatch.setattr(OcrService, "_resolve_python_bin", lambda self: "python") + monkeypatch.setattr(OcrService, "_resolve_worker_path", lambda self: "worker.py") + monkeypatch.setattr(OcrService, "_convert_pdf_to_images", fake_convert_pdf_to_images) + monkeypatch.setattr(OcrService, "_invoke_worker", fake_invoke_worker) monkeypatch.setattr( OcrService, "_extract_pdf_text_layer", @@ -326,9 +448,9 @@ def test_ocr_service_uses_pdf_text_layer_without_worker_runtime( recognized = result.documents[0] assert result.success_count == 1 + assert calls["worker"] == 1 assert recognized.document_type == "train_ticket" assert "电子发票(铁路电子客票)" in recognized.text - assert "电子客票号:6580061086021391007342026" in recognized.text assert any(field.label == "金额" and field.value == "354元" for field in recognized.document_fields) assert recognized.preview_kind == "image" assert recognized.preview_data_url.startswith("data:image/png;base64,") @@ -392,14 +514,22 @@ def test_ocr_service_reuses_cached_document_for_same_content( assert second.documents[0].summary == first.documents[0].summary +def test_ocr_cache_key_includes_pdf_render_pipeline_version() -> None: + cache_key = OcrService()._build_cache_key(b"same-pdf-content") + + assert "pdf-image-ocr:" in cache_key + assert document_preview.DocumentPreviewAssets.PDF_RENDERER_ID in cache_key + assert "no-pdf-direct" in cache_key + + def test_ocr_service_prefers_pdf_text_layer_when_rendered_ocr_is_placeholder_heavy( monkeypatch, tmp_path: Path, ) -> None: - def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> list[Path]: + def fake_convert_pdf_to_images(self, *, pdf_path: Path, output_dir: Path) -> tuple[list[Path], bool]: page = output_dir / "page-1.png" page.write_bytes(b"fake-page") - return [page] + return [page], True def fake_invoke_worker( self, diff --git a/server/tests/test_receipt_folder_service.py b/server/tests/test_receipt_folder_service.py index bc814f5..62cf2e8 100644 --- a/server/tests/test_receipt_folder_service.py +++ b/server/tests/test_receipt_folder_service.py @@ -4,7 +4,7 @@ import base64 from app.api.deps import CurrentUserContext from app.core.config import get_settings -from app.schemas.ocr import OcrRecognizeDocumentRead +from app.schemas.ocr import OcrRecognizeDocumentRead, OcrRecognizeFieldRead from app.services.document_preview import DocumentPreviewAssets from app.services.receipt_folder import ReceiptFolderService @@ -72,6 +72,55 @@ def test_receipt_folder_train_ticket_uses_invoice_date_and_enriches_fields(monke get_settings.cache_clear() +def test_receipt_folder_pdf_save_eagerly_renders_image_preview(monkeypatch, tmp_path) -> None: + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + get_settings.cache_clear() + try: + current_user = CurrentUserContext( + username="pytest", + name="Py Test", + role_codes=[], + is_admin=False, + ) + + def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds): + preview_path.write_bytes(b"rendered-preview") + return preview_path + + monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page) + + service = ReceiptFolderService() + receipt = service.save_receipt( + filename="2月20_武汉-上海.pdf", + content=b"%PDF-1.4 fake", + media_type="application/pdf", + current_user=current_user, + document=OcrRecognizeDocumentRead( + filename="2月20_武汉-上海.pdf", + media_type="application/pdf", + text="铁路电子客票 武汉 上海虹桥 354.00", + summary="铁路电子客票,武汉至上海虹桥。", + ), + ) + + receipt_dir = next(service.root.glob("pytest/*")) + preview_path = receipt_dir / "preview.png" + meta = service._read_meta(receipt_dir) + + assert receipt.preview_kind == "image" + assert preview_path.read_bytes() == b"rendered-preview" + assert meta["preview_file_name"] == "preview.png" + assert meta["preview_media_type"] == "image/png" + assert meta["preview_rendered_with"] == DocumentPreviewAssets.PDF_RENDERER_ID + + resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user) + assert resolved_path == preview_path + assert media_type == "image/png" + assert file_name == "preview.png" + finally: + get_settings.cache_clear() + + def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() @@ -123,6 +172,213 @@ def test_receipt_folder_pdf_preview_regenerates_stale_cached_image(monkeypatch, get_settings.cache_clear() +def test_receipt_folder_pdf_preview_falls_back_to_source_when_render_fonts_missing( + monkeypatch, + tmp_path, +) -> None: + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + get_settings.cache_clear() + try: + current_user = CurrentUserContext( + username="pytest", + name="Py Test", + role_codes=[], + is_admin=False, + ) + stale_preview = b"broken-preview" + preview_data_url = f"data:image/png;base64,{base64.b64encode(stale_preview).decode('ascii')}" + service = ReceiptFolderService() + receipt = service.save_receipt( + filename="2月20_武汉-上海.pdf", + content=b"%PDF-1.7 fake", + media_type="application/pdf", + current_user=current_user, + document=OcrRecognizeDocumentRead( + filename="2月20_武汉-上海.pdf", + media_type="application/pdf", + preview_kind="image", + preview_data_url=preview_data_url, + ), + ) + + receipt_dir = next(service.root.glob("pytest/*")) + meta = service._read_meta(receipt_dir) + meta["preview_rendered_with"] = "pdftoppm-png-r160-poppler-data" + service._write_meta(receipt_dir, meta) + + def fake_render_pdf_first_page(*, pdf_path, preview_path, timeout_seconds): + raise RuntimeError("Missing language pack for 'Adobe-GB1' mapping") + + monkeypatch.setattr(DocumentPreviewAssets, "render_pdf_first_page", fake_render_pdf_first_page) + + resolved_path, media_type, file_name = service.resolve_preview(receipt.id, current_user) + + assert resolved_path == receipt_dir / "2月20_武汉-上海.pdf" + assert media_type == "application/pdf" + assert file_name == "2月20_武汉-上海.pdf" + refreshed_meta = service._read_meta(receipt_dir) + assert refreshed_meta["preview_kind"] == "pdf" + assert refreshed_meta["preview_file_name"] == "2月20_武汉-上海.pdf" + assert refreshed_meta["preview_media_type"] == "application/pdf" + assert refreshed_meta["preview_rendered_with"] == "" + finally: + get_settings.cache_clear() + + +def test_receipt_folder_train_ticket_extracts_passenger_from_id_line_and_purchase_name( + monkeypatch, + tmp_path, +) -> None: + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + get_settings.cache_clear() + try: + current_user = CurrentUserContext( + username="pytest", + name="Py Test", + role_codes=[], + is_admin=False, + ) + service = ReceiptFolderService() + receipt = service.save_receipt( + filename="2月20_武汉-上海.pdf", + content=b"%PDF-1.4 fake", + media_type="application/pdf", + current_user=current_user, + document=OcrRecognizeDocumentRead( + filename="2月20_武汉-上海.pdf", + media_type="application/pdf", + text=( + "电子发票(铁路电子客票)\n" + "发票号码:26429165800002785705 湖北\n" + "开票日期:2026年05月18日\n" + "武汉站 G458 上海虹桥站\n" + "Wuhan Shanghaihongqiao\n" + "2026年02月20日 07:55开 06车01B号 二等座\n" + "票价:¥354.00\n" + "4201061987****1615 曹笑竹\n" + "电子客票号:6580061086021391007342026\n" + "购买方名称:曹笑竹 统一社会信用代码:\n" + "买票请到12306 发货请到95306\n" + "中国铁路祝您旅途愉快" + ), + summary="电子发票(铁路电子客票);发票监;统一 制", + document_type="train_ticket", + document_type_label="火车/高铁票", + scene_code="travel", + scene_label="差旅票据", + document_fields=[ + OcrRecognizeFieldRead(key="merchant_name", label="商户", value="电子发票(铁路"), + OcrRecognizeFieldRead(key="amount", label="金额", value="354元"), + OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-20 07:55"), + OcrRecognizeFieldRead(key="trip_no", label="车次", value="G458"), + OcrRecognizeFieldRead(key="route", label="行程", value="武汉-上海"), + ], + ), + ) + + assert receipt.merchant_name == "中国铁路" + + detail = service.get_receipt(receipt.id, current_user) + fields = {field.label: field.value for field in detail.fields} + assert fields["商户"] == "中国铁路" + assert fields["乘车人"] == "曹笑竹" + assert fields["出发地点"] == "武汉" + assert fields["到达地点"] == "上海虹桥" + assert fields["身份证号"] == "4201061987****1615" + assert fields["电子客票号"] == "6580061086021391007342026" + assert fields["开票日期"] == "2026-05-18" + assert fields["列车出发时间"] == "2026-02-20 07:55" + assert fields["车厢"] == "06车" + assert fields["座位号"] == "01B" + finally: + get_settings.cache_clear() + + +def test_receipt_folder_train_ticket_repairs_invalid_generated_fields_from_ocr_text( + monkeypatch, + tmp_path, +) -> None: + monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) + get_settings.cache_clear() + try: + current_user = CurrentUserContext( + username="pytest", + name="Py Test", + role_codes=[], + is_admin=False, + ) + service = ReceiptFolderService() + receipt = service.save_receipt( + filename="2月21日_上海-深圳.png", + content=b"fake image", + media_type="image/png", + current_user=current_user, + document=OcrRecognizeDocumentRead( + filename="2月21日_上海-深圳.png", + media_type="image/png", + text=( + "行程单示意\n" + "出票渠道:示例平台\n" + "非官方车票\n" + "不可报销\n" + "仅供演示\n" + "创建日期:2026年02月15日\n" + "订单号:DEMO202602210001\n" + "单据编号:DEMO-IT-000001\n" + "上海虹桥\n" + "G999\n" + "深圳北\n" + "站\n" + "站\n" + "Shanghaihongqiao\n" + "Shenzhenbei\n" + "2026年02月21日\n" + "08:30出发\n" + "全程约7小时30分\n" + "15:00到达\n" + "DEMO\n" + "乘客:示例旅客\n" + "车厢:05车\n" + "席别:二等座\n" + "-\n" + "扫码无效\n" + "证件号:310101199001010000\n" + "座位:08A\n" + "票价:¥438.00\n" + "仅为演示" + ), + summary="行程单示意;出票渠道:示例平台;非官方车票", + document_type="train_ticket", + document_type_label="火车/高铁票", + scene_code="travel", + scene_label="差旅票据", + document_fields=[ + OcrRecognizeFieldRead(key="amount", label="金额", value="438元"), + OcrRecognizeFieldRead(key="date", label="列车出发时间", value="2026-02-21 08:30"), + OcrRecognizeFieldRead(key="invoice_number", label="票据号码", value="DEMO202602210001"), + OcrRecognizeFieldRead(key="trip_no", label="车次", value="G999"), + OcrRecognizeFieldRead(key="route", label="行程", value="上海-深圳"), + OcrRecognizeFieldRead(key="departure_station", label="出发地点", value="二等座"), + OcrRecognizeFieldRead(key="arrival_station", label="到达地点", value="扫码无效"), + OcrRecognizeFieldRead(key="passenger_name", label="乘车人", value="席别二等座"), + ], + ), + ) + + detail = service.get_receipt(receipt.id, current_user) + fields = {field.label: field.value for field in detail.fields} + assert fields["出发地点"] == "上海虹桥" + assert fields["到达地点"] == "深圳北" + assert fields["乘车人"] == "示例旅客" + assert fields["身份证号"] == "310101199001010000" + assert fields["席别"] == "二等座" + assert fields["车厢"] == "05车" + assert fields["座位号"] == "08A" + assert fields["票价"] == "438.00元" + finally: + get_settings.cache_clear() + + def test_receipt_folder_delete_removes_duplicate_marker(monkeypatch, tmp_path) -> None: monkeypatch.setenv("STORAGE_ROOT_DIR", str(tmp_path / "storage")) get_settings.cache_clear() diff --git a/server/tests/test_user_agent_service.py b/server/tests/test_user_agent_service.py index e0573c7..3921b06 100644 --- a/server/tests/test_user_agent_service.py +++ b/server/tests/test_user_agent_service.py @@ -15,6 +15,7 @@ from app.models.financial_record import ExpenseClaim from app.schemas.ontology import OntologyParseRequest from app.schemas.user_agent import UserAgentCitation, UserAgentRequest, UserAgentReviewRiskBrief from app.services.agent_assets import AgentAssetService +from app.services.application_location_semantics import resolve_jieba_tokens from app.services.ontology import SemanticOntologyService from app.services.user_agent import UserAgentService from app.services.user_agent_documents import UserAgentDocumentService @@ -763,6 +764,67 @@ def test_user_agent_application_submit_blocks_overlapping_travel_dates() -> None assert response.draft_payload is None +def test_user_agent_application_submit_normalizes_location_mixed_with_business_content() -> None: + session_factory = build_session_factory() + with session_factory() as db: + response = build_application_user_agent_response( + db, + "确认提交", + context_overrides={ + "manager_name": "向万红", + "application_preview": { + "fields": { + "applicationType": "差旅费用申请", + "time": "2026-02-20 至 2026-02-23", + "location": "上海辅助国网仿生产服务器", + "reason": "辅助国网仿生产服务器部署", + "days": "4天", + "transportMode": "火车", + "amount": "2120元", + } + }, + }, + ) + + claim = application_claim_query(db).one() + assert claim.location == "上海市" + assert claim.reason == "辅助国网仿生产服务器部署" + assert "申请单据已生成" in response.answer + assert response.draft_payload is not None + + +def test_user_agent_application_submit_splits_location_and_reason_from_raw_sentence() -> None: + session_factory = build_session_factory() + with session_factory() as db: + response = build_application_user_agent_response( + db, + "确认提交", + history=[ + { + "role": "user", + "content": "2026-02-20 至 2026-02-23,去上海辅助国网仿生产服务器部署,火车", + } + ], + context_overrides={ + "manager_name": "向万红", + "grade": "P5", + "department_name": "技术部", + }, + ) + + claim = application_claim_query(db).one() + assert claim.location == "上海市" + assert claim.reason == "辅助国网仿生产服务器部署" + assert "申请单据已生成" in response.answer + + +def test_application_sentence_jieba_tokenizer_recognizes_location_boundary() -> None: + tokens = resolve_jieba_tokens("上海辅助国网仿生产服务器部署") + + assert ("上海", "ns") in tokens + assert [word for word, _ in tokens] == ["上海", "辅助", "国网", "仿生产", "服务器", "部署"] + + def test_user_agent_application_maps_preview_travel_type_label() -> None: session_factory = build_session_factory() with session_factory() as db: @@ -2155,7 +2217,7 @@ def test_user_agent_returns_draft_limit_message_when_save_is_blocked() -> None: context_json={"review_action": "save_draft"}, tool_payload={ "draft_limit_reached": True, - "message": "你当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。", + "message": "您当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。", "status": "blocked", }, ) @@ -2163,7 +2225,7 @@ def test_user_agent_returns_draft_limit_message_when_save_is_blocked() -> None: assert ( response.answer - == "你当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。" + == "您当前已保存 3 个草稿,请先完成已保存的草稿,才能再次新建草稿。" )