feat: 增强知识库索引与设置页面模块化拆分

扩展知识库索引任务和 RAG 检索支持增量入库和文档去重,优
化本体检测和规则匹配精度,前端设置页面拆分为 LLM、邮件
和 Hermes 员工同步子面板并重构样式,新增日志详情组件和
知识入库日志模型,补充单元测试覆盖。
This commit is contained in:
caoxiaozhu
2026-05-22 23:47:28 +08:00
parent 88ff04bef8
commit 5b388d08c0
84 changed files with 10170 additions and 2599 deletions

View File

@@ -120,7 +120,7 @@ class UserAgentReviewDocumentCard(BaseModel):
filename: str = Field(description="原始文件名。")
document_type: str = Field(default="other", description="票据候选类型。")
suggested_expense_type: str = Field(default="other", description="建议归属费用类型。")
scene_label: str = Field(default="", description="面向用户展示的场景标签。")
scene_label: str = Field(default="", description="面向用户展示的票据类型标签。")
summary: str = Field(default="", description="逐票据摘要。")
avg_score: float = Field(default=0.0, ge=0.0, le=1.0, description="OCR 平均得分。")
preview_kind: str = Field(default="", description="票据预览类型,例如 image。")

View File

@@ -3,6 +3,8 @@ from __future__ import annotations
import re
from decimal import Decimal
from app.services.expense_type_keywords import iter_expense_keywords
EXPENSE_TYPE_LABELS = {
"travel": "差旅",
"train_ticket": "火车票",
@@ -12,10 +14,10 @@ EXPENSE_TYPE_LABELS = {
"travel_allowance": "出差补贴",
"hotel": "住宿",
"transport": "交通",
"meal": "餐费",
"meal": "业务招待",
"meeting": "会务",
"entertainment": "招待",
"office": "办公",
"office": "办公用品",
"training": "培训",
"communication": "通讯",
"welfare": "福利",
@@ -131,40 +133,19 @@ DOCUMENT_ROUTE_DESTINATION_LABELS = {
GENERIC_ATTACHMENT_BACKFILL_ITEM_TYPES = {"", "other", "travel", "transport", "hotel"}
LOCATION_REQUIRED_EXPENSE_TYPES = {"travel", "meeting", "entertainment"}
EXPENSE_SCENE_KEYWORDS = {
"travel": ("差旅", "出差", "行程"),
"hotel": ("酒店", "住宿", "房费", "客房", "入住", "离店"),
"transport": (
"交通",
"打车",
"出租车",
"网约车",
"滴滴",
"出行",
"乘车",
"用车",
"叫车",
"车费",
"车资",
"的士",
"高铁",
"动车",
"火车",
"机票",
"航班",
"行程单",
"登机",
"客票",
"公交",
"地铁",
"过路费",
"通行费",
"停车",
),
"meal": ("餐饮", "餐费", "用餐", "外卖", "快餐", "酒楼", "饭店", "饭馆", "食品", "咖啡"),
"entertainment": ("招待", "宴请", "接待", "客户餐", "商务餐", "业务招待"),
"office": ("办公", "办公用品", "文具", "耗材", "打印", "纸张", "硒鼓", "墨盒", "鼠标", "键盘", "电脑"),
"meeting": ("会议", "会务", "会展", "会议室", "会场", "场地费", "论坛"),
"training": ("培训", "课程", "讲师", "教材", "学费", "认证"),
code: tuple(iter_expense_keywords(code))
for code in (
"travel",
"hotel",
"transport",
"meal",
"entertainment",
"office",
"meeting",
"training",
"communication",
"welfare",
)
}
EXPENSE_TYPE_ALLOWED_DOCUMENT_SCENES = {
"travel": {"travel", "hotel", "transport", "meal"},
@@ -185,7 +166,7 @@ DOCUMENT_SCENE_LABELS = {
"travel": "差旅",
"hotel": "住宿",
"transport": "交通",
"meal": "餐饮",
"meal": "业务招待",
"entertainment": "业务招待",
"office": "办公用品",
"meeting": "会务",

View File

@@ -87,6 +87,7 @@ from app.services.expense_claim_constants import (
TRAVEL_POLICY_TRAIN_CLASS_PATTERNS,
TRAVEL_POLICY_HOTEL_NIGHT_PATTERN,
)
from app.services.expense_type_keywords import resolve_expense_type_code_from_text
from app.services.expense_claim_risk_review import ExpenseClaimRiskReviewMixin
from app.services.expense_amounts import (
extract_amount_candidates,
@@ -209,26 +210,7 @@ class ExpenseClaimOntologyResolverMixin:
or ""
).replace(" ", "")
if compact:
if "招待" in compact or ("客户" in compact and any(word in compact for word in ("吃饭", "宴请", "请客", "用餐"))):
return "entertainment"
if any(word in compact for word in ("差旅", "出差", "机票", "行程")):
return "travel"
if any(word in compact for word in ("住宿", "酒店", "宾馆")):
return "hotel"
if any(word in compact for word in ("交通", "打车", "网约车", "出租车", "乘车", "用车", "叫车", "车费", "车资", "的士", "停车")):
return "transport"
if any(word in compact for word in ("餐费", "用餐", "午餐", "晚餐", "早餐", "伙食")):
return "meal"
if "会务" in compact:
return "meeting"
if any(word in compact for word in ("办公费", "办公用品", "文具", "耗材", "办公耗材", "打印纸", "办公设备", "键盘", "鼠标", "白板")):
return "office"
if any(word in compact for word in ("培训费", "培训", "讲师费", "课时费", "课程费")):
return "training"
if any(word in compact for word in ("通讯费", "话费", "流量费", "宽带费")):
return "communication"
if any(word in compact for word in ("福利费", "团建", "慰问", "节日福利", "体检费")):
return "welfare"
return resolve_expense_type_code_from_text(compact)
return None
@staticmethod

View File

@@ -538,8 +538,8 @@ class ExpenseRuleRuntimeService:
if any(keyword in normalized for keyword in ("市内交通", "打车", "网约车", "出租车")):
return "transport"
if "招待" in normalized and "" in normalized:
return "entertainment"
if "餐补" in normalized or normalized == "餐费":
return "meal"
if "餐补" in normalized or normalized in {"餐费", "业务招待费"}:
return "meal"
return ""
@@ -547,7 +547,7 @@ class ExpenseRuleRuntimeService:
def _spreadsheet_metric_label(expense_type: str) -> str:
return {
"transport": "单笔交通金额",
"meal": "差旅餐补金额",
"meal": "业务招待费金额",
"entertainment": "人均招待餐费",
}.get(expense_type, "金额")

View File

@@ -8,6 +8,7 @@ EXPENSE_RULE_CODE_BLOCK_PATTERN = re.compile(r"```expense-rule\s*(\{.*?\})\s*```
DOCUMENT_TYPE_LABELS = {
"flight_itinerary": "机票/航班行程单",
"train_ticket": "火车/高铁票",
"ship_ticket": "轮船票",
"hotel_invoice": "酒店住宿票据",
"taxi_receipt": "出租车/网约车票据",
"parking_toll_receipt": "停车/通行费票据",
@@ -24,9 +25,9 @@ SCENE_LABELS = {
"travel": "差旅",
"hotel": "住宿",
"transport": "交通",
"meal": "餐饮",
"meal": "业务招待",
"entertainment": "业务招待",
"office": "办公",
"office": "办公用品",
"meeting": "会务",
"training": "培训",
"communication": "通讯",
@@ -73,7 +74,7 @@ DEFAULT_SCENE_MATRIX_CONFIG: dict[str, Any] = {
},
},
"meal": {
"label": "",
"label": "业务招待",
"location_required": False,
"min_attachment_count": 1,
"allowed_scene_codes": ["meal"],
@@ -84,7 +85,7 @@ DEFAULT_SCENE_MATRIX_CONFIG: dict[str, Any] = {
"warn_amount": "300.00",
"block_amount": "800.00",
"exception_keywords": ["客户接待", "团队活动", "加班", "展会", "超标说明"],
"metric_label": "费合计",
"metric_label": "业务招待费合计",
},
},
"entertainment": {
@@ -103,7 +104,7 @@ DEFAULT_SCENE_MATRIX_CONFIG: dict[str, Any] = {
},
},
"office": {
"label": "办公费",
"label": "办公用品",
"location_required": False,
"min_attachment_count": 1,
"allowed_scene_codes": ["office"],
@@ -114,7 +115,7 @@ DEFAULT_SCENE_MATRIX_CONFIG: dict[str, Any] = {
"warn_amount": "1500.00",
"block_amount": "5000.00",
"exception_keywords": ["批量采购", "固定资产", "部门集中采购", "超标说明"],
"metric_label": "办公费合计",
"metric_label": "办公用品费合计",
},
},
"meeting": {

View File

@@ -0,0 +1,245 @@
from __future__ import annotations
from typing import Iterable
# Keyword table for expense types. Each entry is ``(code, label, keywords)``.
#
# Order matters: resolve_expense_type_code_from_text() walks the groups in
# this order and returns the code of the first group that has any keyword
# contained in the text, and build_expense_type_keyword_map() keeps the first
# group's code when a keyword appears more than once.
EXPENSE_TYPE_KEYWORD_GROUPS: tuple[tuple[str, str, tuple[str, ...]], ...] = (
    (
        "travel",
        "差旅费",
        (
            "差旅费",
            "差旅",
            "出差",
            "外地出差",
            "跨城交通",
            "往返车票",
            "机票",
            "飞机票",
            "航班",
            "登机牌",
            "行程单",
            "火车票",
            "高铁票",
            "动车票",
            "铁路客票",
            "客票",
        ),
    ),
    (
        "hotel",
        "住宿费",
        (
            "住宿费",
            "住宿",
            "酒店发票",
            "酒店",
            "宾馆",
            "民宿",
            "房费",
            "客房",
            "住店",
            "入住",
            "离店",
            "住宿清单",
        ),
    ),
    (
        "transport",
        "交通费",
        (
            "交通费",
            "交通",
            "市内交通",
            "打车",
            "网约车",
            "出租车票",
            "出租车",
            "的士票",
            "的士",
            "滴滴",
            "曹操出行",
            "T3出行",
            "出行",
            "乘车费",
            "乘车",
            "用车",
            "叫车",
            "车费",
            "车资",
            "公交",
            "地铁",
            "停车费",
            "停车",
            "过路费",
            "通行费",
            "高速费",
            "燃油费",
            "油费",
        ),
    ),
    (
        "meal",
        "业务招待费",
        (
            "业务招待费",
            "业务招待",
            "招待费",
            "招待",
            "客户招待",
            "客户接待",
            "商务接待",
            "商务宴请",
            "宴请",
            "请客",
            "请客户",
            "客户用餐",
            "客户餐",
            "客户吃饭",
            "陪同用餐",
            "接待餐",
            "餐费",
            "伙食费",
            "伙食",
            "工作餐",
            "餐饮",
            "用餐",
            "早餐",
            "午餐",
            "晚餐",
            "夜宵",
            "盒饭",
            "茶歇",
            "餐票",
            "饭票",
        ),
    ),
    (
        "meeting",
        "会务费",
        (
            "会务费",
            "会务",
            "会议费",
            "会议",
            "参会",
            "会场",
            "场地费",
            "论坛",
            "展会",
            "研讨会",
            "峰会",
            "布展",
        ),
    ),
    (
        "office",
        "办公用品费",
        (
            "办公用品费",
            "办公费",
            "办公用品",
            "办公耗材",
            "办公设备",
            "办公",
            "文具",
            "耗材",
            "打印纸",
            "打印",
            "纸张",
            "硒鼓",
            "墨盒",
            "键盘",
            "鼠标",
            "白板",
            "电脑配件",
        ),
    ),
    (
        "training",
        "培训费",
        (
            "培训费",
            "培训",
            "讲师费",
            "讲师",
            "课时费",
            "课程费",
            "课程",
            "教材",
            "学费",
            "考试费",
            "认证费",
            "认证",
        ),
    ),
    (
        "communication",
        "通讯费",
        (
            "通讯费",
            "通讯",
            "话费",
            "电话费",
            "手机费",
            "流量费",
            "流量",
            "宽带费",
            "宽带",
            "网络费",
        ),
    ),
    (
        "welfare",
        "福利费",
        (
            "福利费",
            "福利",
            "团建",
            "慰问",
            "节日福利",
            "体检费",
            "体检",
            "员工关怀",
        ),
    ),
)
# Display label for every expense type code listed in the table above.
EXPENSE_TYPE_LABEL_BY_CODE = {
    code: label for code, label, _keywords in EXPENSE_TYPE_KEYWORD_GROUPS
}
# No keyword group uses the "entertainment" code, so its label is registered
# explicitly here.
EXPENSE_TYPE_LABEL_BY_CODE.setdefault("entertainment", "业务招待费")
def build_expense_type_keyword_map() -> dict[str, str]:
    """Return a keyword -> expense type code lookup built from the keyword groups.

    When a keyword is listed under more than one group, the code of the
    earliest group in ``EXPENSE_TYPE_KEYWORD_GROUPS`` is kept.
    """
    keyword_to_code: dict[str, str] = {}
    for type_code, _, group_keywords in EXPENSE_TYPE_KEYWORD_GROUPS:
        for word in group_keywords:
            if word not in keyword_to_code:
                keyword_to_code[word] = type_code
    return keyword_to_code
def iter_expense_keywords(*codes: str) -> Iterable[str]:
    """Yield the keywords of the requested expense type codes, in table order.

    Blank or empty codes are ignored. When no usable code is given, the
    keywords of every group are yielded.
    """
    wanted: set[str] = set()
    for raw_code in codes:
        cleaned = str(raw_code or "").strip()
        if cleaned:
            wanted.add(cleaned)
    for group_code, _, group_keywords in EXPENSE_TYPE_KEYWORD_GROUPS:
        if not wanted or group_code in wanted:
            for word in group_keywords:
                yield word
def resolve_expense_type_code_from_text(value: str) -> str | None:
    """Return the code of the first keyword group matching ``value``, or ``None``.

    ASCII spaces are removed before matching. Groups are checked in the order
    they appear in ``EXPENSE_TYPE_KEYWORD_GROUPS``; a group matches when any of
    its keywords is a substring of the text.
    """
    text = str(value or "").replace(" ", "")
    if text:
        for type_code, _, group_keywords in EXPENSE_TYPE_KEYWORD_GROUPS:
            for word in group_keywords:
                if word in text:
                    return type_code
    return None
def resolve_expense_type_label_from_text(value: str) -> tuple[str, str] | None:
    """Return ``(code, label)`` for the expense type detected in ``value``.

    Returns ``None`` when no keyword group matches. If the code has no entry
    in ``EXPENSE_TYPE_LABEL_BY_CODE``, the stripped input text is used as the
    label.
    """
    type_code = resolve_expense_type_code_from_text(value)
    if type_code:
        fallback_label = str(value or "").strip()
        return type_code, EXPENSE_TYPE_LABEL_BY_CODE.get(type_code, fallback_label)
    return None

View File

@@ -63,6 +63,7 @@ class KnowledgeIndexTaskManager:
heartbeat_stop = threading.Event()
heartbeat_thread: threading.Thread | None = None
tool_call_id = ""
knowledge_ingest: dict[str, Any] | None = None
tool_request_json = {
"agent": AgentName.HERMES.value,
"folder": folder,
@@ -74,6 +75,10 @@ class KnowledgeIndexTaskManager:
run_service = AgentRunService(db)
knowledge_service = KnowledgeService(db=db)
rag_service = KnowledgeRagService(db=db)
knowledge_ingest = _build_initial_knowledge_ingest_state(
knowledge_service,
document_ids=document_ids,
)
run_service.merge_route_json(
agent_run_id,
@@ -93,7 +98,18 @@ class KnowledgeIndexTaskManager:
"skipped_documents": 0,
"percent": 10 if document_ids else 100,
},
"knowledge_ingest": knowledge_ingest,
},
result_summary=_build_ingest_running_summary(
knowledge_ingest,
{
"total_documents": len(document_ids),
"completed_documents": 0,
"failed_documents": 0,
"skipped_documents": 0,
"percent": 10 if document_ids else 100,
},
),
)
tool_call = run_service.record_tool_call(
run_id=agent_run_id,
@@ -134,44 +150,159 @@ class KnowledgeIndexTaskManager:
)
heartbeat_thread.start()
response = rag_service.index_documents(document_ids=document_ids, force=force)
succeeded_document_ids = [
str(item).strip()
for item in list(response.get("succeeded_document_ids") or [])
if str(item).strip()
]
failed_documents = [
item
for item in list(response.get("failed_documents") or [])
if isinstance(item, dict)
]
responses: list[dict[str, Any]] = []
succeeded_document_ids: list[str] = []
failed_documents: list[dict[str, str]] = []
total_documents = len(document_ids)
for index, document_id in enumerate(document_ids, start=1):
_patch_ingest_document(
knowledge_ingest,
document_id,
{
"status": "running",
"phase": "indexing",
"started_at": datetime.now(UTC).isoformat(),
},
event=f"开始处理第 {index}/{total_documents} 个文件,正在写入 LightRAG。",
)
knowledge_ingest["current_document_id"] = document_id
_sync_ingest_route_json(
run_service,
agent_run_id,
knowledge_ingest,
progress=_build_ingest_progress(knowledge_ingest, total_documents),
)
try:
response = rag_service.index_documents(document_ids=[document_id], force=force)
except Exception as exc:
logger.exception(
"Knowledge document index failed run_id=%s doc_id=%s",
agent_run_id,
document_id,
)
failed_documents.append(
{
"document_id": document_id,
"status": "exception",
"error": str(exc),
}
)
_patch_ingest_document(
knowledge_ingest,
document_id,
{
"status": "failed",
"phase": "failed",
"finished_at": datetime.now(UTC).isoformat(),
"error": str(exc),
},
event=f"归集失败:{exc}",
level="error",
)
knowledge_service.set_document_ingest_statuses(
[document_id],
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
_refresh_ingest_graph(knowledge_ingest)
_sync_ingest_route_json(
run_service,
agent_run_id,
knowledge_ingest,
progress=_build_ingest_progress(knowledge_ingest, total_documents),
)
continue
responses.append(response)
response_failed_documents = _extract_failed_documents(response, document_id)
document_summary = _extract_document_summary(response, document_id)
if response_failed_documents:
failed_documents.extend(response_failed_documents)
error_text = (
response_failed_documents[0].get("error") or "LightRAG 未返回可查询状态"
)
_patch_ingest_document(
knowledge_ingest,
document_id,
{
**document_summary,
"status": "failed",
"phase": "failed",
"finished_at": datetime.now(UTC).isoformat(),
"error": error_text,
"track_id": str(response.get("track_id") or "").strip(),
},
event=f"LightRAG 索引失败:{error_text}",
level="error",
)
knowledge_service.set_document_ingest_statuses(
[document_id],
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
else:
succeeded_document_ids.append(document_id)
chunk_count = int(document_summary.get("chunk_count") or 0)
entity_count = int(document_summary.get("entity_count") or 0)
relation_count = int(document_summary.get("relation_count") or 0)
_patch_ingest_document(
knowledge_ingest,
document_id,
{
**document_summary,
"status": "succeeded",
"phase": "indexed",
"finished_at": datetime.now(UTC).isoformat(),
"track_id": str(response.get("track_id") or "").strip(),
},
event=(
"LightRAG 索引完成:"
f"{chunk_count} 个 chunk{entity_count} 个实体,"
f"{relation_count} 条关系。"
),
)
knowledge_service.set_document_ingest_statuses(
[document_id],
KNOWLEDGE_INGEST_STATUS_INGESTED,
agent_run_id=agent_run_id,
)
_refresh_ingest_graph(knowledge_ingest)
_sync_ingest_route_json(
run_service,
agent_run_id,
knowledge_ingest,
progress=_build_ingest_progress(knowledge_ingest, total_documents),
)
failed_document_ids = [
str(item.get("document_id") or "").strip()
for item in failed_documents
if str(item.get("document_id") or "").strip()
]
if succeeded_document_ids:
knowledge_service.set_document_ingest_statuses(
succeeded_document_ids,
KNOWLEDGE_INGEST_STATUS_INGESTED,
agent_run_id=agent_run_id,
)
if failed_document_ids:
knowledge_service.set_document_ingest_statuses(
failed_document_ids,
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
duration_ms = int((perf_counter() - started) * 1000)
tool_status = "succeeded" if not failed_document_ids else "failed"
latest_track_id = _resolve_latest_track_id(responses)
knowledge_ingest["current_document_id"] = ""
knowledge_ingest["status"] = tool_status
knowledge_ingest["phase"] = "completed"
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
heartbeat_stop.set()
if heartbeat_thread is not None:
heartbeat_thread.join(timeout=1)
run_service.update_tool_call(
tool_call_id,
response_json=response,
response_json={
"track_id": latest_track_id,
"requested_document_ids": document_ids,
"succeeded_document_ids": succeeded_document_ids,
"failed_documents": failed_documents,
"documents": knowledge_ingest.get("documents", []),
"responses": responses,
},
status=tool_status,
duration_ms=duration_ms,
error_message=None if tool_status == "succeeded" else "部分文档索引失败。",
@@ -183,14 +314,17 @@ class KnowledgeIndexTaskManager:
summary = (
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引。"
if failed_count == 0
else f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引,失败 {failed_count} 个。"
else (
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引,"
f"失败 {failed_count} 个。"
)
)
run_service.merge_route_json(
agent_run_id,
{
"job_type": "knowledge_index_sync",
"phase": "completed",
"track_id": str(response.get("track_id") or "").strip(),
"track_id": latest_track_id,
"heartbeat_at": datetime.now(UTC).isoformat(),
"progress": {
"total_documents": total_documents,
@@ -199,6 +333,7 @@ class KnowledgeIndexTaskManager:
"skipped_documents": 0,
"percent": 100,
},
"knowledge_ingest": knowledge_ingest,
},
status=(
AgentRunStatus.SUCCEEDED.value
@@ -234,24 +369,50 @@ class KnowledgeIndexTaskManager:
error_message=str(exc),
)
KnowledgeService(db=db).set_document_ingest_statuses(
document_ids,
_resolve_failed_ingest_document_ids(knowledge_ingest, document_ids),
KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
if knowledge_ingest is not None:
for document_id in document_ids:
document = _find_ingest_document(knowledge_ingest, document_id)
if document is None or document.get("status") in {"succeeded", "failed"}:
continue
_patch_ingest_document(
knowledge_ingest,
document_id,
{
"status": "failed",
"phase": "failed",
"finished_at": datetime.now(UTC).isoformat(),
"error": str(exc),
},
event=f"归集任务中断:{exc}",
level="error",
)
knowledge_ingest["status"] = "failed"
knowledge_ingest["phase"] = "failed"
knowledge_ingest["current_document_id"] = ""
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
route_payload: dict[str, Any] = {
"job_type": "knowledge_index_sync",
"phase": "failed",
"heartbeat_at": datetime.now(UTC).isoformat(),
"progress": {
"total_documents": len(document_ids),
"completed_documents": 0,
"failed_documents": len(document_ids),
"skipped_documents": 0,
"percent": 100,
},
}
if knowledge_ingest is not None:
route_payload["knowledge_ingest"] = knowledge_ingest
AgentRunService(db).merge_route_json(
agent_run_id,
{
"job_type": "knowledge_index_sync",
"phase": "failed",
"heartbeat_at": datetime.now(UTC).isoformat(),
"progress": {
"total_documents": len(document_ids),
"completed_documents": 0,
"failed_documents": len(document_ids),
"skipped_documents": 0,
"percent": 100,
},
},
route_payload,
status=AgentRunStatus.FAILED.value,
result_summary=str(exc),
error_message=str(exc),
@@ -267,4 +428,312 @@ class KnowledgeIndexTaskManager:
db.close()
def _build_initial_knowledge_ingest_state(
    knowledge_service: KnowledgeService,
    *,
    document_ids: list[str],
) -> dict[str, Any]:
    """Build the initial ``knowledge_ingest`` state for a batch of documents.

    Every document starts as queued and shares one timestamp; the first
    document (if any) is recorded as the current one.
    """
    queued_at = datetime.now(UTC).isoformat()
    document_states = []
    for document_id in document_ids:
        document_states.append(
            _build_initial_knowledge_ingest_document(
                knowledge_service,
                document_id,
                now=queued_at,
            )
        )
    first_document_id = document_states[0]["document_id"] if document_states else ""
    initial_graph = _build_ingest_graph({"documents": document_states})
    return {
        "schema_version": 1,
        "status": "running",
        "phase": "queued",
        "started_at": queued_at,
        "finished_at": None,
        "current_document_id": first_document_id,
        "documents": document_states,
        "graph": initial_graph,
    }
def _build_initial_knowledge_ingest_document(
knowledge_service: KnowledgeService,
document_id: str,
*,
now: str,
) -> dict[str, Any]:
try:
entry = knowledge_service.get_document_entry(document_id)
except Exception:
entry = {}
return {
"document_id": document_id,
"name": str(entry.get("original_name") or document_id).strip(),
"folder": str(entry.get("folder") or "").strip(),
"extension": str(entry.get("extension") or "").strip(),
"mime_type": str(entry.get("mime_type") or "").strip(),
"status": "queued",
"phase": "queued",
"started_at": None,
"finished_at": None,
"text_chars": 0,
"indexed_text_chars": 0,
"section_count": 0,
"sections": [],
"chunk_count": 0,
"chunk_ids": [],
"chunks": [],
"entity_count": 0,
"relation_count": 0,
"entities": [],
"relations": [],
"events": [
{
"at": now,
"level": "info",
"message": "已进入知识归集队列,等待 LightRAG 处理。",
}
],
}
def _patch_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
    updates: dict[str, Any],
    *,
    event: str = "",
    level: str = "info",
) -> None:
    """Merge ``updates`` into a document's log record and optionally log an event.

    Does nothing when the document is not present in ``knowledge_ingest``.
    """
    target = _find_ingest_document(knowledge_ingest, document_id)
    if target is not None:
        target.update(updates)
        if event:
            _append_ingest_event(target, event, level=level)
def _append_ingest_event(document: dict[str, Any], message: str, *, level: str) -> None:
events = document.get("events")
if not isinstance(events, list):
events = []
events.append(
{
"at": datetime.now(UTC).isoformat(),
"level": level,
"message": message,
}
)
document["events"] = events[-30:]
def _find_ingest_document(
knowledge_ingest: dict[str, Any],
document_id: str,
) -> dict[str, Any] | None:
for document in list(knowledge_ingest.get("documents") or []):
if not isinstance(document, dict):
continue
if str(document.get("document_id") or "").strip() == document_id:
return document
return None
def _sync_ingest_route_json(
    run_service: AgentRunService,
    agent_run_id: str,
    knowledge_ingest: dict[str, Any],
    *,
    progress: dict[str, int],
) -> None:
    """Merge the current ingest state and progress into the run's route JSON.

    The payload is sent with phase "indexing" and a fresh heartbeat, and the
    run's result summary is set to the running-progress text.
    """
    route_payload = {
        "job_type": "knowledge_index_sync",
        "phase": "indexing",
        "heartbeat_at": datetime.now(UTC).isoformat(),
        "progress": progress,
        "knowledge_ingest": knowledge_ingest,
    }
    running_summary = _build_ingest_running_summary(knowledge_ingest, progress)
    run_service.merge_route_json(
        agent_run_id,
        route_payload,
        result_summary=running_summary,
    )
def _build_ingest_running_summary(
knowledge_ingest: dict[str, Any],
progress: dict[str, int],
) -> str:
total_documents = int(progress.get("total_documents") or 0)
completed_documents = int(progress.get("completed_documents") or 0)
failed_documents = int(progress.get("failed_documents") or 0)
current_document_id = str(knowledge_ingest.get("current_document_id") or "").strip()
current_document = (
_find_ingest_document(knowledge_ingest, current_document_id)
if current_document_id
else None
)
if current_document is not None:
name = str(current_document.get("name") or current_document_id).strip()
current_index = _resolve_ingest_document_index(knowledge_ingest, current_document_id)
return (
f"知识归纳正在处理 {current_index}/{total_documents}{name}"
f"已完成 {completed_documents} 个,失败 {failed_documents} 个。"
)
return (
f"知识归纳正在运行,已完成 {completed_documents}/{total_documents} 个文档,"
f"失败 {failed_documents} 个。"
)
def _resolve_ingest_document_index(
knowledge_ingest: dict[str, Any],
document_id: str,
) -> int:
documents = [
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
]
for index, document in enumerate(documents, start=1):
if str(document.get("document_id") or "").strip() == document_id:
return index
return 0
def _build_ingest_progress(
knowledge_ingest: dict[str, Any],
total_documents: int,
) -> dict[str, int]:
documents = [
item for item in list(knowledge_ingest.get("documents") or []) if isinstance(item, dict)
]
completed_documents = sum(1 for item in documents if item.get("status") == "succeeded")
failed_documents = sum(1 for item in documents if item.get("status") == "failed")
skipped_documents = sum(1 for item in documents if item.get("status") == "skipped")
done_documents = completed_documents + failed_documents + skipped_documents
if total_documents <= 0:
percent = 100
else:
percent = min(95, max(10, 10 + int(done_documents * 85 / total_documents)))
return {
"total_documents": total_documents,
"completed_documents": completed_documents,
"failed_documents": failed_documents,
"skipped_documents": skipped_documents,
"percent": percent,
}
def _extract_document_summary(response: dict[str, Any], document_id: str) -> dict[str, Any]:
for item in list(response.get("document_summaries") or []):
if not isinstance(item, dict):
continue
if str(item.get("document_id") or "").strip() == document_id:
return dict(item)
return {}
def _extract_failed_documents(
response: dict[str, Any],
document_id: str,
) -> list[dict[str, str]]:
failed_documents: list[dict[str, str]] = []
for item in list(response.get("failed_documents") or []):
if not isinstance(item, dict):
continue
item_document_id = str(item.get("document_id") or "").strip()
if item_document_id and item_document_id != document_id:
continue
failed_documents.append(
{
"document_id": item_document_id or document_id,
"status": str(item.get("status") or "failed").strip(),
"error": str(item.get("error") or "LightRAG 索引失败").strip(),
}
)
return failed_documents
def _resolve_failed_ingest_document_ids(
knowledge_ingest: dict[str, Any] | None,
document_ids: list[str],
) -> list[str]:
if knowledge_ingest is None:
return document_ids
failed_document_ids: list[str] = []
seen_document_ids: set[str] = set()
for document in list(knowledge_ingest.get("documents") or []):
if not isinstance(document, dict):
continue
document_id = str(document.get("document_id") or "").strip()
if not document_id:
continue
seen_document_ids.add(document_id)
if document.get("status") != "succeeded":
failed_document_ids.append(document_id)
failed_document_ids.extend(
document_id for document_id in document_ids if document_id not in seen_document_ids
)
return failed_document_ids
def _refresh_ingest_graph(knowledge_ingest: dict[str, Any]) -> None:
    """Recompute the aggregate graph summary and store it under ``knowledge_ingest["graph"]``."""
    knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
def _build_ingest_graph(knowledge_ingest: dict[str, Any]) -> dict[str, Any]:
    """Aggregate chunk, entity and relation data over all document records.

    Counts are summed from the per-document counters; entity and relation
    lists are merged, de-duplicated and limited to 60 items each.
    """
    document_states = [
        entry for entry in list(knowledge_ingest.get("documents") or []) if isinstance(entry, dict)
    ]
    merged_entities: list[Any] = []
    merged_relations: list[Any] = []
    total_chunks = 0
    total_entities = 0
    total_relations = 0
    for state in document_states:
        merged_entities.extend(list(state.get("entities") or []))
        merged_relations.extend(list(state.get("relations") or []))
        total_chunks += _to_int(state.get("chunk_count"))
        total_entities += _to_int(state.get("entity_count"))
        total_relations += _to_int(state.get("relation_count"))
    unique_entities = _dedupe_text_items(merged_entities)
    unique_relations = _dedupe_relations(merged_relations)
    return {
        "chunk_count": total_chunks,
        "entity_count": total_entities,
        "relation_count": total_relations,
        "entities": unique_entities[:60],
        "relations": unique_relations[:60],
    }
def _dedupe_text_items(items: Any) -> list[str]:
deduped: list[str] = []
seen: set[str] = set()
for item in items:
text = str(item or "").strip()
if not text or text in seen:
continue
seen.add(text)
deduped.append(text)
return deduped
def _dedupe_relations(items: Any) -> list[dict[str, str]]:
deduped: list[dict[str, str]] = []
seen: set[tuple[str, str, str]] = set()
for item in items:
if not isinstance(item, dict):
continue
source = str(item.get("source") or "").strip()
target = str(item.get("target") or "").strip()
relation_type = str(item.get("type") or "关联").strip()
key = (source, target, relation_type)
if not source or not target or key in seen:
continue
seen.add(key)
deduped.append({"source": source, "target": target, "type": relation_type})
return deduped
def _resolve_latest_track_id(responses: list[dict[str, Any]]) -> str:
for response in reversed(responses):
track_id = str(response.get("track_id") or "").strip()
if track_id:
return track_id
return ""
def _to_int(value: Any) -> int:
try:
return int(value or 0)
except (TypeError, ValueError):
return 0
knowledge_index_task_manager = KnowledgeIndexTaskManager()

View File

@@ -0,0 +1,224 @@
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Any
# Upper bounds on how much per-document detail is copied into ingest log summaries.
MAX_INGEST_LOG_CHUNKS = 24  # chunk ids / chunk records kept per document
MAX_INGEST_LOG_ENTITIES = 24  # entity names kept per document
MAX_INGEST_LOG_RELATIONS = 24  # relation records kept per document
MAX_INGEST_LOG_SECTIONS = 12  # section headings extracted per document
MAX_INGEST_LOG_TEXT_PREVIEW = 180  # max characters of an excerpt / chunk summary
# Matches a whole line that is either a Markdown heading with 1-4 leading "#"
# or a Chinese numbered heading of the form "第<number>章/节/条 ...".
INGEST_SECTION_HEADING_PATTERN = re.compile(
    r"^(?:#{1,4}\s+.+|第[一二三四五六七八九十百零0-9]+[章节条]\s*.*)$"
)
def build_ingest_document_summary(
    *,
    document_id: str,
    entry: dict[str, Any],
    raw_text: str,
    indexed_text: str,
) -> dict[str, Any]:
    """Build the text-level summary of a document before graph data is known.

    Records the document metadata, the character counts of the raw and
    indexed text, and the section headings found in the indexed text. Chunk,
    entity and relation fields are initialised empty.
    """
    raw_length = len(str(raw_text or ""))
    indexed_body = str(indexed_text or "")
    found_sections = _extract_ingest_sections(indexed_body)

    def _entry_text(key: str) -> str:
        # Metadata values are stored as stripped strings; missing ones become "".
        return str(entry.get(key) or "").strip()

    return {
        "document_id": document_id,
        "name": _entry_text("original_name"),
        "folder": _entry_text("folder"),
        "extension": _entry_text("extension"),
        "mime_type": _entry_text("mime_type"),
        "text_chars": raw_length,
        "indexed_text_chars": len(indexed_body),
        "section_count": len(found_sections),
        "sections": found_sections,
        "chunk_count": 0,
        "chunk_ids": [],
        "chunks": [],
        "entity_count": 0,
        "relation_count": 0,
        "entities": [],
        "relations": [],
    }
def build_ingest_status_summary(
    *,
    status_payload: dict[str, Any],
    graph_summary: dict[str, Any],
) -> dict[str, Any]:
    """Combine the LightRAG status payload with a document's graph summary.

    Keys from ``graph_summary`` are applied last, so they override status
    fields of the same name.
    """
    all_chunk_ids = _normalize_chunk_ids(status_payload)
    summary: dict[str, Any] = {
        "lightrag_status": str(status_payload.get("status") or "").strip(),
        "query_ready": bool(status_payload.get("query_ready")),
        "chunk_count": _resolve_chunk_count(status_payload, all_chunk_ids),
        "chunk_ids": all_chunk_ids[:MAX_INGEST_LOG_CHUNKS],
    }
    summary.update(graph_summary)
    return summary
def build_document_graph_summary(
    storage_root: Path,
    *,
    workspace: str,
    document_id: str,
) -> dict[str, Any]:
    """Summarize the entities, relations and chunks stored for one document.

    Reads the three ``kv_store_*.json`` files under
    ``<storage_root>/knowledge/.lightrag/<workspace>`` and returns the entity
    and relation counts together with size-limited lists of each.
    """
    lightrag_dir = Path(storage_root).joinpath(
        "knowledge",
        ".lightrag",
        str(workspace).strip(),
    ).resolve()
    document_entities = _normalize_document_entities(
        _load_json_file(lightrag_dir / "kv_store_full_entities.json"),
        document_id,
    )
    document_relations = _normalize_document_relations(
        _load_json_file(lightrag_dir / "kv_store_full_relations.json"),
        document_id,
    )
    document_chunks = _normalize_document_chunks(
        _load_json_file(lightrag_dir / "kv_store_text_chunks.json"),
        document_id,
    )
    return {
        "entity_count": len(document_entities),
        "relation_count": len(document_relations),
        "entities": document_entities[:MAX_INGEST_LOG_ENTITIES],
        "relations": document_relations[:MAX_INGEST_LOG_RELATIONS],
        "chunks": document_chunks[:MAX_INGEST_LOG_CHUNKS],
    }
def _extract_ingest_sections(text: str) -> list[dict[str, str]]:
    """Return up to ``MAX_INGEST_LOG_SECTIONS`` headings with a following excerpt.

    A line counts as a heading when it is non-empty, at most 90 characters
    long and matches ``INGEST_SECTION_HEADING_PATTERN``. Leading "#" marks are
    removed from the title.
    """
    stripped_lines = [raw.strip() for raw in str(text or "").splitlines()]
    found: list[dict[str, str]] = []
    for position, candidate in enumerate(stripped_lines):
        if len(found) >= MAX_INGEST_LOG_SECTIONS:
            break
        is_heading = (
            bool(candidate)
            and len(candidate) <= 90
            and INGEST_SECTION_HEADING_PATTERN.match(candidate)
        )
        if is_heading:
            found.append(
                {
                    "title": candidate.lstrip("#").strip(),
                    "excerpt": _find_following_excerpt(stripped_lines[position + 1 :]),
                }
            )
    return found
def _find_following_excerpt(lines: list[str]) -> str:
    """Join the non-empty lines up to the next heading into a short excerpt.

    Collection stops at the first heading line or once the joined text
    reaches ``MAX_INGEST_LOG_TEXT_PREVIEW`` characters; the result is then
    truncated to that length.
    """
    excerpt = ""
    for candidate in lines:
        if not candidate:
            continue
        if INGEST_SECTION_HEADING_PATTERN.match(candidate):
            break
        # Grow the space-joined text incrementally.
        excerpt = f"{excerpt} {candidate}" if excerpt else candidate
        if len(excerpt) >= MAX_INGEST_LOG_TEXT_PREVIEW:
            break
    return _truncate_text(excerpt, max_length=MAX_INGEST_LOG_TEXT_PREVIEW)
def _normalize_chunk_ids(status_payload: dict[str, Any]) -> list[str]:
chunks_list = status_payload.get("chunks_list")
if not isinstance(chunks_list, list):
return []
return [str(item).strip() for item in chunks_list if str(item or "").strip()]
def _resolve_chunk_count(status_payload: dict[str, Any], chunk_ids: list[str]) -> int:
try:
return int(status_payload.get("chunks_count") or len(chunk_ids))
except (TypeError, ValueError):
return len(chunk_ids)
def _load_json_file(path: Path) -> dict[str, Any]:
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except (FileNotFoundError, json.JSONDecodeError, OSError):
return {}
return payload if isinstance(payload, dict) else {}
def _normalize_document_entities(payload: dict[str, Any], document_id: str) -> list[str]:
    """Return the de-duplicated ``entity_names`` stored for ``document_id``.

    Returns an empty list when the payload, the per-document entry, or the
    name list does not have the expected type.
    """
    if not isinstance(payload, dict):
        return []
    per_document = payload.get(document_id)
    if not isinstance(per_document, dict):
        return []
    names = per_document.get("entity_names")
    if not isinstance(names, list):
        return []
    return _dedupe_text_items(names)
def _normalize_document_relations(
payload: dict[str, Any], document_id: str
) -> list[dict[str, str]]:
document_payload = payload.get(document_id) if isinstance(payload, dict) else {}
relation_pairs = (
document_payload.get("relation_pairs") if isinstance(document_payload, dict) else []
)
if not isinstance(relation_pairs, list):
return []
relations: list[dict[str, str]] = []
seen: set[tuple[str, str]] = set()
for pair in relation_pairs:
if not isinstance(pair, (list, tuple)) or len(pair) < 2:
continue
source = str(pair[0] or "").strip()
target = str(pair[1] or "").strip()
if not source or not target or (source, target) in seen:
continue
seen.add((source, target))
relations.append({"source": source, "target": target, "type": "关联"})
return relations
def _normalize_document_chunks(payload: dict[str, Any], document_id: str) -> list[dict[str, Any]]:
    """Return compact records for the chunks whose ``full_doc_id`` is ``document_id``.

    Each record holds the chunk id, order index, token count and a one-line
    summary of the content. Records are sorted by ``(order, id)``.
    """
    selected: list[dict[str, Any]] = []
    for chunk_key, chunk_data in payload.items():
        belongs_to_document = (
            isinstance(chunk_data, dict)
            and str(chunk_data.get("full_doc_id") or "").strip() == document_id
        )
        if not belongs_to_document:
            continue
        body = str(chunk_data.get("content") or "").strip()
        selected.append(
            {
                "id": str(chunk_data.get("_id") or chunk_key).strip(),
                "order": _to_int(chunk_data.get("chunk_order_index")),
                "tokens": _to_int(chunk_data.get("tokens")),
                "summary": _build_chunk_summary(body),
            }
        )
    selected.sort(key=lambda record: (record["order"], record["id"]))
    return selected
def _build_chunk_summary(content: str) -> str:
    """Return a truncated one-line summary of a chunk's content.

    The first non-empty line with at least 12 characters is used; if none is
    that long, the first non-empty line is used instead.
    """
    candidates = []
    for raw in str(content or "").splitlines():
        stripped = raw.strip()
        if stripped:
            candidates.append(stripped)
    chosen = ""
    for candidate in candidates:
        if len(candidate) >= 12:
            chosen = candidate
            break
    else:
        # No line was long enough; fall back to the first one, if any.
        if candidates:
            chosen = candidates[0]
    return _truncate_text(chosen, max_length=MAX_INGEST_LOG_TEXT_PREVIEW)
def _dedupe_text_items(items: list[Any]) -> list[str]:
deduped: list[str] = []
seen: set[str] = set()
for item in items:
text = str(item or "").strip()
if not text or text in seen:
continue
seen.add(text)
deduped.append(text)
return deduped
def _to_int(value: Any) -> int:
try:
return int(value or 0)
except (TypeError, ValueError):
return 0
def _truncate_text(text: str, *, max_length: int) -> str:
normalized = " ".join(str(text or "").split()).strip()
if len(normalized) <= max_length:
return normalized
return f"{normalized[: max_length - 3].rstrip()}..."

View File

@@ -12,24 +12,15 @@ from sqlalchemy.orm import Session
from app.core.config import get_settings
from app.core.logging import get_logger
from app.db.session import get_session_factory
from app.services.knowledge_ingest_log import (
build_document_graph_summary,
build_ingest_document_summary,
build_ingest_status_summary,
)
from app.services.knowledge_rag_runtime import (
DEFAULT_EMBEDDING_TIMEOUT_SECONDS,
DEFAULT_LIGHTRAG_QUERY_MODE,
DEFAULT_LLM_TIMEOUT_SECONDS,
KnowledgeRagError,
RuntimeModelConfig,
_LightRagRuntime,
_build_ali_rerank_request,
_build_azure_deployment_base,
_build_headers,
_ensure_path,
_extract_chat_text,
_extract_embedding_vectors,
_extract_error_message,
_extract_rerank_results,
_normalize_endpoint,
_parse_json_body,
_send_json_request,
)
from app.services.settings import SettingsService
@@ -76,11 +67,9 @@ STRUCTURED_APPENDIX_LEADING_MARKERS = (
"# 结构化表格补充",
)
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
_runtime_lock = threading.RLock()
_runtime_instance: _LightRagRuntime | None = None
_runtime_signature: tuple[Any, ...] | None = None
_runtime_instances: dict[int, _LightRagRuntime] = {}
_runtime_signatures: dict[int, tuple[Any, ...]] = {}
class KnowledgeRagService:
@@ -147,7 +136,11 @@ class KnowledgeRagService:
"query": normalized_query,
"record_count": len(hits),
"hits": hits,
"references": [str(item.get("code") or "").strip() for item in hits if str(item.get("code") or "").strip()],
"references": [
str(item.get("code") or "").strip()
for item in hits
if str(item.get("code") or "").strip()
],
"raw_references": references,
"metadata": raw.get("metadata") if isinstance(raw, dict) else {},
"message": f"已从知识库中检索到 {len(hits)} 条相关内容。",
@@ -172,6 +165,7 @@ class KnowledgeRagService:
)
texts: list[str] = []
file_paths: list[str] = []
document_summaries: list[dict[str, Any]] = []
runtime = self._get_runtime()
existing_statuses = runtime.get_document_statuses(normalized_ids)
@@ -182,12 +176,29 @@ class KnowledgeRagService:
try:
runtime.delete_document(document_id)
except Exception as exc:
logger.warning("Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc)
logger.warning(
"Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc
)
text = knowledge_service.extract_document_text(document_id)
raw_text = text
if normalization_service is not None:
text = normalization_service.build_enriched_text(text)
texts.append(text)
file_paths.append(str((knowledge_service.library_root / entry["folder"] / entry["stored_name"]).resolve()))
file_paths.append(
str(
(
knowledge_service.library_root / entry["folder"] / entry["stored_name"]
).resolve()
)
)
document_summaries.append(
build_ingest_document_summary(
document_id=document_id,
entry=entry,
raw_text=raw_text,
indexed_text=text,
)
)
track_id = runtime.insert_documents(
texts=texts,
@@ -198,10 +209,32 @@ class KnowledgeRagService:
statuses = runtime.get_document_statuses(normalized_ids)
succeeded_document_ids: list[str] = []
failed_documents: list[dict[str, str]] = []
summary_by_id = {
str(item.get("document_id") or "").strip(): item
for item in document_summaries
if str(item.get("document_id") or "").strip()
}
for document_id in normalized_ids:
status_obj = statuses.get(document_id)
status_text = self._status_value(status_obj)
status_payload = self._serialize_status(status_obj)
workspace = (
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
or DEFAULT_LIGHTRAG_WORKSPACE
)
graph_summary = build_document_graph_summary(
self.storage_root,
workspace=workspace,
document_id=document_id,
)
if document_id in summary_by_id:
summary_by_id[document_id].update(
build_ingest_status_summary(
status_payload=status_payload,
graph_summary=graph_summary,
)
)
if self.is_query_ready_status(status_obj):
succeeded_document_ids.append(document_id)
continue
@@ -218,13 +251,18 @@ class KnowledgeRagService:
"requested_document_ids": normalized_ids,
"succeeded_document_ids": succeeded_document_ids,
"failed_documents": failed_documents,
"document_summaries": [
summary_by_id.get(document_id, {}) for document_id in normalized_ids
],
"status_snapshot": {
document_id: self._serialize_status(status_obj)
for document_id, status_obj in statuses.items()
},
}
def get_document_status_map(self, document_ids: list[str] | None = None) -> dict[str, dict[str, Any]]:
def get_document_status_map(
self, document_ids: list[str] | None = None
) -> dict[str, dict[str, Any]]:
target_ids = [str(item).strip() for item in document_ids or [] if str(item).strip()]
if not target_ids:
return {}
@@ -248,28 +286,32 @@ class KnowledgeRagService:
logger.warning("Delete LightRAG document ignored doc_id=%s: %s", normalized_id, exc)
def _get_runtime(self) -> _LightRagRuntime:
global _runtime_instance, _runtime_signature
signature, runtime_kwargs = self._build_runtime_signature()
thread_id = threading.get_ident()
with _runtime_lock:
if _runtime_instance is not None and _runtime_signature == signature:
return _runtime_instance
runtime = _runtime_instances.get(thread_id)
if runtime is not None and _runtime_signatures.get(thread_id) == signature:
return runtime
if _runtime_instance is not None:
if runtime is not None:
try:
_runtime_instance.finalize()
runtime.finalize()
except Exception as exc: # pragma: no cover - best effort cleanup
logger.warning("Finalize previous LightRAG runtime failed: %s", exc)
_runtime_instance = _LightRagRuntime(**runtime_kwargs)
_runtime_signature = signature
return _runtime_instance
runtime = _LightRagRuntime(**runtime_kwargs)
_runtime_instances[thread_id] = runtime
_runtime_signatures[thread_id] = signature
return runtime
def _build_runtime_signature(self) -> tuple[tuple[Any, ...], dict[str, Any]]:
configs = self._load_runtime_configs()
settings = get_settings()
working_dir = (self.storage_root / "knowledge" / ".lightrag").resolve()
workspace = os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip() or DEFAULT_LIGHTRAG_WORKSPACE
workspace = (
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
or DEFAULT_LIGHTRAG_WORKSPACE
)
qdrant_url = os.environ.get("QDRANT_URL", "").strip() or _resolve_default_qdrant_url()
qdrant_api_key = os.environ.get("QDRANT_API_KEY", "").strip()
@@ -318,7 +360,9 @@ class KnowledgeRagService:
try:
settings_service = SettingsService(session)
main = self._normalize_runtime_model(settings_service.get_runtime_model_config("main"))
embedding = self._normalize_runtime_model(settings_service.get_runtime_model_config("embedding"))
embedding = self._normalize_runtime_model(
settings_service.get_runtime_model_config("embedding")
)
try:
backup_raw = settings_service.get_runtime_model_config("backup")
backup = self._normalize_runtime_model(backup_raw)
@@ -405,7 +449,9 @@ class KnowledgeRagService:
document_id, document_name = _parse_document_identity(file_path)
normalized_chunk_id = chunk_id or f"path-{rank}"
normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
normalized_content = _truncate_text(
content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH
)
excerpt = _build_query_focused_excerpt(
normalized_content,
query_terms=query_terms,
@@ -510,17 +556,14 @@ class KnowledgeRagService:
def shutdown_knowledge_rag_runtime() -> None:
global _runtime_instance, _runtime_signature
with _runtime_lock:
if _runtime_instance is None:
return
try:
_runtime_instance.finalize()
except Exception as exc: # pragma: no cover - best effort cleanup
logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
_runtime_instance = None
_runtime_signature = None
for runtime in list(_runtime_instances.values()):
try:
runtime.finalize()
except Exception as exc: # pragma: no cover - best effort cleanup
logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
_runtime_instances.clear()
_runtime_signatures.clear()
def _parse_document_identity(file_path: str) -> tuple[str, str]:
@@ -551,9 +594,7 @@ def _build_query_focused_excerpt(
lowered = normalized.lower()
match_positions = [
lowered.find(term)
for term in query_terms
if term and lowered.find(term) >= 0
lowered.find(term) for term in query_terms if term and lowered.find(term) >= 0
]
if not match_positions:
return _build_excerpt(normalized, max_length=max_length)
@@ -649,7 +690,9 @@ def _score_knowledge_hit(
elif leading_appendix_marker == "# 重点章节摘录":
score += 4 if matched_terms else -12
elif leading_appendix_marker == "# 问答线索补充":
score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
score += (
8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
)
elif leading_appendix_marker == "# 结构化表格补充":
if prefers_tabular_evidence and matched_terms:
score += 16
@@ -666,7 +709,11 @@ def _score_knowledge_hit(
score += 4
if matched_terms and any(marker in content for marker in ("附表", "", "")):
score += 4
if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("", "", "", "-", "")):
if (
not prefers_tabular_evidence
and matched_terms
and any(marker in content for marker in ("", "", "", "-", ""))
):
score += 4
if title and any(term in title for term in query_terms):
score += 6

View File

@@ -170,6 +170,7 @@ class SemanticOntologyService(
entities = self._merge_entities(
entities,
model_parse.entity_hints if model_parse is not None else [],
compact_query,
)
intent = self._resolve_intent(
compact_query,
@@ -193,6 +194,11 @@ class SemanticOntologyService(
context_json=context_json,
)
)
missing_slots = self._filter_expense_missing_slots(
compact_query=compact_query,
entities=entities,
missing_slots=missing_slots,
)
relax_knowledge_follow_up = self._should_relax_knowledge_follow_up_clarification(
compact_query=compact_query,
scenario=scenario,
@@ -306,6 +312,45 @@ class SemanticOntologyService(
follow_up_markers = ("", "那么", "这个", "这种", "", "的话", "p", "P")
return any(marker in compact_query for marker in follow_up_markers)
@staticmethod
def _filter_expense_missing_slots(
*,
compact_query: str,
entities: list[object],
missing_slots: list[str],
) -> list[str]:
expense_types = {
str(getattr(item, "normalized_value", "") or getattr(item, "value", "") or "").strip()
for item in entities
if getattr(item, "type", "") == "expense_type"
}
has_transport = "transport" in expense_types
has_entertainment = "entertainment" in expense_types
explicit_entertainment = any(
keyword in compact_query
for keyword in (
"业务招待",
"招待费",
"招待",
"宴请",
"请客",
"请客户吃饭",
"客户吃饭",
"客户用餐",
"客户餐",
"商务接待",
"商务宴请",
"接待餐",
)
)
if has_transport and not has_entertainment and not explicit_entertainment:
return [
item
for item in missing_slots
if item not in {"customer_name", "participants"}
]
return missing_slots
def _record_semantic_parse(
self,
*,

View File

@@ -37,6 +37,39 @@ from app.services.ontology_rules import (
logger = get_logger("app.services.ontology")
# Substring markers for taxi / ride-hailing / local-transport wording.
# _should_transport_override_entertainment requires at least one of these to
# appear in the compact query before it lets "transport" win over
# "entertainment".
TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS = (
"打车",
"网约车",
"出租车票",
"出租车",
"的士票",
"的士",
"滴滴",
"市内交通",
"乘车",
"乘车费",
"用车",
"叫车",
"车费",
"车资",
"机场",
)
# Substring markers for explicit business-entertainment wording. If any of
# these appears in the compact query, _should_transport_override_entertainment
# returns False, so the entertainment expense type is kept.
EXPLICIT_ENTERTAINMENT_KEYWORDS = (
"业务招待",
"招待费",
"招待",
"宴请",
"请客",
"请客户吃饭",
"客户吃饭",
"客户用餐",
"客户餐",
"商务接待",
"商务宴请",
"接待餐",
)
class OntologyDetectionMixin:
def _detect_scenario(self, compact_query: str) -> tuple[str, float]:
scores = {key: 0.0 for key in SCENARIO_KEYWORDS}
@@ -337,6 +370,9 @@ class OntologyDetectionMixin:
"出现“客户”不等于应收,出现“供应商”不等于应付,必须结合动作词和业务目标判断。"
"只有明确查询、统计、列出、多少、明细、对比时才优先使用 query 或 compare。"
"附件名称和 OCR 摘要只作为辅助证据,不能编造未出现的事实。"
"如果用户明确提到打车、的士票、出租车票、网约车、乘车费、车费等交通票据,"
"即使句子里出现“客户”,也必须优先识别为 transport不要推断为 entertainment。"
"不要输出用户原文未出现、且与规则候选冲突的费用类型。"
"信息不足时 clarification_required=true并给出一句简短中文追问。"
"missing_slots 使用简短 snake_case例如 expense_type, amount, "
"customer_name, participants, attachments。"
@@ -351,12 +387,12 @@ class OntologyDetectionMixin:
' "intent": "draft",\n'
' "confidence": 0.88,\n'
' "clarification_required": true,\n'
' "clarification_question": "请补充客户单位、参与人员和票据附件。",\n'
' "missing_slots": ["customer_name", "participants", "attachments"],\n'
' "clarification_question": "请补充发生时间、金额和票据附件。",\n'
' "missing_slots": ["time_range", "amount", "attachments"],\n'
' "ambiguity": [],\n'
' "entity_hints": [\n'
' {"type": "expense_type", "value": "招待", '
'"normalized_value": "entertainment", "role": "filter", '
' {"type": "expense_type", "value": "交通费", '
'"normalized_value": "transport", "role": "filter", '
'"confidence": 0.86}\n'
" ]\n"
"}"
@@ -432,6 +468,7 @@ class OntologyDetectionMixin:
def _merge_entities(
base_entities: list[OntologyEntity],
entity_hints: list[LlmOntologyEntityHint],
compact_query: str = "",
) -> list[OntologyEntity]:
merged: dict[tuple[str, str], OntologyEntity] = {
(item.type, item.normalized_value): item for item in base_entities
@@ -454,7 +491,36 @@ class OntologyDetectionMixin:
if existing is None or existing.confidence < candidate.confidence:
merged[key] = candidate
return list(merged.values())
items = list(merged.values())
if OntologyDetectionMixin._should_transport_override_entertainment(
compact_query,
items,
):
items = [
item
for item in items
if not (
item.type == "expense_type"
and item.normalized_value == "entertainment"
)
]
return items
@staticmethod
def _should_transport_override_entertainment(
    compact_query: str,
    entities: list[OntologyEntity],
) -> bool:
    """Decide whether an entertainment expense type should yield to transport.

    Returns True only when all of the following hold:
      * the entities contain both a ``transport`` and an ``entertainment``
        expense type;
      * the query contains at least one ``TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS``
        entry;
      * the query contains no ``EXPLICIT_ENTERTAINMENT_KEYWORDS`` entry.
    """
    detected_types: set[str] = set()
    for entity in entities:
        if entity.type == "expense_type":
            # Prefer the normalized code; fall back to the raw value.
            detected_types.add(str(entity.normalized_value or entity.value or "").strip())

    if "transport" not in detected_types or "entertainment" not in detected_types:
        return False

    mentions_transport = any(
        keyword in compact_query for keyword in TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS
    )
    if not mentions_transport:
        return False

    mentions_entertainment = any(
        keyword in compact_query for keyword in EXPLICIT_ENTERTAINMENT_KEYWORDS
    )
    return not mentions_entertainment
@staticmethod
def _normalize_short_text_list(values: list[str]) -> list[str]:

View File

@@ -59,11 +59,16 @@ class OntologyExtractionMixin:
missing_slots.append("attachments")
return missing_slots
if any(
has_entertainment_type = any(
item.normalized_value == "entertainment"
for item in entities
if item.type == "expense_type"
):
)
has_explicit_entertainment_text = "客户" in compact_query and any(
keyword in compact_query
for keyword in ("招待", "接待", "吃饭", "用餐", "宴请", "请客", "客户餐")
)
if has_entertainment_type or has_explicit_entertainment_text:
if "customer" not in entity_types:
missing_slots.append("customer_name")
missing_slots.append("participants")
@@ -171,14 +176,14 @@ class OntologyExtractionMixin:
upsert(self._make_entity("expense_type", label, normalized, role="filter"))
has_customer_entertainment_signal = "客户" in query and any(
keyword in query for keyword in ("吃饭", "用餐", "餐饮", "宴请", "请客", "招待")
keyword in query for keyword in ("吃饭", "用餐", "餐饮", "宴请", "请客", "招待", "接待")
)
if has_customer_entertainment_signal:
upsert(
self._make_entity(
"expense_type",
"客户招待",
"entertainment",
"业务招待",
"meal",
role="filter",
confidence=0.96,
)
@@ -189,46 +194,52 @@ class OntologyExtractionMixin:
for keyword in (
"打车",
"网约车",
"出租车",
"出租车票",
"出租车",
"车费",
"乘车",
"用车",
"叫车",
"车资",
"的士",
"的士票",
"的士",
"滴滴",
"市内交通",
"地铁",
"公交",
"停车费",
"过路费",
"通行费",
"高速费",
)
):
upsert(self._make_entity("expense_type", "交通", "transport", role="filter", confidence=0.9))
if any(keyword in query for keyword in ("出差", "机票", "火车", "高铁", "行程单")):
if any(keyword in query for keyword in ("出差", "机票", "飞机票", "航班", "火车票", "火车", "高铁票", "高铁", "动车", "行程单")):
upsert(self._make_entity("expense_type", "差旅", "travel", role="filter", confidence=0.88))
if any(keyword in query for keyword in ("酒店", "住宿", "宾馆")):
if any(keyword in query for keyword in ("酒店", "酒店发票", "住宿", "住宿费", "宾馆", "民宿", "房费", "客房")):
upsert(self._make_entity("expense_type", "住宿", "hotel", role="filter", confidence=0.86))
if (
not has_customer_entertainment_signal
and any(keyword in query for keyword in ("餐费", "用餐", "午餐", "晚餐", "早餐", "餐饮"))
):
upsert(self._make_entity("expense_type", "", "meal", role="filter", confidence=0.84))
upsert(self._make_entity("expense_type", "业务招待", "meal", role="filter", confidence=0.84))
if any(
keyword in query
for keyword in ("办公用品", "文具", "耗材", "办公耗材", "打印纸", "办公设备", "键盘", "鼠标", "白板")
for keyword in ("办公用品", "文具", "耗材", "办公耗材", "打印纸", "办公设备", "键盘", "鼠标", "白板", "硒鼓", "墨盒")
):
upsert(self._make_entity("expense_type", "办公费", "office", role="filter", confidence=0.87))
upsert(self._make_entity("expense_type", "办公用品", "office", role="filter", confidence=0.87))
if any(keyword in query for keyword in ("培训", "讲师费", "课时费", "课程费")):
if any(keyword in query for keyword in ("培训", "讲师费", "课时费", "课程费", "教材", "认证费", "考试费")):
upsert(self._make_entity("expense_type", "培训费", "training", role="filter", confidence=0.84))
if any(keyword in query for keyword in ("通讯费", "话费", "流量费", "宽带费")):
if any(keyword in query for keyword in ("通讯费", "话费", "电话费", "手机费", "流量费", "宽带费", "网络费")):
upsert(self._make_entity("expense_type", "通讯费", "communication", role="filter", confidence=0.84))
if any(keyword in query for keyword in ("福利费", "团建", "慰问", "节日福利", "体检费")):
if any(keyword in query for keyword in ("福利费", "团建", "慰问", "节日福利", "体检费", "员工关怀")):
upsert(self._make_entity("expense_type", "福利费", "welfare", role="filter", confidence=0.84))
for amount in self._extract_amount_entities(query):

View File

@@ -6,6 +6,7 @@ from dataclasses import dataclass
from pydantic import BaseModel, ConfigDict, Field
from app.schemas.ontology import OntologyIntent, OntologyScenario
from app.services.expense_type_keywords import build_expense_type_keyword_map
DATE_RANGE_PATTERN = re.compile(
r"(?P<start>\d{4}-\d{1,2}-\d{1,2})\s*(?:到|至|~|-)\s*(?P<end>\d{4}-\d{1,2}-\d{1,2})"
@@ -128,44 +129,7 @@ OPERATE_KEYWORDS = (
"删除",
)
EXPENSE_TYPE_KEYWORDS = {
"差旅": "travel",
"出差": "travel",
"住宿": "hotel",
"酒店": "hotel",
"交通": "transport",
"打车": "transport",
"网约车": "transport",
"出租车": "transport",
"出租车票": "transport",
"乘车": "transport",
"乘车费": "transport",
"用车": "transport",
"叫车": "transport",
"车资": "transport",
"的士": "transport",
"的士票": "transport",
"停车费": "transport",
"餐费": "meal",
"用餐": "meal",
"会务": "meeting",
"招待费": "entertainment",
"招待": "entertainment",
"宴请": "entertainment",
"办公费": "office",
"办公用品": "office",
"文具": "office",
"耗材": "office",
"办公耗材": "office",
"打印纸": "office",
"办公设备": "office",
"培训费": "training",
"培训": "training",
"通讯费": "communication",
"话费": "communication",
"福利费": "welfare",
"团建": "welfare",
}
EXPENSE_TYPE_KEYWORDS = build_expense_type_keyword_map()
EXPENSE_NARRATIVE_KEYWORDS = (
"报销",

View File

@@ -74,16 +74,16 @@ EXPENSE_RISK_LEVEL_LABELS = {
"medium": "中风险",
"warning": "中风险",
"low": "低风险",
"info": "低风险",
"info": "提示",
}
EXPENSE_TYPE_LABELS = {
"travel": "差旅费",
"hotel": "住宿费",
"transport": "交通费",
"meal": "",
"meal": "业务招待",
"meeting": "会务费",
"entertainment": "业务招待费",
"office": "办公费",
"office": "办公用品",
"training": "培训费",
"communication": "通讯费",
"welfare": "福利费",

View File

@@ -35,10 +35,10 @@ EXPENSE_TYPE_LABELS = {
"travel": "差旅费",
"hotel": "住宿费",
"transport": "交通费",
"meal": "",
"meal": "业务招待",
"meeting": "会务费",
"entertainment": "业务招待费",
"office": "办公费",
"office": "办公用品",
"training": "培训费",
"communication": "通讯费",
"welfare": "福利费",
@@ -48,10 +48,10 @@ EXPENSE_TYPE_LABELS = {
GROUP_SCENE_LABELS = {
"travel": "差旅费",
"entertainment": "业务招待费",
"meal": "伙食",
"meal": "业务招待",
"transport": "交通费",
"hotel": "住宿费",
"office": "办公费",
"office": "办公用品",
"training": "培训费",
"communication": "通讯费",
"welfare": "福利费",
@@ -62,8 +62,12 @@ EXPENSE_SCENE_SELECTION_OPTIONS = (
("travel", "差旅费", "出差、长途交通、住宿、差旅补贴等场景。"),
("transport", "交通费", "市内打车、停车、过路费等日常交通场景。"),
("hotel", "住宿费", "单独住宿、酒店发票等场景。"),
("entertainment", "业务招待费", "客户接待、宴请、招待等场景。"),
("office", "办公", "办公用品、耗材、办公设备等采购场景。"),
("meal", "业务招待费", "客户接待、工作餐、加班餐、餐饮票据等场景。"),
("meeting", "会务", "会议、论坛、会场、参会等场景。"),
("office", "办公用品费", "办公用品、耗材、办公设备等采购场景。"),
("training", "培训费", "培训课程、讲师费、教材、认证等场景。"),
("communication", "通讯费", "话费、流量、宽带、网络等场景。"),
("welfare", "福利费", "团建、体检、慰问、节日福利等场景。"),
("other", "其他费用", "暂不属于以上分类的报销场景。"),
)
@@ -130,10 +134,10 @@ INFERRED_REASON_LABELS = {
"travel": "出差行程",
"hotel": "住宿报销",
"transport": "交通出行",
"meal": "餐饮用餐",
"meal": "业务招待",
"meeting": "会务活动",
"entertainment": "客户接待",
"office": "办公采购",
"office": "办公用品采购",
"training": "培训学习",
"communication": "通讯使用",
"welfare": "员工福利",

View File

@@ -9,16 +9,32 @@ from app.schemas.user_agent import UserAgentRequest, UserAgentReviewDocumentCard
DEFAULT_GROUP_SCENE_LABELS = {
"travel": "差旅费",
"entertainment": "业务招待费",
"meal": "伙食",
"meal": "业务招待",
"transport": "交通费",
"hotel": "住宿费",
"office": "办公费",
"office": "办公用品",
"training": "培训费",
"communication": "通讯费",
"welfare": "福利费",
"other": "其他费用",
}
# User-facing label for each document_type code. The document classifier in
# this module returns these values as the ``scene_label`` of its result.
DOCUMENT_SCENE_LABELS = {
"flight_itinerary": "机票/航班行程单",
"train_ticket": "火车/高铁票",
"ship_ticket": "轮船票",
"travel_ticket": "交通出行票据",
"hotel_invoice": "酒店住宿票据",
"taxi_receipt": "出租车/网约车票据",
"transport_receipt": "乘车票据",
"parking_toll_receipt": "停车/通行费票据",
"meal_receipt": "餐饮发票",
"office_invoice": "文具/办公用品发票",
"meeting_invoice": "会议/会务票据",
"training_invoice": "培训票据",
"other": "其他票据",
}
DOCUMENT_DATE_TEXT_PATTERN = re.compile(
r"(\d{4}[年/-]\d{1,2}[月/-]\d{1,2}日?(?:\s*[T ]?\s*(?:[01]?\d|2[0-3])[:][0-5]\d)?)"
)
@@ -48,55 +64,55 @@ class UserAgentDocumentService:
provided_type = str(item.get("document_type") or "").strip().lower()
normalized_expense_type = str(expense_type_code or "").strip().lower()
if provided_type:
if provided_type in {"flight_itinerary", "train_ticket"}:
if provided_type in {"flight_itinerary", "train_ticket", "ship_ticket"}:
return {
"document_type": provided_type,
"expense_type": "travel",
"group_code": "travel",
"scene_label": "差旅票据",
"scene_label": DOCUMENT_SCENE_LABELS.get(provided_type, "交通出行票据"),
}
if provided_type == "hotel_invoice":
return {
"document_type": provided_type,
"expense_type": "hotel",
"group_code": "travel",
"scene_label": "住宿票据",
"scene_label": DOCUMENT_SCENE_LABELS["hotel_invoice"],
}
if provided_type in {"taxi_receipt", "parking_toll_receipt"}:
if provided_type in {"taxi_receipt", "transport_receipt", "parking_toll_receipt"}:
return {
"document_type": provided_type,
"expense_type": "transport",
"group_code": "travel",
"scene_label": "交通票据",
"scene_label": DOCUMENT_SCENE_LABELS.get(provided_type, "乘车票据"),
}
if provided_type == "meal_receipt":
group_code = "entertainment" if normalized_expense_type == "entertainment" or has_customer else "meal"
group_code = "meal"
return {
"document_type": provided_type,
"expense_type": group_code,
"group_code": group_code,
"scene_label": "餐饮票据",
"scene_label": DOCUMENT_SCENE_LABELS["meal_receipt"],
}
if provided_type == "office_invoice":
return {
"document_type": provided_type,
"expense_type": "office",
"group_code": "office",
"scene_label": "办公用品票据",
"scene_label": DOCUMENT_SCENE_LABELS["office_invoice"],
}
if provided_type == "meeting_invoice":
return {
"document_type": provided_type,
"expense_type": "meeting",
"group_code": "meeting",
"scene_label": "会务票据",
"scene_label": DOCUMENT_SCENE_LABELS["meeting_invoice"],
}
if provided_type == "training_invoice":
return {
"document_type": provided_type,
"expense_type": "training",
"group_code": "training",
"scene_label": "培训票据",
"scene_label": DOCUMENT_SCENE_LABELS["training_invoice"],
}
text = " ".join(
@@ -108,41 +124,69 @@ class UserAgentDocumentService:
).lower()
compact = text.replace(" ", "")
if any(keyword in compact for keyword in ("机票", "航班", "", "", "行程单")):
if any(keyword in compact for keyword in ("火车", "高铁", "", "", "车次")):
return {
"document_type": "travel_ticket",
"document_type": "train_ticket",
"expense_type": "travel",
"group_code": "travel",
"scene_label": "差旅票据",
"scene_label": DOCUMENT_SCENE_LABELS["train_ticket"],
}
if any(keyword in compact for keyword in ("过路费", "停车", "通行费", "收费站")):
return {
"document_type": "parking_toll_receipt",
"expense_type": "transport",
"group_code": "travel",
"scene_label": DOCUMENT_SCENE_LABELS["parking_toll_receipt"],
}
if any(keyword in compact for keyword in ("打车", "出租车", "滴滴", "网约车", "叫车", "车费", "车资", "的士")):
return {
"document_type": "taxi_receipt",
"expense_type": "transport",
"group_code": "travel",
"scene_label": DOCUMENT_SCENE_LABELS["taxi_receipt"],
}
if any(keyword in compact for keyword in ("乘车", "用车")):
return {
"document_type": "transport_receipt",
"expense_type": "transport",
"group_code": "travel",
"scene_label": DOCUMENT_SCENE_LABELS["transport_receipt"],
}
if any(keyword in compact for keyword in ("机票", "航班", "登机", "航空", "客票")):
return {
"document_type": "flight_itinerary",
"expense_type": "travel",
"group_code": "travel",
"scene_label": DOCUMENT_SCENE_LABELS["flight_itinerary"],
}
if any(keyword in compact for keyword in ("轮船", "船票", "客轮", "渡轮", "航运")):
return {
"document_type": "ship_ticket",
"expense_type": "travel",
"group_code": "travel",
"scene_label": DOCUMENT_SCENE_LABELS["ship_ticket"],
}
if any(keyword in compact for keyword in ("酒店", "住宿", "宾馆")):
return {
"document_type": "hotel_invoice",
"expense_type": "hotel",
"group_code": "travel",
"scene_label": "住宿票据",
}
if any(keyword in compact for keyword in ("打车", "出租车", "滴滴", "网约车", "乘车", "用车", "叫车", "车费", "车资", "的士", "过路费", "停车")):
return {
"document_type": "transport_receipt",
"expense_type": "transport",
"group_code": "travel",
"scene_label": "交通票据",
"scene_label": DOCUMENT_SCENE_LABELS["hotel_invoice"],
}
if any(keyword in compact for keyword in ("", "饭店", "酒楼", "酒家", "餐饮", "meal")):
group_code = "entertainment" if normalized_expense_type == "entertainment" or has_customer else "meal"
group_code = "meal"
return {
"document_type": "meal_receipt",
"expense_type": group_code,
"group_code": group_code,
"scene_label": "餐饮票据",
"scene_label": DOCUMENT_SCENE_LABELS["meal_receipt"],
}
if any(keyword in compact for keyword in ("办公用品", "文具", "耗材", "办公耗材", "打印纸", "键盘", "鼠标", "白板", "墨盒", "硒鼓")):
return {
"document_type": "other",
"document_type": "office_invoice",
"expense_type": "office",
"group_code": "office",
"scene_label": "办公用品票据",
"scene_label": DOCUMENT_SCENE_LABELS["office_invoice"],
}
return {
"document_type": "other",

View File

@@ -314,10 +314,7 @@ class UserAgentReviewCoreMixin:
filename=str(item.get("filename") or f"document-{index}"),
document_type=classified["document_type"],
suggested_expense_type=classified["expense_type"],
scene_label=GROUP_SCENE_LABELS.get(
classified["group_code"],
classified["scene_label"],
),
scene_label=self._resolve_review_document_scene_label(item, classified),
summary=str(item.get("summary") or item.get("text") or "").strip(),
avg_score=float(item.get("avg_score") or 0.0),
preview_kind=str(item.get("preview_kind") or "").strip(),
@@ -338,6 +335,25 @@ class UserAgentReviewCoreMixin:
return cards
@staticmethod
def _resolve_review_document_scene_label(item: dict[str, object], classified: dict[str, str]) -> str:
provided_label = str(item.get("document_type_label") or "").strip()
if provided_label and provided_label != "其他单据":
return provided_label
classified_scene_label = str(classified.get("scene_label") or "").strip()
if classified_scene_label:
return classified_scene_label
document_type = str(classified.get("document_type") or item.get("document_type") or "").strip()
document_type_label = resolve_document_type_label(document_type)
if document_type_label and document_type_label not in {"其他单据", document_type}:
return document_type_label
scene_label = str(item.get("scene_label") or "").strip()
return scene_label or "其他票据"
def _build_review_claim_groups(
self,
payload: UserAgentRequest,

View File

@@ -59,6 +59,20 @@ class UserAgentReviewProfileMixin:
manager_name = self._resolve_manager_name(employee)
reason = slot_map.get("reason").value if slot_map.get("reason") else ""
attachments = "".join(self._resolve_attachment_names(payload))
expense_type_code = str(slot_map.get("expense_type").normalized_value if slot_map.get("expense_type") else "").strip()
customer_name = str(slot_map.get("customer_name").value if slot_map.get("customer_name") else "").strip()
merchant_name = str(slot_map.get("merchant_name").value if slot_map.get("merchant_name") else "").strip()
participants = str(slot_map.get("participants").value if slot_map.get("participants") else "").strip()
customer_slot = slot_map.get("customer_name")
participants_slot = slot_map.get("participants")
customer_required = bool(
customer_slot
and (customer_slot.required or customer_slot.status == "missing")
)
participants_required = bool(
participants_slot
and (participants_slot.required or participants_slot.status == "missing")
)
fields = [
UserAgentReviewEditField(
@@ -98,13 +112,20 @@ class UserAgentReviewProfileMixin:
required=False,
group="basic",
),
UserAgentReviewEditField(
key="customer_name",
label="客户名称",
value=slot_map.get("customer_name").value if slot_map.get("customer_name") else "",
placeholder="请输入客户名称",
group="business",
),
]
if expense_type_code == "entertainment" or customer_required or customer_name:
fields.append(
UserAgentReviewEditField(
key="customer_name",
label="客户名称",
value=customer_name,
placeholder="请输入客户名称",
group="business",
)
)
fields.append(
UserAgentReviewEditField(
key="business_location",
label="业务地点",
@@ -112,15 +133,22 @@ class UserAgentReviewProfileMixin:
placeholder="例如:北京 / 客户现场",
required=False,
group="business",
),
UserAgentReviewEditField(
key="merchant_name",
label="酒店/商户",
value=slot_map.get("merchant_name").value if slot_map.get("merchant_name") else "",
placeholder="请输入酒店或商户名称",
required=False,
group="business",
),
)
)
if expense_type_code == "hotel" or merchant_name:
fields.append(
UserAgentReviewEditField(
key="merchant_name",
label="酒店/商户",
value=merchant_name,
placeholder="请输入酒店或商户名称",
required=False,
group="business",
)
)
fields.extend([
UserAgentReviewEditField(
key="amount",
label="金额",
@@ -128,13 +156,20 @@ class UserAgentReviewProfileMixin:
placeholder="例如200.00元",
group="business",
),
UserAgentReviewEditField(
key="participants",
label="参与人员",
value=slot_map.get("participants").value if slot_map.get("participants") else "",
placeholder="例如:客户 2 人,我方 1 人",
group="business",
),
])
if expense_type_code == "entertainment" or participants_required or participants:
fields.append(
UserAgentReviewEditField(
key="participants",
label="参与人员",
value=participants,
placeholder="例如:客户 2 人,我方 1 人",
group="business",
)
)
fields.extend([
UserAgentReviewEditField(
key="reason",
label="事由",
@@ -152,7 +187,7 @@ class UserAgentReviewProfileMixin:
field_type="textarea",
group="attachments",
),
]
])
return fields

View File

@@ -37,6 +37,7 @@ from app.services.expense_claims import ExpenseClaimService
from app.services.expense_rule_runtime import ExpenseRuleRuntimeService, RuntimeTravelPolicy, resolve_document_type_label
from app.services.risk_ontology_bridge import resolve_rule_codes_for_risk_check
from app.services.travel_reimbursement_calculator import TravelReimbursementCalculatorService
from app.services.expense_type_keywords import resolve_expense_type_label_from_text
from app.services.user_agent_constants import *
@@ -568,27 +569,9 @@ class UserAgentReviewSlotMixin:
@staticmethod
def _normalize_expense_type_input(value: str) -> tuple[str, str]:
# Normalize a free-text expense-type input into a (type code, display label) pair.
#
# NOTE(review): this span is a diff hunk rendered without +/- markers or indentation.
# The hunk header above it shrinks 27 old lines to 9 new ones, so the substring
# keyword chain below is presumably the removed implementation and the
# resolve_expense_type_label_from_text() call its replacement -- confirm against
# the repository before treating both as live code.
#
# Strip ASCII spaces only, so keywords still match when the user typed them spaced out.
compact = str(value or "").replace(" ", "")
# Checks are ordered and the first match wins. Entertainment comes first so that a
# customer ("客户") combined with a dining word is not classified as a plain meal.
if "招待" in compact or ("客户" in compact and any(keyword in compact for keyword in ("吃饭", "用餐", "宴请", "请客"))):
return "entertainment", "业务招待费"
# Travel is tested before hotel/transport, so text containing both resolves to travel.
if any(keyword in compact for keyword in ("差旅", "出差", "机票", "行程")):
return "travel", "差旅费"
if any(keyword in compact for keyword in ("住宿", "酒店", "宾馆")):
return "hotel", "住宿费"
if any(keyword in compact for keyword in ("交通", "打车", "网约车", "出租车", "乘车", "用车", "叫车", "车费", "车资", "的士", "停车")):
return "transport", "交通费"
if any(keyword in compact for keyword in ("餐费", "用餐", "午餐", "晚餐", "早餐", "伙食")):
return "meal", "餐费"
if "会务" in compact:
return "meeting", "会务费"
if any(keyword in compact for keyword in ("办公费", "办公用品", "文具", "耗材", "办公耗材", "打印纸", "办公设备", "键盘", "鼠标", "白板")):
return "office", "办公费"
if any(keyword in compact for keyword in ("培训费", "培训", "讲师费", "课时费", "课程费")):
return "training", "培训费"
if any(keyword in compact for keyword in ("通讯费", "话费", "流量费", "宽带费")):
return "communication", "通讯费"
if any(keyword in compact for keyword in ("福利费", "团建", "慰问", "节日福利", "体检费")):
return "welfare", "福利费"
# Delegate to the shared keyword resolver; it receives the raw value, not `compact`.
# Presumably it returns a (code, label) tuple or None -- TODO confirm in
# app.services.expense_type_keywords.
resolved = resolve_expense_type_label_from_text(value)
if resolved is not None:
return resolved
# Nothing matched: keep the user's own trimmed text as the label, or a generic
# "other expense" label when the input is empty.
return "other", str(value or "").strip() or "其他费用"

View File

@@ -137,14 +137,13 @@ class UserAgentReviewTravelPolicyMixin:
continue
night_count = self._extract_review_hotel_night_count(card)
nightly_amount = (amount / Decimal(max(night_count, 1))).quantize(Decimal("0.01"))
if nightly_amount <= cap:
continue
amount_measurement_lines.append(
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元,"
f"{night_count} 晚折算 {nightly_amount:.2f} 元/晚;"
f"适用标准为 {band_label}{city_tier_label} {cap:.2f} 元/晚,"
f"{'超出标准' if nightly_amount > cap else '测算通过'}"
f"适用标准为 {band_label}{city_tier_label} {cap:.2f} 元/晚,超出标准。"
)
if nightly_amount <= cap:
continue
basis = (
f"依据《{standard_rule_name}》({standard_rule_version}{band_label}{city_tier_label}"
@@ -200,12 +199,11 @@ class UserAgentReviewTravelPolicyMixin:
)
continue
amount_measurement_lines.append(
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元;"
f"适用《{standard_rule_name}{region_label}伙食补助标准 {standard_amount:.2f} 元/天,"
f"{'超出标准' if amount > standard_amount else '测算通过'}"
)
if amount > standard_amount:
amount_measurement_lines.append(
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元;"
f"适用《{standard_rule_name}{region_label}伙食补助标准 {standard_amount:.2f} 元/天,超出标准。"
)
append_once(
f"travel-meal-allowance-over-limit-{card.index}",
UserAgentReviewRiskBrief(
@@ -251,13 +249,6 @@ class UserAgentReviewTravelPolicyMixin:
)
continue
if standard_amount is not None:
amount_measurement_lines.append(
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元;"
f"适用《{scene_policy.rule_name}{metric_label}标准 {standard_amount:.2f} 元,"
f"{'超出标准' if amount > standard_amount else '测算通过'}"
)
amount_risk = self._evaluate_review_scene_amount(
amount=amount,
limit_config=scene_limit,
@@ -265,6 +256,11 @@ class UserAgentReviewTravelPolicyMixin:
)
if amount_risk is not None:
severity, threshold = amount_risk
if standard_amount is not None:
amount_measurement_lines.append(
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元;"
f"适用《{scene_policy.rule_name}{metric_label}标准 {standard_amount:.2f} 元,超出标准。"
)
append_once(
f"{scene_code}-amount-over-limit-{card.index}",
UserAgentReviewRiskBrief(
@@ -348,11 +344,11 @@ class UserAgentReviewTravelPolicyMixin:
briefs.insert(
0,
UserAgentReviewRiskBrief(
title="附件金额测算结果",
level="info",
content="系统根据首轮上传附件识别金额,并匹配当前可执行的报销标准进行测算",
title="附件金额测算异常",
level="warning",
content="系统根据首轮上传附件识别金额后,发现有需要进一步核查或说明的测算结果",
detail="".join(dict.fromkeys(amount_measurement_lines)),
suggestion="如测算结果超标,请补充超标说明、调整金额或更正票据类型后再继续。",
suggestion="请补充超标说明、调整金额或更正票据类型后再继续。",
),
)