feat: 增强知识库索引与设置页面模块化拆分
扩展知识库索引任务和 RAG 检索支持增量入库和文档去重,优化本体检测和规则匹配精度,前端设置页面拆分为 LLM、邮件和 Hermes 员工同步子面板并重构样式,新增日志详情组件和知识入库日志模型,补充单元测试覆盖。
This commit is contained in:
@@ -3,6 +3,8 @@ from __future__ import annotations
|
||||
import re
|
||||
from decimal import Decimal
|
||||
|
||||
from app.services.expense_type_keywords import iter_expense_keywords
|
||||
|
||||
EXPENSE_TYPE_LABELS = {
|
||||
"travel": "差旅",
|
||||
"train_ticket": "火车票",
|
||||
@@ -12,10 +14,10 @@ EXPENSE_TYPE_LABELS = {
|
||||
"travel_allowance": "出差补贴",
|
||||
"hotel": "住宿",
|
||||
"transport": "交通",
|
||||
"meal": "餐费",
|
||||
"meal": "业务招待",
|
||||
"meeting": "会务",
|
||||
"entertainment": "招待",
|
||||
"office": "办公",
|
||||
"office": "办公用品",
|
||||
"training": "培训",
|
||||
"communication": "通讯",
|
||||
"welfare": "福利",
|
||||
@@ -131,40 +133,19 @@ DOCUMENT_ROUTE_DESTINATION_LABELS = {
|
||||
GENERIC_ATTACHMENT_BACKFILL_ITEM_TYPES = {"", "other", "travel", "transport", "hotel"}
|
||||
LOCATION_REQUIRED_EXPENSE_TYPES = {"travel", "meeting", "entertainment"}
|
||||
EXPENSE_SCENE_KEYWORDS = {
|
||||
"travel": ("差旅", "出差", "行程"),
|
||||
"hotel": ("酒店", "住宿", "房费", "客房", "入住", "离店"),
|
||||
"transport": (
|
||||
"交通",
|
||||
"打车",
|
||||
"出租车",
|
||||
"网约车",
|
||||
"滴滴",
|
||||
"出行",
|
||||
"乘车",
|
||||
"用车",
|
||||
"叫车",
|
||||
"车费",
|
||||
"车资",
|
||||
"的士",
|
||||
"高铁",
|
||||
"动车",
|
||||
"火车",
|
||||
"机票",
|
||||
"航班",
|
||||
"行程单",
|
||||
"登机",
|
||||
"客票",
|
||||
"公交",
|
||||
"地铁",
|
||||
"过路费",
|
||||
"通行费",
|
||||
"停车",
|
||||
),
|
||||
"meal": ("餐饮", "餐费", "用餐", "外卖", "快餐", "酒楼", "饭店", "饭馆", "食品", "咖啡"),
|
||||
"entertainment": ("招待", "宴请", "接待", "客户餐", "商务餐", "业务招待"),
|
||||
"office": ("办公", "办公用品", "文具", "耗材", "打印", "纸张", "硒鼓", "墨盒", "鼠标", "键盘", "电脑"),
|
||||
"meeting": ("会议", "会务", "会展", "会议室", "会场", "场地费", "论坛"),
|
||||
"training": ("培训", "课程", "讲师", "教材", "学费", "认证"),
|
||||
code: tuple(iter_expense_keywords(code))
|
||||
for code in (
|
||||
"travel",
|
||||
"hotel",
|
||||
"transport",
|
||||
"meal",
|
||||
"entertainment",
|
||||
"office",
|
||||
"meeting",
|
||||
"training",
|
||||
"communication",
|
||||
"welfare",
|
||||
)
|
||||
}
|
||||
EXPENSE_TYPE_ALLOWED_DOCUMENT_SCENES = {
|
||||
"travel": {"travel", "hotel", "transport", "meal"},
|
||||
@@ -185,7 +166,7 @@ DOCUMENT_SCENE_LABELS = {
|
||||
"travel": "差旅",
|
||||
"hotel": "住宿",
|
||||
"transport": "交通",
|
||||
"meal": "餐饮",
|
||||
"meal": "业务招待",
|
||||
"entertainment": "业务招待",
|
||||
"office": "办公用品",
|
||||
"meeting": "会务",
|
||||
|
||||
@@ -87,6 +87,7 @@ from app.services.expense_claim_constants import (
|
||||
TRAVEL_POLICY_TRAIN_CLASS_PATTERNS,
|
||||
TRAVEL_POLICY_HOTEL_NIGHT_PATTERN,
|
||||
)
|
||||
from app.services.expense_type_keywords import resolve_expense_type_code_from_text
|
||||
from app.services.expense_claim_risk_review import ExpenseClaimRiskReviewMixin
|
||||
from app.services.expense_amounts import (
|
||||
extract_amount_candidates,
|
||||
@@ -209,26 +210,7 @@ class ExpenseClaimOntologyResolverMixin:
|
||||
or ""
|
||||
).replace(" ", "")
|
||||
if compact:
|
||||
if "招待" in compact or ("客户" in compact and any(word in compact for word in ("吃饭", "宴请", "请客", "用餐"))):
|
||||
return "entertainment"
|
||||
if any(word in compact for word in ("差旅", "出差", "机票", "行程")):
|
||||
return "travel"
|
||||
if any(word in compact for word in ("住宿", "酒店", "宾馆")):
|
||||
return "hotel"
|
||||
if any(word in compact for word in ("交通", "打车", "网约车", "出租车", "乘车", "用车", "叫车", "车费", "车资", "的士", "停车")):
|
||||
return "transport"
|
||||
if any(word in compact for word in ("餐费", "用餐", "午餐", "晚餐", "早餐", "伙食")):
|
||||
return "meal"
|
||||
if "会务" in compact:
|
||||
return "meeting"
|
||||
if any(word in compact for word in ("办公费", "办公用品", "文具", "耗材", "办公耗材", "打印纸", "办公设备", "键盘", "鼠标", "白板")):
|
||||
return "office"
|
||||
if any(word in compact for word in ("培训费", "培训", "讲师费", "课时费", "课程费")):
|
||||
return "training"
|
||||
if any(word in compact for word in ("通讯费", "话费", "流量费", "宽带费")):
|
||||
return "communication"
|
||||
if any(word in compact for word in ("福利费", "团建", "慰问", "节日福利", "体检费")):
|
||||
return "welfare"
|
||||
return resolve_expense_type_code_from_text(compact)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -538,8 +538,8 @@ class ExpenseRuleRuntimeService:
|
||||
if any(keyword in normalized for keyword in ("市内交通", "打车", "网约车", "出租车")):
|
||||
return "transport"
|
||||
if "招待" in normalized and "餐" in normalized:
|
||||
return "entertainment"
|
||||
if "餐补" in normalized or normalized == "餐费":
|
||||
return "meal"
|
||||
if "餐补" in normalized or normalized in {"餐费", "业务招待费"}:
|
||||
return "meal"
|
||||
return ""
|
||||
|
||||
@@ -547,7 +547,7 @@ class ExpenseRuleRuntimeService:
|
||||
def _spreadsheet_metric_label(expense_type: str) -> str:
|
||||
return {
|
||||
"transport": "单笔交通金额",
|
||||
"meal": "差旅餐补金额",
|
||||
"meal": "业务招待费金额",
|
||||
"entertainment": "人均招待餐费",
|
||||
}.get(expense_type, "金额")
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ EXPENSE_RULE_CODE_BLOCK_PATTERN = re.compile(r"```expense-rule\s*(\{.*?\})\s*```
|
||||
DOCUMENT_TYPE_LABELS = {
|
||||
"flight_itinerary": "机票/航班行程单",
|
||||
"train_ticket": "火车/高铁票",
|
||||
"ship_ticket": "轮船票",
|
||||
"hotel_invoice": "酒店住宿票据",
|
||||
"taxi_receipt": "出租车/网约车票据",
|
||||
"parking_toll_receipt": "停车/通行费票据",
|
||||
@@ -24,9 +25,9 @@ SCENE_LABELS = {
|
||||
"travel": "差旅",
|
||||
"hotel": "住宿",
|
||||
"transport": "交通",
|
||||
"meal": "餐饮",
|
||||
"meal": "业务招待",
|
||||
"entertainment": "业务招待",
|
||||
"office": "办公",
|
||||
"office": "办公用品",
|
||||
"meeting": "会务",
|
||||
"training": "培训",
|
||||
"communication": "通讯",
|
||||
@@ -73,7 +74,7 @@ DEFAULT_SCENE_MATRIX_CONFIG: dict[str, Any] = {
|
||||
},
|
||||
},
|
||||
"meal": {
|
||||
"label": "餐费",
|
||||
"label": "业务招待费",
|
||||
"location_required": False,
|
||||
"min_attachment_count": 1,
|
||||
"allowed_scene_codes": ["meal"],
|
||||
@@ -84,7 +85,7 @@ DEFAULT_SCENE_MATRIX_CONFIG: dict[str, Any] = {
|
||||
"warn_amount": "300.00",
|
||||
"block_amount": "800.00",
|
||||
"exception_keywords": ["客户接待", "团队活动", "加班", "展会", "超标说明"],
|
||||
"metric_label": "餐费合计",
|
||||
"metric_label": "业务招待费合计",
|
||||
},
|
||||
},
|
||||
"entertainment": {
|
||||
@@ -103,7 +104,7 @@ DEFAULT_SCENE_MATRIX_CONFIG: dict[str, Any] = {
|
||||
},
|
||||
},
|
||||
"office": {
|
||||
"label": "办公费",
|
||||
"label": "办公用品费",
|
||||
"location_required": False,
|
||||
"min_attachment_count": 1,
|
||||
"allowed_scene_codes": ["office"],
|
||||
@@ -114,7 +115,7 @@ DEFAULT_SCENE_MATRIX_CONFIG: dict[str, Any] = {
|
||||
"warn_amount": "1500.00",
|
||||
"block_amount": "5000.00",
|
||||
"exception_keywords": ["批量采购", "固定资产", "部门集中采购", "超标说明"],
|
||||
"metric_label": "办公费合计",
|
||||
"metric_label": "办公用品费合计",
|
||||
},
|
||||
},
|
||||
"meeting": {
|
||||
|
||||
245
server/src/app/services/expense_type_keywords.py
Normal file
245
server/src/app/services/expense_type_keywords.py
Normal file
@@ -0,0 +1,245 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
EXPENSE_TYPE_KEYWORD_GROUPS: tuple[tuple[str, str, tuple[str, ...]], ...] = (
|
||||
(
|
||||
"travel",
|
||||
"差旅费",
|
||||
(
|
||||
"差旅费",
|
||||
"差旅",
|
||||
"出差",
|
||||
"外地出差",
|
||||
"跨城交通",
|
||||
"往返车票",
|
||||
"机票",
|
||||
"飞机票",
|
||||
"航班",
|
||||
"登机牌",
|
||||
"行程单",
|
||||
"火车票",
|
||||
"高铁票",
|
||||
"动车票",
|
||||
"铁路客票",
|
||||
"客票",
|
||||
),
|
||||
),
|
||||
(
|
||||
"hotel",
|
||||
"住宿费",
|
||||
(
|
||||
"住宿费",
|
||||
"住宿",
|
||||
"酒店发票",
|
||||
"酒店",
|
||||
"宾馆",
|
||||
"民宿",
|
||||
"房费",
|
||||
"客房",
|
||||
"住店",
|
||||
"入住",
|
||||
"离店",
|
||||
"住宿清单",
|
||||
),
|
||||
),
|
||||
(
|
||||
"transport",
|
||||
"交通费",
|
||||
(
|
||||
"交通费",
|
||||
"交通",
|
||||
"市内交通",
|
||||
"打车",
|
||||
"网约车",
|
||||
"出租车票",
|
||||
"出租车",
|
||||
"的士票",
|
||||
"的士",
|
||||
"滴滴",
|
||||
"曹操出行",
|
||||
"T3出行",
|
||||
"出行",
|
||||
"乘车费",
|
||||
"乘车",
|
||||
"用车",
|
||||
"叫车",
|
||||
"车费",
|
||||
"车资",
|
||||
"公交",
|
||||
"地铁",
|
||||
"停车费",
|
||||
"停车",
|
||||
"过路费",
|
||||
"通行费",
|
||||
"高速费",
|
||||
"燃油费",
|
||||
"油费",
|
||||
),
|
||||
),
|
||||
(
|
||||
"meal",
|
||||
"业务招待费",
|
||||
(
|
||||
"业务招待费",
|
||||
"业务招待",
|
||||
"招待费",
|
||||
"招待",
|
||||
"客户招待",
|
||||
"客户接待",
|
||||
"商务接待",
|
||||
"商务宴请",
|
||||
"宴请",
|
||||
"请客",
|
||||
"请客户",
|
||||
"客户用餐",
|
||||
"客户餐",
|
||||
"客户吃饭",
|
||||
"陪同用餐",
|
||||
"接待餐",
|
||||
"餐费",
|
||||
"伙食费",
|
||||
"伙食",
|
||||
"工作餐",
|
||||
"餐饮",
|
||||
"用餐",
|
||||
"早餐",
|
||||
"午餐",
|
||||
"晚餐",
|
||||
"夜宵",
|
||||
"盒饭",
|
||||
"茶歇",
|
||||
"餐票",
|
||||
"饭票",
|
||||
),
|
||||
),
|
||||
(
|
||||
"meeting",
|
||||
"会务费",
|
||||
(
|
||||
"会务费",
|
||||
"会务",
|
||||
"会议费",
|
||||
"会议",
|
||||
"参会",
|
||||
"会场",
|
||||
"场地费",
|
||||
"论坛",
|
||||
"展会",
|
||||
"研讨会",
|
||||
"峰会",
|
||||
"布展",
|
||||
),
|
||||
),
|
||||
(
|
||||
"office",
|
||||
"办公用品费",
|
||||
(
|
||||
"办公用品费",
|
||||
"办公费",
|
||||
"办公用品",
|
||||
"办公耗材",
|
||||
"办公设备",
|
||||
"办公",
|
||||
"文具",
|
||||
"耗材",
|
||||
"打印纸",
|
||||
"打印",
|
||||
"纸张",
|
||||
"硒鼓",
|
||||
"墨盒",
|
||||
"键盘",
|
||||
"鼠标",
|
||||
"白板",
|
||||
"电脑配件",
|
||||
),
|
||||
),
|
||||
(
|
||||
"training",
|
||||
"培训费",
|
||||
(
|
||||
"培训费",
|
||||
"培训",
|
||||
"讲师费",
|
||||
"讲师",
|
||||
"课时费",
|
||||
"课程费",
|
||||
"课程",
|
||||
"教材",
|
||||
"学费",
|
||||
"考试费",
|
||||
"认证费",
|
||||
"认证",
|
||||
),
|
||||
),
|
||||
(
|
||||
"communication",
|
||||
"通讯费",
|
||||
(
|
||||
"通讯费",
|
||||
"通讯",
|
||||
"话费",
|
||||
"电话费",
|
||||
"手机费",
|
||||
"流量费",
|
||||
"流量",
|
||||
"宽带费",
|
||||
"宽带",
|
||||
"网络费",
|
||||
),
|
||||
),
|
||||
(
|
||||
"welfare",
|
||||
"福利费",
|
||||
(
|
||||
"福利费",
|
||||
"福利",
|
||||
"团建",
|
||||
"慰问",
|
||||
"节日福利",
|
||||
"体检费",
|
||||
"体检",
|
||||
"员工关怀",
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
# Display label for each expense type code, taken from the keyword groups.
EXPENSE_TYPE_LABEL_BY_CODE = dict(
    (group_code, group_label)
    for group_code, group_label, _group_keywords in EXPENSE_TYPE_KEYWORD_GROUPS
)
# "entertainment" has no keyword group of its own, so it gets a label here
# unless a group already supplied one.
if "entertainment" not in EXPENSE_TYPE_LABEL_BY_CODE:
    EXPENSE_TYPE_LABEL_BY_CODE["entertainment"] = "业务招待费"
|
||||
|
||||
|
||||
def build_expense_type_keyword_map() -> dict[str, str]:
    """Return a keyword -> expense type code lookup built from the keyword groups.

    When a keyword is listed in more than one group, the group that comes
    first in ``EXPENSE_TYPE_KEYWORD_GROUPS`` keeps it.
    """
    keyword_to_code: dict[str, str] = {}
    for group_code, _group_label, group_keywords in EXPENSE_TYPE_KEYWORD_GROUPS:
        for word in group_keywords:
            # First writer wins; later groups never overwrite an earlier one.
            if word not in keyword_to_code:
                keyword_to_code[word] = group_code
    return keyword_to_code
|
||||
|
||||
|
||||
def iter_expense_keywords(*codes: str) -> Iterable[str]:
    """Yield the keywords of the requested expense type codes, in group order.

    Blank or falsy codes are ignored. If no usable code remains (including
    when the function is called without arguments), the keywords of every
    group are yielded.
    """
    wanted: set[str] = set()
    for raw_code in codes:
        cleaned = str(raw_code or "").strip()
        if cleaned:
            wanted.add(cleaned)
    for group_code, _group_label, group_keywords in EXPENSE_TYPE_KEYWORD_GROUPS:
        if not wanted or group_code in wanted:
            yield from group_keywords
|
||||
|
||||
|
||||
def resolve_expense_type_code_from_text(value: str) -> str | None:
    """Return the code of the first expense type whose keyword occurs in *value*.

    All whitespace is removed before matching: ASCII spaces, tabs, line
    breaks and the full-width space (U+3000) alike, so a keyword that was
    split by any of them is still recognised.

    Groups are checked in the order they are declared in
    ``EXPENSE_TYPE_KEYWORD_GROUPS``; when keywords of several groups occur in
    the text, the earliest group wins.

    Returns ``None`` when *value* is empty (or whitespace only) or when no
    keyword matches.
    """
    # str.split() without arguments splits on every run of Unicode whitespace,
    # so joining the pieces drops all of it in one pass.
    compact = "".join(str(value or "").split())
    if not compact:
        return None
    for code, _label, keywords in EXPENSE_TYPE_KEYWORD_GROUPS:
        if any(keyword in compact for keyword in keywords):
            return code
    return None
|
||||
|
||||
|
||||
def resolve_expense_type_label_from_text(value: str) -> tuple[str, str] | None:
    """Return ``(code, label)`` for the expense type detected in *value*.

    The code comes from ``resolve_expense_type_code_from_text``; the label is
    looked up in ``EXPENSE_TYPE_LABEL_BY_CODE`` and falls back to the stripped
    input text when the code has no label. Returns ``None`` when no expense
    type is detected.
    """
    matched_code = resolve_expense_type_code_from_text(value)
    if matched_code:
        fallback_label = str(value or "").strip()
        return matched_code, EXPENSE_TYPE_LABEL_BY_CODE.get(matched_code, fallback_label)
    return None
|
||||
@@ -63,6 +63,7 @@ class KnowledgeIndexTaskManager:
|
||||
heartbeat_stop = threading.Event()
|
||||
heartbeat_thread: threading.Thread | None = None
|
||||
tool_call_id = ""
|
||||
knowledge_ingest: dict[str, Any] | None = None
|
||||
tool_request_json = {
|
||||
"agent": AgentName.HERMES.value,
|
||||
"folder": folder,
|
||||
@@ -74,6 +75,10 @@ class KnowledgeIndexTaskManager:
|
||||
run_service = AgentRunService(db)
|
||||
knowledge_service = KnowledgeService(db=db)
|
||||
rag_service = KnowledgeRagService(db=db)
|
||||
knowledge_ingest = _build_initial_knowledge_ingest_state(
|
||||
knowledge_service,
|
||||
document_ids=document_ids,
|
||||
)
|
||||
|
||||
run_service.merge_route_json(
|
||||
agent_run_id,
|
||||
@@ -93,7 +98,18 @@ class KnowledgeIndexTaskManager:
|
||||
"skipped_documents": 0,
|
||||
"percent": 10 if document_ids else 100,
|
||||
},
|
||||
"knowledge_ingest": knowledge_ingest,
|
||||
},
|
||||
result_summary=_build_ingest_running_summary(
|
||||
knowledge_ingest,
|
||||
{
|
||||
"total_documents": len(document_ids),
|
||||
"completed_documents": 0,
|
||||
"failed_documents": 0,
|
||||
"skipped_documents": 0,
|
||||
"percent": 10 if document_ids else 100,
|
||||
},
|
||||
),
|
||||
)
|
||||
tool_call = run_service.record_tool_call(
|
||||
run_id=agent_run_id,
|
||||
@@ -134,44 +150,159 @@ class KnowledgeIndexTaskManager:
|
||||
)
|
||||
heartbeat_thread.start()
|
||||
|
||||
response = rag_service.index_documents(document_ids=document_ids, force=force)
|
||||
succeeded_document_ids = [
|
||||
str(item).strip()
|
||||
for item in list(response.get("succeeded_document_ids") or [])
|
||||
if str(item).strip()
|
||||
]
|
||||
failed_documents = [
|
||||
item
|
||||
for item in list(response.get("failed_documents") or [])
|
||||
if isinstance(item, dict)
|
||||
]
|
||||
responses: list[dict[str, Any]] = []
|
||||
succeeded_document_ids: list[str] = []
|
||||
failed_documents: list[dict[str, str]] = []
|
||||
total_documents = len(document_ids)
|
||||
|
||||
for index, document_id in enumerate(document_ids, start=1):
|
||||
_patch_ingest_document(
|
||||
knowledge_ingest,
|
||||
document_id,
|
||||
{
|
||||
"status": "running",
|
||||
"phase": "indexing",
|
||||
"started_at": datetime.now(UTC).isoformat(),
|
||||
},
|
||||
event=f"开始处理第 {index}/{total_documents} 个文件,正在写入 LightRAG。",
|
||||
)
|
||||
knowledge_ingest["current_document_id"] = document_id
|
||||
_sync_ingest_route_json(
|
||||
run_service,
|
||||
agent_run_id,
|
||||
knowledge_ingest,
|
||||
progress=_build_ingest_progress(knowledge_ingest, total_documents),
|
||||
)
|
||||
|
||||
try:
|
||||
response = rag_service.index_documents(document_ids=[document_id], force=force)
|
||||
except Exception as exc:
|
||||
logger.exception(
|
||||
"Knowledge document index failed run_id=%s doc_id=%s",
|
||||
agent_run_id,
|
||||
document_id,
|
||||
)
|
||||
failed_documents.append(
|
||||
{
|
||||
"document_id": document_id,
|
||||
"status": "exception",
|
||||
"error": str(exc),
|
||||
}
|
||||
)
|
||||
_patch_ingest_document(
|
||||
knowledge_ingest,
|
||||
document_id,
|
||||
{
|
||||
"status": "failed",
|
||||
"phase": "failed",
|
||||
"finished_at": datetime.now(UTC).isoformat(),
|
||||
"error": str(exc),
|
||||
},
|
||||
event=f"归集失败:{exc}",
|
||||
level="error",
|
||||
)
|
||||
knowledge_service.set_document_ingest_statuses(
|
||||
[document_id],
|
||||
KNOWLEDGE_INGEST_STATUS_FAILED,
|
||||
agent_run_id=agent_run_id,
|
||||
)
|
||||
_refresh_ingest_graph(knowledge_ingest)
|
||||
_sync_ingest_route_json(
|
||||
run_service,
|
||||
agent_run_id,
|
||||
knowledge_ingest,
|
||||
progress=_build_ingest_progress(knowledge_ingest, total_documents),
|
||||
)
|
||||
continue
|
||||
|
||||
responses.append(response)
|
||||
response_failed_documents = _extract_failed_documents(response, document_id)
|
||||
document_summary = _extract_document_summary(response, document_id)
|
||||
if response_failed_documents:
|
||||
failed_documents.extend(response_failed_documents)
|
||||
error_text = (
|
||||
response_failed_documents[0].get("error") or "LightRAG 未返回可查询状态"
|
||||
)
|
||||
_patch_ingest_document(
|
||||
knowledge_ingest,
|
||||
document_id,
|
||||
{
|
||||
**document_summary,
|
||||
"status": "failed",
|
||||
"phase": "failed",
|
||||
"finished_at": datetime.now(UTC).isoformat(),
|
||||
"error": error_text,
|
||||
"track_id": str(response.get("track_id") or "").strip(),
|
||||
},
|
||||
event=f"LightRAG 索引失败:{error_text}",
|
||||
level="error",
|
||||
)
|
||||
knowledge_service.set_document_ingest_statuses(
|
||||
[document_id],
|
||||
KNOWLEDGE_INGEST_STATUS_FAILED,
|
||||
agent_run_id=agent_run_id,
|
||||
)
|
||||
else:
|
||||
succeeded_document_ids.append(document_id)
|
||||
chunk_count = int(document_summary.get("chunk_count") or 0)
|
||||
entity_count = int(document_summary.get("entity_count") or 0)
|
||||
relation_count = int(document_summary.get("relation_count") or 0)
|
||||
_patch_ingest_document(
|
||||
knowledge_ingest,
|
||||
document_id,
|
||||
{
|
||||
**document_summary,
|
||||
"status": "succeeded",
|
||||
"phase": "indexed",
|
||||
"finished_at": datetime.now(UTC).isoformat(),
|
||||
"track_id": str(response.get("track_id") or "").strip(),
|
||||
},
|
||||
event=(
|
||||
"LightRAG 索引完成:"
|
||||
f"{chunk_count} 个 chunk,{entity_count} 个实体,"
|
||||
f"{relation_count} 条关系。"
|
||||
),
|
||||
)
|
||||
knowledge_service.set_document_ingest_statuses(
|
||||
[document_id],
|
||||
KNOWLEDGE_INGEST_STATUS_INGESTED,
|
||||
agent_run_id=agent_run_id,
|
||||
)
|
||||
_refresh_ingest_graph(knowledge_ingest)
|
||||
_sync_ingest_route_json(
|
||||
run_service,
|
||||
agent_run_id,
|
||||
knowledge_ingest,
|
||||
progress=_build_ingest_progress(knowledge_ingest, total_documents),
|
||||
)
|
||||
|
||||
failed_document_ids = [
|
||||
str(item.get("document_id") or "").strip()
|
||||
for item in failed_documents
|
||||
if str(item.get("document_id") or "").strip()
|
||||
]
|
||||
|
||||
if succeeded_document_ids:
|
||||
knowledge_service.set_document_ingest_statuses(
|
||||
succeeded_document_ids,
|
||||
KNOWLEDGE_INGEST_STATUS_INGESTED,
|
||||
agent_run_id=agent_run_id,
|
||||
)
|
||||
if failed_document_ids:
|
||||
knowledge_service.set_document_ingest_statuses(
|
||||
failed_document_ids,
|
||||
KNOWLEDGE_INGEST_STATUS_FAILED,
|
||||
agent_run_id=agent_run_id,
|
||||
)
|
||||
|
||||
duration_ms = int((perf_counter() - started) * 1000)
|
||||
tool_status = "succeeded" if not failed_document_ids else "failed"
|
||||
latest_track_id = _resolve_latest_track_id(responses)
|
||||
knowledge_ingest["current_document_id"] = ""
|
||||
knowledge_ingest["status"] = tool_status
|
||||
knowledge_ingest["phase"] = "completed"
|
||||
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
|
||||
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
|
||||
heartbeat_stop.set()
|
||||
if heartbeat_thread is not None:
|
||||
heartbeat_thread.join(timeout=1)
|
||||
run_service.update_tool_call(
|
||||
tool_call_id,
|
||||
response_json=response,
|
||||
response_json={
|
||||
"track_id": latest_track_id,
|
||||
"requested_document_ids": document_ids,
|
||||
"succeeded_document_ids": succeeded_document_ids,
|
||||
"failed_documents": failed_documents,
|
||||
"documents": knowledge_ingest.get("documents", []),
|
||||
"responses": responses,
|
||||
},
|
||||
status=tool_status,
|
||||
duration_ms=duration_ms,
|
||||
error_message=None if tool_status == "succeeded" else "部分文档索引失败。",
|
||||
@@ -183,14 +314,17 @@ class KnowledgeIndexTaskManager:
|
||||
summary = (
|
||||
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引。"
|
||||
if failed_count == 0
|
||||
else f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引,失败 {failed_count} 个。"
|
||||
else (
|
||||
f"LightRAG 已完成 {completed_documents}/{total_documents} 个知识文档索引,"
|
||||
f"失败 {failed_count} 个。"
|
||||
)
|
||||
)
|
||||
run_service.merge_route_json(
|
||||
agent_run_id,
|
||||
{
|
||||
"job_type": "knowledge_index_sync",
|
||||
"phase": "completed",
|
||||
"track_id": str(response.get("track_id") or "").strip(),
|
||||
"track_id": latest_track_id,
|
||||
"heartbeat_at": datetime.now(UTC).isoformat(),
|
||||
"progress": {
|
||||
"total_documents": total_documents,
|
||||
@@ -199,6 +333,7 @@ class KnowledgeIndexTaskManager:
|
||||
"skipped_documents": 0,
|
||||
"percent": 100,
|
||||
},
|
||||
"knowledge_ingest": knowledge_ingest,
|
||||
},
|
||||
status=(
|
||||
AgentRunStatus.SUCCEEDED.value
|
||||
@@ -234,24 +369,50 @@ class KnowledgeIndexTaskManager:
|
||||
error_message=str(exc),
|
||||
)
|
||||
KnowledgeService(db=db).set_document_ingest_statuses(
|
||||
document_ids,
|
||||
_resolve_failed_ingest_document_ids(knowledge_ingest, document_ids),
|
||||
KNOWLEDGE_INGEST_STATUS_FAILED,
|
||||
agent_run_id=agent_run_id,
|
||||
)
|
||||
if knowledge_ingest is not None:
|
||||
for document_id in document_ids:
|
||||
document = _find_ingest_document(knowledge_ingest, document_id)
|
||||
if document is None or document.get("status") in {"succeeded", "failed"}:
|
||||
continue
|
||||
_patch_ingest_document(
|
||||
knowledge_ingest,
|
||||
document_id,
|
||||
{
|
||||
"status": "failed",
|
||||
"phase": "failed",
|
||||
"finished_at": datetime.now(UTC).isoformat(),
|
||||
"error": str(exc),
|
||||
},
|
||||
event=f"归集任务中断:{exc}",
|
||||
level="error",
|
||||
)
|
||||
knowledge_ingest["status"] = "failed"
|
||||
knowledge_ingest["phase"] = "failed"
|
||||
knowledge_ingest["current_document_id"] = ""
|
||||
knowledge_ingest["finished_at"] = datetime.now(UTC).isoformat()
|
||||
knowledge_ingest["graph"] = _build_ingest_graph(knowledge_ingest)
|
||||
|
||||
route_payload: dict[str, Any] = {
|
||||
"job_type": "knowledge_index_sync",
|
||||
"phase": "failed",
|
||||
"heartbeat_at": datetime.now(UTC).isoformat(),
|
||||
"progress": {
|
||||
"total_documents": len(document_ids),
|
||||
"completed_documents": 0,
|
||||
"failed_documents": len(document_ids),
|
||||
"skipped_documents": 0,
|
||||
"percent": 100,
|
||||
},
|
||||
}
|
||||
if knowledge_ingest is not None:
|
||||
route_payload["knowledge_ingest"] = knowledge_ingest
|
||||
AgentRunService(db).merge_route_json(
|
||||
agent_run_id,
|
||||
{
|
||||
"job_type": "knowledge_index_sync",
|
||||
"phase": "failed",
|
||||
"heartbeat_at": datetime.now(UTC).isoformat(),
|
||||
"progress": {
|
||||
"total_documents": len(document_ids),
|
||||
"completed_documents": 0,
|
||||
"failed_documents": len(document_ids),
|
||||
"skipped_documents": 0,
|
||||
"percent": 100,
|
||||
},
|
||||
},
|
||||
route_payload,
|
||||
status=AgentRunStatus.FAILED.value,
|
||||
result_summary=str(exc),
|
||||
error_message=str(exc),
|
||||
@@ -267,4 +428,312 @@ class KnowledgeIndexTaskManager:
|
||||
db.close()
|
||||
|
||||
|
||||
def _build_initial_knowledge_ingest_state(
    knowledge_service: KnowledgeService,
    *,
    document_ids: list[str],
) -> dict[str, Any]:
    """Build the initial ``knowledge_ingest`` payload with every document queued.

    One document record is created per id, all sharing the same timestamp.
    The first document (if any) is marked as the current one and the
    aggregated graph section is computed from the fresh records.
    """
    queued_at = datetime.now(UTC).isoformat()
    queued_documents: list[dict[str, Any]] = []
    for doc_id in document_ids:
        queued_documents.append(
            _build_initial_knowledge_ingest_document(knowledge_service, doc_id, now=queued_at)
        )
    first_document_id = queued_documents[0]["document_id"] if queued_documents else ""
    initial_graph = _build_ingest_graph({"documents": queued_documents})
    return {
        "schema_version": 1,
        "status": "running",
        "phase": "queued",
        "started_at": queued_at,
        "finished_at": None,
        "current_document_id": first_document_id,
        "documents": queued_documents,
        "graph": initial_graph,
    }
|
||||
|
||||
|
||||
def _build_initial_knowledge_ingest_document(
    knowledge_service: KnowledgeService,
    document_id: str,
    *,
    now: str,
) -> dict[str, Any]:
    """Build the queued-state record for one document of an ingest run.

    Metadata (name, folder, extension, MIME type) is read from
    ``knowledge_service.get_document_entry``. The lookup is best effort: if it
    raises or returns nothing, the record is still created, with the document
    id used as its name and the other metadata fields left empty.

    Args:
        knowledge_service: Service used to look up the document entry.
        document_id: Id of the document being queued.
        now: ISO timestamp stamped on the initial "queued" event.

    Returns:
        A dict with status/phase ``"queued"``, zeroed counters, empty
        section/chunk/entity/relation lists and a single initial event.
    """
    try:
        # ``or {}`` keeps a missing entry (None / empty) from crashing the
        # ``entry.get`` calls below, which sit outside this try block.
        entry = knowledge_service.get_document_entry(document_id) or {}
    except Exception:
        # Deliberate best effort: missing metadata must not abort the run.
        entry = {}
    return {
        "document_id": document_id,
        "name": str(entry.get("original_name") or document_id).strip(),
        "folder": str(entry.get("folder") or "").strip(),
        "extension": str(entry.get("extension") or "").strip(),
        "mime_type": str(entry.get("mime_type") or "").strip(),
        "status": "queued",
        "phase": "queued",
        "started_at": None,
        "finished_at": None,
        "text_chars": 0,
        "indexed_text_chars": 0,
        "section_count": 0,
        "sections": [],
        "chunk_count": 0,
        "chunk_ids": [],
        "chunks": [],
        "entity_count": 0,
        "relation_count": 0,
        "entities": [],
        "relations": [],
        "events": [
            {
                "at": now,
                "level": "info",
                "message": "已进入知识归集队列,等待 LightRAG 处理。",
            }
        ],
    }
|
||||
|
||||
|
||||
def _patch_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
    updates: dict[str, Any],
    *,
    event: str = "",
    level: str = "info",
) -> None:
    """Merge *updates* into one document record and optionally log an event.

    Does nothing when the document is not part of *knowledge_ingest*. When
    *event* is non-empty it is appended to the document's event list with the
    given *level*.
    """
    target = _find_ingest_document(knowledge_ingest, document_id)
    if target is not None:
        target.update(updates)
        if event:
            _append_ingest_event(target, event, level=level)
|
||||
|
||||
|
||||
def _append_ingest_event(document: dict[str, Any], message: str, *, level: str) -> None:
|
||||
events = document.get("events")
|
||||
if not isinstance(events, list):
|
||||
events = []
|
||||
events.append(
|
||||
{
|
||||
"at": datetime.now(UTC).isoformat(),
|
||||
"level": level,
|
||||
"message": message,
|
||||
}
|
||||
)
|
||||
document["events"] = events[-30:]
|
||||
|
||||
|
||||
def _find_ingest_document(
    knowledge_ingest: dict[str, Any],
    document_id: str,
) -> dict[str, Any] | None:
    """Return the first document record whose stripped id equals *document_id*.

    Non-dict entries are skipped. Returns ``None`` when nothing matches.
    """
    candidates = list(knowledge_ingest.get("documents") or [])
    return next(
        (
            candidate
            for candidate in candidates
            if isinstance(candidate, dict)
            and str(candidate.get("document_id") or "").strip() == document_id
        ),
        None,
    )
|
||||
|
||||
|
||||
def _sync_ingest_route_json(
    run_service: AgentRunService,
    agent_run_id: str,
    knowledge_ingest: dict[str, Any],
    *,
    progress: dict[str, int],
) -> None:
    """Push the current ingest state and progress to the agent run.

    Merges an "indexing"-phase payload (with a fresh heartbeat timestamp) into
    the run's route JSON and sets the result summary to the running summary
    text for the current state.
    """
    route_patch = {
        "job_type": "knowledge_index_sync",
        "phase": "indexing",
        "heartbeat_at": datetime.now(UTC).isoformat(),
        "progress": progress,
        "knowledge_ingest": knowledge_ingest,
    }
    running_summary = _build_ingest_running_summary(knowledge_ingest, progress)
    run_service.merge_route_json(
        agent_run_id,
        route_patch,
        result_summary=running_summary,
    )
|
||||
|
||||
|
||||
def _build_ingest_running_summary(
    knowledge_ingest: dict[str, Any],
    progress: dict[str, int],
) -> str:
    """Return the user-facing one-line summary of a running ingest job.

    When the state names a current document that exists in the document list,
    the summary mentions its position and name; otherwise a generic
    completed/failed count is returned.
    """
    total = int(progress.get("total_documents") or 0)
    done = int(progress.get("completed_documents") or 0)
    failed = int(progress.get("failed_documents") or 0)
    active_id = str(knowledge_ingest.get("current_document_id") or "").strip()
    active_document = _find_ingest_document(knowledge_ingest, active_id) if active_id else None
    if active_document is None:
        return (
            f"知识归纳正在运行,已完成 {done}/{total} 个文档,"
            f"失败 {failed} 个。"
        )
    display_name = str(active_document.get("name") or active_id).strip()
    position = _resolve_ingest_document_index(knowledge_ingest, active_id)
    return (
        f"知识归纳正在处理 {position}/{total}:{display_name}。"
        f"已完成 {done} 个,失败 {failed} 个。"
    )
|
||||
|
||||
|
||||
def _resolve_ingest_document_index(
    knowledge_ingest: dict[str, Any],
    document_id: str,
) -> int:
    """Return the 1-based position of *document_id* among the document records.

    Only dict entries are counted. Returns 0 when the document is not found.
    """
    position = 0
    for entry in knowledge_ingest.get("documents") or []:
        if not isinstance(entry, dict):
            continue
        position += 1
        if str(entry.get("document_id") or "").strip() == document_id:
            return position
    return 0
|
||||
|
||||
|
||||
def _build_ingest_progress(
    knowledge_ingest: dict[str, Any],
    total_documents: int,
) -> dict[str, int]:
    """Compute progress counters and a percentage from the document statuses.

    Documents are tallied by status ("succeeded", "failed", "skipped"). While
    the job runs the percentage stays within 10..95, scaled by the share of
    finished documents; with no documents to process it is reported as 100.
    """
    tallies = {"succeeded": 0, "failed": 0, "skipped": 0}
    for entry in knowledge_ingest.get("documents") or []:
        if not isinstance(entry, dict):
            continue
        state = entry.get("status")
        for label in tallies:
            if state == label:
                tallies[label] += 1
    finished = tallies["succeeded"] + tallies["failed"] + tallies["skipped"]
    if total_documents > 0:
        percent = min(95, max(10, 10 + int(finished * 85 / total_documents)))
    else:
        percent = 100
    return {
        "total_documents": total_documents,
        "completed_documents": tallies["succeeded"],
        "failed_documents": tallies["failed"],
        "skipped_documents": tallies["skipped"],
        "percent": percent,
    }
|
||||
|
||||
|
||||
def _extract_document_summary(response: dict[str, Any], document_id: str) -> dict[str, Any]:
    """Return a copy of the summary for *document_id* from an index response.

    Looks through ``response["document_summaries"]``, ignoring non-dict
    entries. Returns an empty dict when no summary matches.
    """
    summaries = list(response.get("document_summaries") or [])
    matching = (
        dict(candidate)
        for candidate in summaries
        if isinstance(candidate, dict)
        and str(candidate.get("document_id") or "").strip() == document_id
    )
    return next(matching, {})
|
||||
|
||||
|
||||
def _extract_failed_documents(
    response: dict[str, Any],
    document_id: str,
) -> list[dict[str, str]]:
    """Collect normalised failure records for *document_id* from a response.

    Entries of ``response["failed_documents"]`` that name a different document
    are dropped; entries without an id are attributed to *document_id*.
    Missing status and error fall back to ``"failed"`` and a default message.
    """
    collected: list[dict[str, str]] = []
    for raw in response.get("failed_documents") or []:
        if not isinstance(raw, dict):
            continue
        owner_id = str(raw.get("document_id") or "").strip()
        if not owner_id or owner_id == document_id:
            collected.append(
                {
                    "document_id": owner_id or document_id,
                    "status": str(raw.get("status") or "failed").strip(),
                    "error": str(raw.get("error") or "LightRAG 索引失败").strip(),
                }
            )
    return collected
|
||||
|
||||
|
||||
def _resolve_failed_ingest_document_ids(
    knowledge_ingest: dict[str, Any] | None,
    document_ids: list[str],
) -> list[str]:
    """Return the ids that should be marked failed after an aborted run.

    Without an ingest state every requested id is returned. Otherwise the
    result is every tracked document whose status is not ``"succeeded"``,
    followed by requested ids that the state does not track at all.
    """
    if knowledge_ingest is None:
        return document_ids
    known_ids: set[str] = set()
    unfinished_ids: list[str] = []
    for entry in knowledge_ingest.get("documents") or []:
        if not isinstance(entry, dict):
            continue
        entry_id = str(entry.get("document_id") or "").strip()
        if entry_id:
            known_ids.add(entry_id)
            if entry.get("status") != "succeeded":
                unfinished_ids.append(entry_id)
    untracked_ids = [requested for requested in document_ids if requested not in known_ids]
    return unfinished_ids + untracked_ids
|
||||
|
||||
|
||||
def _refresh_ingest_graph(knowledge_ingest: dict[str, Any]) -> None:
    """Recompute the aggregated ``graph`` section of the ingest state in place."""
    rebuilt_graph = _build_ingest_graph(knowledge_ingest)
    knowledge_ingest["graph"] = rebuilt_graph
|
||||
|
||||
|
||||
def _build_ingest_graph(knowledge_ingest: dict[str, Any]) -> dict[str, Any]:
    """Aggregate chunk/entity/relation data across all document records.

    Counts are summed over the documents. Entities and relations are
    de-duplicated and each list is capped at 60 items.
    """
    chunk_total = 0
    entity_total = 0
    relation_total = 0
    all_entities: list[Any] = []
    all_relations: list[Any] = []
    for record in list(knowledge_ingest.get("documents") or []):
        if not isinstance(record, dict):
            continue
        chunk_total += _to_int(record.get("chunk_count"))
        entity_total += _to_int(record.get("entity_count"))
        relation_total += _to_int(record.get("relation_count"))
        all_entities.extend(list(record.get("entities") or []))
        all_relations.extend(list(record.get("relations") or []))
    unique_entities = _dedupe_text_items(all_entities)
    unique_relations = _dedupe_relations(all_relations)
    return {
        "chunk_count": chunk_total,
        "entity_count": entity_total,
        "relation_count": relation_total,
        "entities": unique_entities[:60],
        "relations": unique_relations[:60],
    }
|
||||
|
||||
|
||||
def _dedupe_text_items(items: Any) -> list[str]:
    """Return the distinct non-empty stripped strings of *items*, in first-seen order.

    Falsy items are treated as empty text and dropped.
    """
    # A dict keeps insertion order, so its keys double as an ordered set.
    unique: dict[str, None] = {}
    for raw in items:
        label = str(raw or "").strip()
        if label:
            unique.setdefault(label, None)
    return list(unique)
|
||||
|
||||
|
||||
def _dedupe_relations(items: Any) -> list[dict[str, str]]:
    """Return distinct ``{source, target, type}`` relations in first-seen order.

    Non-dict items and relations lacking a source or target are dropped. A
    missing type falls back to the default relation label.
    """
    unique: dict[tuple[str, str, str], dict[str, str]] = {}
    for raw in items:
        if not isinstance(raw, dict):
            continue
        src = str(raw.get("source") or "").strip()
        dst = str(raw.get("target") or "").strip()
        kind = str(raw.get("type") or "关联").strip()
        if src and dst:
            unique.setdefault((src, dst, kind), {"source": src, "target": dst, "type": kind})
    return list(unique.values())
|
||||
|
||||
|
||||
def _resolve_latest_track_id(responses: list[dict[str, Any]]) -> str:
|
||||
for response in reversed(responses):
|
||||
track_id = str(response.get("track_id") or "").strip()
|
||||
if track_id:
|
||||
return track_id
|
||||
return ""
|
||||
|
||||
|
||||
def _to_int(value: Any) -> int:
|
||||
try:
|
||||
return int(value or 0)
|
||||
except (TypeError, ValueError):
|
||||
return 0
|
||||
|
||||
|
||||
# Module-level manager instance, created once when this module is imported.
knowledge_index_task_manager = KnowledgeIndexTaskManager()
|
||||
|
||||
224
server/src/app/services/knowledge_ingest_log.py
Normal file
224
server/src/app/services/knowledge_ingest_log.py
Normal file
@@ -0,0 +1,224 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Upper bounds used by the helpers in this module when trimming the ingest-log
# payload (list lengths and preview-text size).
MAX_INGEST_LOG_CHUNKS = 24
|
||||
MAX_INGEST_LOG_ENTITIES = 24
|
||||
MAX_INGEST_LOG_RELATIONS = 24
|
||||
MAX_INGEST_LOG_SECTIONS = 12
|
||||
MAX_INGEST_LOG_TEXT_PREVIEW = 180
|
||||
|
||||
# Matches a whole line that is either a Markdown-style heading (one to four
# "#" characters, whitespace, then text) or a numbered Chinese heading such as
# "第三章 ..." / "第12条 ...".
INGEST_SECTION_HEADING_PATTERN = re.compile(
    r"^(?:#{1,4}\s+.+"
    r"|第[一二三四五六七八九十百零0-9]+[章节条]\s*.*)$"
)
|
||||
|
||||
|
||||
def build_ingest_document_summary(
    *,
    document_id: str,
    entry: dict[str, Any],
    raw_text: str,
    indexed_text: str,
) -> dict[str, Any]:
    """Build the initial ingest-log summary for one document.

    Descriptive fields are copied (stripped) from ``entry``; character counts
    come from ``raw_text`` and ``indexed_text``; sections are extracted from
    the indexed text. All chunk / entity / relation fields start out empty.
    """

    def _entry_text(key: str) -> str:
        return str(entry.get(key) or "").strip()

    indexed_body = str(indexed_text or "")
    raw_body = str(raw_text or "")
    sections = _extract_ingest_sections(indexed_body)

    summary: dict[str, Any] = {
        "document_id": document_id,
        "name": _entry_text("original_name"),
        "folder": _entry_text("folder"),
        "extension": _entry_text("extension"),
        "mime_type": _entry_text("mime_type"),
        "text_chars": len(raw_body),
        "indexed_text_chars": len(indexed_body),
        "section_count": len(sections),
        "sections": sections,
        "chunk_count": 0,
        "chunk_ids": [],
        "chunks": [],
        "entity_count": 0,
        "relation_count": 0,
        "entities": [],
        "relations": [],
    }
    return summary
|
||||
|
||||
|
||||
def build_ingest_status_summary(
    *,
    status_payload: dict[str, Any],
    graph_summary: dict[str, Any],
) -> dict[str, Any]:
    """Combine a document's status payload with its graph summary.

    The status-derived fields are written first and ``graph_summary`` is merged
    on top, so a key present in both takes the graph summary's value.
    """
    chunk_ids = _normalize_chunk_ids(status_payload)
    summary: dict[str, Any] = {
        "lightrag_status": str(status_payload.get("status") or "").strip(),
        "query_ready": bool(status_payload.get("query_ready")),
        "chunk_count": _resolve_chunk_count(status_payload, chunk_ids),
        "chunk_ids": chunk_ids[:MAX_INGEST_LOG_CHUNKS],
    }
    summary.update(graph_summary)
    return summary
|
||||
|
||||
|
||||
def build_document_graph_summary(
    storage_root: Path,
    *,
    workspace: str,
    document_id: str,
) -> dict[str, Any]:
    """Summarize the entities, relations and chunks stored for one document.

    Reads the three ``kv_store_*.json`` files under
    ``<storage_root>/knowledge/.lightrag/<workspace>`` and returns the counts
    plus capped lists for ``document_id``.
    """
    lightrag_root = Path(storage_root) / "knowledge" / ".lightrag"
    workspace_dir = (lightrag_root / str(workspace).strip()).resolve()

    def _load(file_name: str) -> dict[str, Any]:
        return _load_json_file(workspace_dir / file_name)

    entities = _normalize_document_entities(_load("kv_store_full_entities.json"), document_id)
    relations = _normalize_document_relations(_load("kv_store_full_relations.json"), document_id)
    chunks = _normalize_document_chunks(_load("kv_store_text_chunks.json"), document_id)

    return {
        "entity_count": len(entities),
        "relation_count": len(relations),
        "entities": entities[:MAX_INGEST_LOG_ENTITIES],
        "relations": relations[:MAX_INGEST_LOG_RELATIONS],
        "chunks": chunks[:MAX_INGEST_LOG_CHUNKS],
    }
|
||||
|
||||
|
||||
def _extract_ingest_sections(text: str) -> list[dict[str, str]]:
    """Collect heading lines from ``text`` together with a short excerpt.

    A line counts as a heading when it is non-empty, at most 90 characters
    long and matches ``INGEST_SECTION_HEADING_PATTERN``. At most
    ``MAX_INGEST_LOG_SECTIONS`` sections are returned.
    """
    stripped_lines = [raw.strip() for raw in str(text or "").splitlines()]
    found: list[dict[str, str]] = []
    for position, heading in enumerate(stripped_lines):
        if len(found) >= MAX_INGEST_LOG_SECTIONS:
            break
        is_heading = (
            bool(heading)
            and len(heading) <= 90
            and INGEST_SECTION_HEADING_PATTERN.match(heading) is not None
        )
        if not is_heading:
            continue
        following_lines = stripped_lines[position + 1 :]
        found.append(
            {
                # Drop leading "#" markers so only the heading text remains.
                "title": heading.lstrip("#").strip(),
                "excerpt": _find_following_excerpt(following_lines),
            }
        )
    return found
|
||||
|
||||
|
||||
def _find_following_excerpt(lines: list[str]) -> str:
    """Join the body lines that follow a heading into a truncated excerpt.

    Blank lines are skipped. Collection stops at the next heading or once the
    space-joined text reaches ``MAX_INGEST_LOG_TEXT_PREVIEW`` characters.
    """
    pieces: list[str] = []
    # Tracks len(" ".join(pieces)); starting at -1 cancels the separator that
    # is counted for the first piece.
    joined_length = -1
    for candidate in lines:
        if not candidate:
            continue
        if INGEST_SECTION_HEADING_PATTERN.match(candidate):
            break
        pieces.append(candidate)
        joined_length += len(candidate) + 1
        if joined_length >= MAX_INGEST_LOG_TEXT_PREVIEW:
            break
    return _truncate_text(" ".join(pieces), max_length=MAX_INGEST_LOG_TEXT_PREVIEW)
|
||||
|
||||
|
||||
def _normalize_chunk_ids(status_payload: dict[str, Any]) -> list[str]:
|
||||
chunks_list = status_payload.get("chunks_list")
|
||||
if not isinstance(chunks_list, list):
|
||||
return []
|
||||
return [str(item).strip() for item in chunks_list if str(item or "").strip()]
|
||||
|
||||
|
||||
def _resolve_chunk_count(status_payload: dict[str, Any], chunk_ids: list[str]) -> int:
|
||||
try:
|
||||
return int(status_payload.get("chunks_count") or len(chunk_ids))
|
||||
except (TypeError, ValueError):
|
||||
return len(chunk_ids)
|
||||
|
||||
|
||||
def _load_json_file(path: Path) -> dict[str, Any]:
|
||||
try:
|
||||
payload = json.loads(path.read_text(encoding="utf-8"))
|
||||
except (FileNotFoundError, json.JSONDecodeError, OSError):
|
||||
return {}
|
||||
return payload if isinstance(payload, dict) else {}
|
||||
|
||||
|
||||
def _normalize_document_entities(payload: dict[str, Any], document_id: str) -> list[str]:
    """Return the de-duplicated ``entity_names`` stored for ``document_id``.

    Any unexpected shape (payload or document entry not a dict, names not a
    list) results in an empty list.
    """
    if not isinstance(payload, dict):
        return []
    document_payload = payload.get(document_id)
    if not isinstance(document_payload, dict):
        return []
    entity_names = document_payload.get("entity_names")
    if not isinstance(entity_names, list):
        return []
    return _dedupe_text_items(entity_names)
|
||||
|
||||
|
||||
def _normalize_document_relations(
|
||||
payload: dict[str, Any], document_id: str
|
||||
) -> list[dict[str, str]]:
|
||||
document_payload = payload.get(document_id) if isinstance(payload, dict) else {}
|
||||
relation_pairs = (
|
||||
document_payload.get("relation_pairs") if isinstance(document_payload, dict) else []
|
||||
)
|
||||
if not isinstance(relation_pairs, list):
|
||||
return []
|
||||
|
||||
relations: list[dict[str, str]] = []
|
||||
seen: set[tuple[str, str]] = set()
|
||||
for pair in relation_pairs:
|
||||
if not isinstance(pair, (list, tuple)) or len(pair) < 2:
|
||||
continue
|
||||
source = str(pair[0] or "").strip()
|
||||
target = str(pair[1] or "").strip()
|
||||
if not source or not target or (source, target) in seen:
|
||||
continue
|
||||
seen.add((source, target))
|
||||
relations.append({"source": source, "target": target, "type": "关联"})
|
||||
return relations
|
||||
|
||||
|
||||
def _normalize_document_chunks(payload: dict[str, Any], document_id: str) -> list[dict[str, Any]]:
    """Return compact summaries of the chunks whose ``full_doc_id`` matches.

    Each summary carries the chunk id (``_id`` or the mapping key), its order
    index, token count and a one-line text summary. The result is sorted by
    ``(order, id)``.
    """

    def _belongs_to_document(raw_chunk: Any) -> bool:
        if not isinstance(raw_chunk, dict):
            return False
        return str(raw_chunk.get("full_doc_id") or "").strip() == document_id

    summaries: list[dict[str, Any]] = [
        {
            "id": str(raw_chunk.get("_id") or chunk_id).strip(),
            "order": _to_int(raw_chunk.get("chunk_order_index")),
            "tokens": _to_int(raw_chunk.get("tokens")),
            "summary": _build_chunk_summary(str(raw_chunk.get("content") or "").strip()),
        }
        for chunk_id, raw_chunk in payload.items()
        if _belongs_to_document(raw_chunk)
    ]
    summaries.sort(key=lambda summary: (summary["order"], summary["id"]))
    return summaries
|
||||
|
||||
|
||||
def _build_chunk_summary(content: str) -> str:
    """Pick a representative line of ``content`` and truncate it for the log.

    The first non-blank line with at least 12 characters wins; otherwise the
    first non-blank line is used, or an empty string when there is none.
    """
    candidates = [
        stripped
        for stripped in (raw.strip() for raw in str(content or "").splitlines())
        if stripped
    ]
    chosen = ""
    if candidates:
        chosen = candidates[0]
        for candidate in candidates:
            if len(candidate) >= 12:
                chosen = candidate
                break
    return _truncate_text(chosen, max_length=MAX_INGEST_LOG_TEXT_PREVIEW)
|
||||
|
||||
|
||||
def _dedupe_text_items(items: list[Any]) -> list[str]:
|
||||
deduped: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for item in items:
|
||||
text = str(item or "").strip()
|
||||
if not text or text in seen:
|
||||
continue
|
||||
seen.add(text)
|
||||
deduped.append(text)
|
||||
return deduped
|
||||
|
||||
|
||||
def _to_int(value: Any) -> int:
|
||||
try:
|
||||
return int(value or 0)
|
||||
except (TypeError, ValueError):
|
||||
return 0
|
||||
|
||||
|
||||
def _truncate_text(text: str, *, max_length: int) -> str:
|
||||
normalized = " ".join(str(text or "").split()).strip()
|
||||
if len(normalized) <= max_length:
|
||||
return normalized
|
||||
return f"{normalized[: max_length - 3].rstrip()}..."
|
||||
@@ -12,24 +12,15 @@ from sqlalchemy.orm import Session
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging import get_logger
|
||||
from app.db.session import get_session_factory
|
||||
from app.services.knowledge_ingest_log import (
|
||||
build_document_graph_summary,
|
||||
build_ingest_document_summary,
|
||||
build_ingest_status_summary,
|
||||
)
|
||||
from app.services.knowledge_rag_runtime import (
|
||||
DEFAULT_EMBEDDING_TIMEOUT_SECONDS,
|
||||
DEFAULT_LIGHTRAG_QUERY_MODE,
|
||||
DEFAULT_LLM_TIMEOUT_SECONDS,
|
||||
KnowledgeRagError,
|
||||
RuntimeModelConfig,
|
||||
_LightRagRuntime,
|
||||
_build_ali_rerank_request,
|
||||
_build_azure_deployment_base,
|
||||
_build_headers,
|
||||
_ensure_path,
|
||||
_extract_chat_text,
|
||||
_extract_embedding_vectors,
|
||||
_extract_error_message,
|
||||
_extract_rerank_results,
|
||||
_normalize_endpoint,
|
||||
_parse_json_body,
|
||||
_send_json_request,
|
||||
)
|
||||
from app.services.settings import SettingsService
|
||||
|
||||
@@ -76,11 +67,9 @@ STRUCTURED_APPENDIX_LEADING_MARKERS = (
|
||||
"# 结构化表格补充",
|
||||
)
|
||||
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
|
||||
|
||||
|
||||
_runtime_lock = threading.RLock()
|
||||
_runtime_instance: _LightRagRuntime | None = None
|
||||
_runtime_signature: tuple[Any, ...] | None = None
|
||||
_runtime_instances: dict[int, _LightRagRuntime] = {}
|
||||
_runtime_signatures: dict[int, tuple[Any, ...]] = {}
|
||||
|
||||
|
||||
class KnowledgeRagService:
|
||||
@@ -147,7 +136,11 @@ class KnowledgeRagService:
|
||||
"query": normalized_query,
|
||||
"record_count": len(hits),
|
||||
"hits": hits,
|
||||
"references": [str(item.get("code") or "").strip() for item in hits if str(item.get("code") or "").strip()],
|
||||
"references": [
|
||||
str(item.get("code") or "").strip()
|
||||
for item in hits
|
||||
if str(item.get("code") or "").strip()
|
||||
],
|
||||
"raw_references": references,
|
||||
"metadata": raw.get("metadata") if isinstance(raw, dict) else {},
|
||||
"message": f"已从知识库中检索到 {len(hits)} 条相关内容。",
|
||||
@@ -172,6 +165,7 @@ class KnowledgeRagService:
|
||||
)
|
||||
texts: list[str] = []
|
||||
file_paths: list[str] = []
|
||||
document_summaries: list[dict[str, Any]] = []
|
||||
|
||||
runtime = self._get_runtime()
|
||||
existing_statuses = runtime.get_document_statuses(normalized_ids)
|
||||
@@ -182,12 +176,29 @@ class KnowledgeRagService:
|
||||
try:
|
||||
runtime.delete_document(document_id)
|
||||
except Exception as exc:
|
||||
logger.warning("Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc)
|
||||
logger.warning(
|
||||
"Delete existing LightRAG document failed doc_id=%s: %s", document_id, exc
|
||||
)
|
||||
text = knowledge_service.extract_document_text(document_id)
|
||||
raw_text = text
|
||||
if normalization_service is not None:
|
||||
text = normalization_service.build_enriched_text(text)
|
||||
texts.append(text)
|
||||
file_paths.append(str((knowledge_service.library_root / entry["folder"] / entry["stored_name"]).resolve()))
|
||||
file_paths.append(
|
||||
str(
|
||||
(
|
||||
knowledge_service.library_root / entry["folder"] / entry["stored_name"]
|
||||
).resolve()
|
||||
)
|
||||
)
|
||||
document_summaries.append(
|
||||
build_ingest_document_summary(
|
||||
document_id=document_id,
|
||||
entry=entry,
|
||||
raw_text=raw_text,
|
||||
indexed_text=text,
|
||||
)
|
||||
)
|
||||
|
||||
track_id = runtime.insert_documents(
|
||||
texts=texts,
|
||||
@@ -198,10 +209,32 @@ class KnowledgeRagService:
|
||||
statuses = runtime.get_document_statuses(normalized_ids)
|
||||
succeeded_document_ids: list[str] = []
|
||||
failed_documents: list[dict[str, str]] = []
|
||||
summary_by_id = {
|
||||
str(item.get("document_id") or "").strip(): item
|
||||
for item in document_summaries
|
||||
if str(item.get("document_id") or "").strip()
|
||||
}
|
||||
|
||||
for document_id in normalized_ids:
|
||||
status_obj = statuses.get(document_id)
|
||||
status_text = self._status_value(status_obj)
|
||||
status_payload = self._serialize_status(status_obj)
|
||||
workspace = (
|
||||
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
|
||||
or DEFAULT_LIGHTRAG_WORKSPACE
|
||||
)
|
||||
graph_summary = build_document_graph_summary(
|
||||
self.storage_root,
|
||||
workspace=workspace,
|
||||
document_id=document_id,
|
||||
)
|
||||
if document_id in summary_by_id:
|
||||
summary_by_id[document_id].update(
|
||||
build_ingest_status_summary(
|
||||
status_payload=status_payload,
|
||||
graph_summary=graph_summary,
|
||||
)
|
||||
)
|
||||
if self.is_query_ready_status(status_obj):
|
||||
succeeded_document_ids.append(document_id)
|
||||
continue
|
||||
@@ -218,13 +251,18 @@ class KnowledgeRagService:
|
||||
"requested_document_ids": normalized_ids,
|
||||
"succeeded_document_ids": succeeded_document_ids,
|
||||
"failed_documents": failed_documents,
|
||||
"document_summaries": [
|
||||
summary_by_id.get(document_id, {}) for document_id in normalized_ids
|
||||
],
|
||||
"status_snapshot": {
|
||||
document_id: self._serialize_status(status_obj)
|
||||
for document_id, status_obj in statuses.items()
|
||||
},
|
||||
}
|
||||
|
||||
def get_document_status_map(self, document_ids: list[str] | None = None) -> dict[str, dict[str, Any]]:
|
||||
def get_document_status_map(
|
||||
self, document_ids: list[str] | None = None
|
||||
) -> dict[str, dict[str, Any]]:
|
||||
target_ids = [str(item).strip() for item in document_ids or [] if str(item).strip()]
|
||||
if not target_ids:
|
||||
return {}
|
||||
@@ -248,28 +286,32 @@ class KnowledgeRagService:
|
||||
logger.warning("Delete LightRAG document ignored doc_id=%s: %s", normalized_id, exc)
|
||||
|
||||
def _get_runtime(self) -> _LightRagRuntime:
|
||||
global _runtime_instance, _runtime_signature
|
||||
|
||||
signature, runtime_kwargs = self._build_runtime_signature()
|
||||
thread_id = threading.get_ident()
|
||||
with _runtime_lock:
|
||||
if _runtime_instance is not None and _runtime_signature == signature:
|
||||
return _runtime_instance
|
||||
runtime = _runtime_instances.get(thread_id)
|
||||
if runtime is not None and _runtime_signatures.get(thread_id) == signature:
|
||||
return runtime
|
||||
|
||||
if _runtime_instance is not None:
|
||||
if runtime is not None:
|
||||
try:
|
||||
_runtime_instance.finalize()
|
||||
runtime.finalize()
|
||||
except Exception as exc: # pragma: no cover - best effort cleanup
|
||||
logger.warning("Finalize previous LightRAG runtime failed: %s", exc)
|
||||
|
||||
_runtime_instance = _LightRagRuntime(**runtime_kwargs)
|
||||
_runtime_signature = signature
|
||||
return _runtime_instance
|
||||
runtime = _LightRagRuntime(**runtime_kwargs)
|
||||
_runtime_instances[thread_id] = runtime
|
||||
_runtime_signatures[thread_id] = signature
|
||||
return runtime
|
||||
|
||||
def _build_runtime_signature(self) -> tuple[tuple[Any, ...], dict[str, Any]]:
|
||||
configs = self._load_runtime_configs()
|
||||
settings = get_settings()
|
||||
working_dir = (self.storage_root / "knowledge" / ".lightrag").resolve()
|
||||
workspace = os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip() or DEFAULT_LIGHTRAG_WORKSPACE
|
||||
workspace = (
|
||||
os.environ.get("LIGHTRAG_WORKSPACE", DEFAULT_LIGHTRAG_WORKSPACE).strip()
|
||||
or DEFAULT_LIGHTRAG_WORKSPACE
|
||||
)
|
||||
qdrant_url = os.environ.get("QDRANT_URL", "").strip() or _resolve_default_qdrant_url()
|
||||
qdrant_api_key = os.environ.get("QDRANT_API_KEY", "").strip()
|
||||
|
||||
@@ -318,7 +360,9 @@ class KnowledgeRagService:
|
||||
try:
|
||||
settings_service = SettingsService(session)
|
||||
main = self._normalize_runtime_model(settings_service.get_runtime_model_config("main"))
|
||||
embedding = self._normalize_runtime_model(settings_service.get_runtime_model_config("embedding"))
|
||||
embedding = self._normalize_runtime_model(
|
||||
settings_service.get_runtime_model_config("embedding")
|
||||
)
|
||||
try:
|
||||
backup_raw = settings_service.get_runtime_model_config("backup")
|
||||
backup = self._normalize_runtime_model(backup_raw)
|
||||
@@ -405,7 +449,9 @@ class KnowledgeRagService:
|
||||
|
||||
document_id, document_name = _parse_document_identity(file_path)
|
||||
normalized_chunk_id = chunk_id or f"path-{rank}"
|
||||
normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
|
||||
normalized_content = _truncate_text(
|
||||
content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH
|
||||
)
|
||||
excerpt = _build_query_focused_excerpt(
|
||||
normalized_content,
|
||||
query_terms=query_terms,
|
||||
@@ -510,17 +556,14 @@ class KnowledgeRagService:
|
||||
|
||||
|
||||
def shutdown_knowledge_rag_runtime() -> None:
|
||||
global _runtime_instance, _runtime_signature
|
||||
|
||||
with _runtime_lock:
|
||||
if _runtime_instance is None:
|
||||
return
|
||||
try:
|
||||
_runtime_instance.finalize()
|
||||
except Exception as exc: # pragma: no cover - best effort cleanup
|
||||
logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
|
||||
_runtime_instance = None
|
||||
_runtime_signature = None
|
||||
for runtime in list(_runtime_instances.values()):
|
||||
try:
|
||||
runtime.finalize()
|
||||
except Exception as exc: # pragma: no cover - best effort cleanup
|
||||
logger.warning("Finalize LightRAG runtime failed during shutdown: %s", exc)
|
||||
_runtime_instances.clear()
|
||||
_runtime_signatures.clear()
|
||||
|
||||
|
||||
def _parse_document_identity(file_path: str) -> tuple[str, str]:
|
||||
@@ -551,9 +594,7 @@ def _build_query_focused_excerpt(
|
||||
|
||||
lowered = normalized.lower()
|
||||
match_positions = [
|
||||
lowered.find(term)
|
||||
for term in query_terms
|
||||
if term and lowered.find(term) >= 0
|
||||
lowered.find(term) for term in query_terms if term and lowered.find(term) >= 0
|
||||
]
|
||||
if not match_positions:
|
||||
return _build_excerpt(normalized, max_length=max_length)
|
||||
@@ -649,7 +690,9 @@ def _score_knowledge_hit(
|
||||
elif leading_appendix_marker == "# 重点章节摘录":
|
||||
score += 4 if matched_terms else -12
|
||||
elif leading_appendix_marker == "# 问答线索补充":
|
||||
score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
|
||||
score += (
|
||||
8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
|
||||
)
|
||||
elif leading_appendix_marker == "# 结构化表格补充":
|
||||
if prefers_tabular_evidence and matched_terms:
|
||||
score += 16
|
||||
@@ -666,7 +709,11 @@ def _score_knowledge_hit(
|
||||
score += 4
|
||||
if matched_terms and any(marker in content for marker in ("附表", "第", "条")):
|
||||
score += 4
|
||||
if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("第", "条", ":", "-", "•")):
|
||||
if (
|
||||
not prefers_tabular_evidence
|
||||
and matched_terms
|
||||
and any(marker in content for marker in ("第", "条", ":", "-", "•"))
|
||||
):
|
||||
score += 4
|
||||
if title and any(term in title for term in query_terms):
|
||||
score += 6
|
||||
|
||||
@@ -170,6 +170,7 @@ class SemanticOntologyService(
|
||||
entities = self._merge_entities(
|
||||
entities,
|
||||
model_parse.entity_hints if model_parse is not None else [],
|
||||
compact_query,
|
||||
)
|
||||
intent = self._resolve_intent(
|
||||
compact_query,
|
||||
@@ -193,6 +194,11 @@ class SemanticOntologyService(
|
||||
context_json=context_json,
|
||||
)
|
||||
)
|
||||
missing_slots = self._filter_expense_missing_slots(
|
||||
compact_query=compact_query,
|
||||
entities=entities,
|
||||
missing_slots=missing_slots,
|
||||
)
|
||||
relax_knowledge_follow_up = self._should_relax_knowledge_follow_up_clarification(
|
||||
compact_query=compact_query,
|
||||
scenario=scenario,
|
||||
@@ -306,6 +312,45 @@ class SemanticOntologyService(
|
||||
follow_up_markers = ("那", "那么", "这个", "这种", "呢", "的话", "p", "P")
|
||||
return any(marker in compact_query for marker in follow_up_markers)
|
||||
|
||||
@staticmethod
|
||||
def _filter_expense_missing_slots(
|
||||
*,
|
||||
compact_query: str,
|
||||
entities: list[object],
|
||||
missing_slots: list[str],
|
||||
) -> list[str]:
|
||||
expense_types = {
|
||||
str(getattr(item, "normalized_value", "") or getattr(item, "value", "") or "").strip()
|
||||
for item in entities
|
||||
if getattr(item, "type", "") == "expense_type"
|
||||
}
|
||||
has_transport = "transport" in expense_types
|
||||
has_entertainment = "entertainment" in expense_types
|
||||
explicit_entertainment = any(
|
||||
keyword in compact_query
|
||||
for keyword in (
|
||||
"业务招待",
|
||||
"招待费",
|
||||
"招待",
|
||||
"宴请",
|
||||
"请客",
|
||||
"请客户吃饭",
|
||||
"客户吃饭",
|
||||
"客户用餐",
|
||||
"客户餐",
|
||||
"商务接待",
|
||||
"商务宴请",
|
||||
"接待餐",
|
||||
)
|
||||
)
|
||||
if has_transport and not has_entertainment and not explicit_entertainment:
|
||||
return [
|
||||
item
|
||||
for item in missing_slots
|
||||
if item not in {"customer_name", "participants"}
|
||||
]
|
||||
return missing_slots
|
||||
|
||||
def _record_semantic_parse(
|
||||
self,
|
||||
*,
|
||||
|
||||
@@ -37,6 +37,39 @@ from app.services.ontology_rules import (
|
||||
logger = get_logger("app.services.ontology")
|
||||
|
||||
|
||||
# Query substrings that mark an expense as transport-related; used when
# deciding whether a transport entity should win over an entertainment entity.
TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS = (
    "打车", "网约车", "出租车票", "出租车", "的士票",
    "的士", "滴滴", "市内交通", "乘车", "乘车费",
    "用车", "叫车", "车费", "车资", "机场",
)
|
||||
# Query substrings that explicitly describe business entertainment; when any
# of them is present the transport override is not applied.
EXPLICIT_ENTERTAINMENT_KEYWORDS = (
    "业务招待", "招待费", "招待", "宴请",
    "请客", "请客户吃饭", "客户吃饭", "客户用餐",
    "客户餐", "商务接待", "商务宴请", "接待餐",
)
|
||||
|
||||
|
||||
class OntologyDetectionMixin:
|
||||
def _detect_scenario(self, compact_query: str) -> tuple[str, float]:
|
||||
scores = {key: 0.0 for key in SCENARIO_KEYWORDS}
|
||||
@@ -337,6 +370,9 @@ class OntologyDetectionMixin:
|
||||
"出现“客户”不等于应收,出现“供应商”不等于应付,必须结合动作词和业务目标判断。"
|
||||
"只有明确查询、统计、列出、多少、明细、对比时才优先使用 query 或 compare。"
|
||||
"附件名称和 OCR 摘要只作为辅助证据,不能编造未出现的事实。"
|
||||
"如果用户明确提到打车、的士票、出租车票、网约车、乘车费、车费等交通票据,"
|
||||
"即使句子里出现“客户”,也必须优先识别为 transport,不要推断为 entertainment。"
|
||||
"不要输出用户原文未出现、且与规则候选冲突的费用类型。"
|
||||
"信息不足时 clarification_required=true,并给出一句简短中文追问。"
|
||||
"missing_slots 使用简短 snake_case,例如 expense_type, amount, "
|
||||
"customer_name, participants, attachments。"
|
||||
@@ -351,12 +387,12 @@ class OntologyDetectionMixin:
|
||||
' "intent": "draft",\n'
|
||||
' "confidence": 0.88,\n'
|
||||
' "clarification_required": true,\n'
|
||||
' "clarification_question": "请补充客户单位、参与人员和票据附件。",\n'
|
||||
' "missing_slots": ["customer_name", "participants", "attachments"],\n'
|
||||
' "clarification_question": "请补充发生时间、金额和票据附件。",\n'
|
||||
' "missing_slots": ["time_range", "amount", "attachments"],\n'
|
||||
' "ambiguity": [],\n'
|
||||
' "entity_hints": [\n'
|
||||
' {"type": "expense_type", "value": "招待", '
|
||||
'"normalized_value": "entertainment", "role": "filter", '
|
||||
' {"type": "expense_type", "value": "交通费", '
|
||||
'"normalized_value": "transport", "role": "filter", '
|
||||
'"confidence": 0.86}\n'
|
||||
" ]\n"
|
||||
"}"
|
||||
@@ -432,6 +468,7 @@ class OntologyDetectionMixin:
|
||||
def _merge_entities(
|
||||
base_entities: list[OntologyEntity],
|
||||
entity_hints: list[LlmOntologyEntityHint],
|
||||
compact_query: str = "",
|
||||
) -> list[OntologyEntity]:
|
||||
merged: dict[tuple[str, str], OntologyEntity] = {
|
||||
(item.type, item.normalized_value): item for item in base_entities
|
||||
@@ -454,7 +491,36 @@ class OntologyDetectionMixin:
|
||||
if existing is None or existing.confidence < candidate.confidence:
|
||||
merged[key] = candidate
|
||||
|
||||
return list(merged.values())
|
||||
items = list(merged.values())
|
||||
if OntologyDetectionMixin._should_transport_override_entertainment(
|
||||
compact_query,
|
||||
items,
|
||||
):
|
||||
items = [
|
||||
item
|
||||
for item in items
|
||||
if not (
|
||||
item.type == "expense_type"
|
||||
and item.normalized_value == "entertainment"
|
||||
)
|
||||
]
|
||||
return items
|
||||
|
||||
@staticmethod
def _should_transport_override_entertainment(
    compact_query: str,
    entities: list[OntologyEntity],
) -> bool:
    """Tell whether a transport entity should replace an entertainment one.

    True only when both ``transport`` and ``entertainment`` expense types were
    detected, the query contains a transport keyword, and it contains no
    explicit entertainment keyword.
    """
    detected_types: set[str] = set()
    for entity in entities:
        if entity.type == "expense_type":
            detected_types.add(str(entity.normalized_value or entity.value or "").strip())

    if "transport" not in detected_types or "entertainment" not in detected_types:
        return False

    mentions_transport = any(
        keyword in compact_query for keyword in TRANSPORT_EXPENSE_OVERRIDE_KEYWORDS
    )
    if not mentions_transport:
        return False

    # An explicit entertainment phrase means the user really meant it.
    for keyword in EXPLICIT_ENTERTAINMENT_KEYWORDS:
        if keyword in compact_query:
            return False
    return True
|
||||
|
||||
@staticmethod
|
||||
def _normalize_short_text_list(values: list[str]) -> list[str]:
|
||||
|
||||
@@ -59,11 +59,16 @@ class OntologyExtractionMixin:
|
||||
missing_slots.append("attachments")
|
||||
return missing_slots
|
||||
|
||||
if any(
|
||||
has_entertainment_type = any(
|
||||
item.normalized_value == "entertainment"
|
||||
for item in entities
|
||||
if item.type == "expense_type"
|
||||
):
|
||||
)
|
||||
has_explicit_entertainment_text = "客户" in compact_query and any(
|
||||
keyword in compact_query
|
||||
for keyword in ("招待", "接待", "吃饭", "用餐", "宴请", "请客", "客户餐")
|
||||
)
|
||||
if has_entertainment_type or has_explicit_entertainment_text:
|
||||
if "customer" not in entity_types:
|
||||
missing_slots.append("customer_name")
|
||||
missing_slots.append("participants")
|
||||
@@ -171,14 +176,14 @@ class OntologyExtractionMixin:
|
||||
upsert(self._make_entity("expense_type", label, normalized, role="filter"))
|
||||
|
||||
has_customer_entertainment_signal = "客户" in query and any(
|
||||
keyword in query for keyword in ("吃饭", "用餐", "餐饮", "宴请", "请客", "招待")
|
||||
keyword in query for keyword in ("吃饭", "用餐", "餐饮", "宴请", "请客", "招待", "接待")
|
||||
)
|
||||
if has_customer_entertainment_signal:
|
||||
upsert(
|
||||
self._make_entity(
|
||||
"expense_type",
|
||||
"客户招待",
|
||||
"entertainment",
|
||||
"业务招待费",
|
||||
"meal",
|
||||
role="filter",
|
||||
confidence=0.96,
|
||||
)
|
||||
@@ -189,46 +194,52 @@ class OntologyExtractionMixin:
|
||||
for keyword in (
|
||||
"打车",
|
||||
"网约车",
|
||||
"出租车",
|
||||
"出租车票",
|
||||
"出租车",
|
||||
"车费",
|
||||
"乘车",
|
||||
"用车",
|
||||
"叫车",
|
||||
"车资",
|
||||
"的士",
|
||||
"的士票",
|
||||
"的士",
|
||||
"滴滴",
|
||||
"市内交通",
|
||||
"地铁",
|
||||
"公交",
|
||||
"停车费",
|
||||
"过路费",
|
||||
"通行费",
|
||||
"高速费",
|
||||
)
|
||||
):
|
||||
upsert(self._make_entity("expense_type", "交通", "transport", role="filter", confidence=0.9))
|
||||
|
||||
if any(keyword in query for keyword in ("出差", "机票", "火车", "高铁", "行程单")):
|
||||
if any(keyword in query for keyword in ("出差", "机票", "飞机票", "航班", "火车票", "火车", "高铁票", "高铁", "动车", "行程单")):
|
||||
upsert(self._make_entity("expense_type", "差旅", "travel", role="filter", confidence=0.88))
|
||||
|
||||
if any(keyword in query for keyword in ("酒店", "住宿", "宾馆")):
|
||||
if any(keyword in query for keyword in ("酒店", "酒店发票", "住宿", "住宿费", "宾馆", "民宿", "房费", "客房")):
|
||||
upsert(self._make_entity("expense_type", "住宿", "hotel", role="filter", confidence=0.86))
|
||||
|
||||
if (
|
||||
not has_customer_entertainment_signal
|
||||
and any(keyword in query for keyword in ("餐费", "用餐", "午餐", "晚餐", "早餐", "餐饮"))
|
||||
):
|
||||
upsert(self._make_entity("expense_type", "餐费", "meal", role="filter", confidence=0.84))
|
||||
upsert(self._make_entity("expense_type", "业务招待费", "meal", role="filter", confidence=0.84))
|
||||
|
||||
if any(
|
||||
keyword in query
|
||||
for keyword in ("办公用品", "文具", "耗材", "办公耗材", "打印纸", "办公设备", "键盘", "鼠标", "白板")
|
||||
for keyword in ("办公用品", "文具", "耗材", "办公耗材", "打印纸", "办公设备", "键盘", "鼠标", "白板", "硒鼓", "墨盒")
|
||||
):
|
||||
upsert(self._make_entity("expense_type", "办公费", "office", role="filter", confidence=0.87))
|
||||
upsert(self._make_entity("expense_type", "办公用品费", "office", role="filter", confidence=0.87))
|
||||
|
||||
if any(keyword in query for keyword in ("培训", "讲师费", "课时费", "课程费")):
|
||||
if any(keyword in query for keyword in ("培训", "讲师费", "课时费", "课程费", "教材", "认证费", "考试费")):
|
||||
upsert(self._make_entity("expense_type", "培训费", "training", role="filter", confidence=0.84))
|
||||
|
||||
if any(keyword in query for keyword in ("通讯费", "话费", "流量费", "宽带费")):
|
||||
if any(keyword in query for keyword in ("通讯费", "话费", "电话费", "手机费", "流量费", "宽带费", "网络费")):
|
||||
upsert(self._make_entity("expense_type", "通讯费", "communication", role="filter", confidence=0.84))
|
||||
|
||||
if any(keyword in query for keyword in ("福利费", "团建", "慰问", "节日福利", "体检费")):
|
||||
if any(keyword in query for keyword in ("福利费", "团建", "慰问", "节日福利", "体检费", "员工关怀")):
|
||||
upsert(self._make_entity("expense_type", "福利费", "welfare", role="filter", confidence=0.84))
|
||||
|
||||
for amount in self._extract_amount_entities(query):
|
||||
|
||||
@@ -6,6 +6,7 @@ from dataclasses import dataclass
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
from app.schemas.ontology import OntologyIntent, OntologyScenario
|
||||
from app.services.expense_type_keywords import build_expense_type_keyword_map
|
||||
|
||||
DATE_RANGE_PATTERN = re.compile(
|
||||
r"(?P<start>\d{4}-\d{1,2}-\d{1,2})\s*(?:到|至|~|-)\s*(?P<end>\d{4}-\d{1,2}-\d{1,2})"
|
||||
@@ -128,44 +129,7 @@ OPERATE_KEYWORDS = (
|
||||
"删除",
|
||||
)
|
||||
|
||||
EXPENSE_TYPE_KEYWORDS = {
|
||||
"差旅": "travel",
|
||||
"出差": "travel",
|
||||
"住宿": "hotel",
|
||||
"酒店": "hotel",
|
||||
"交通": "transport",
|
||||
"打车": "transport",
|
||||
"网约车": "transport",
|
||||
"出租车": "transport",
|
||||
"出租车票": "transport",
|
||||
"乘车": "transport",
|
||||
"乘车费": "transport",
|
||||
"用车": "transport",
|
||||
"叫车": "transport",
|
||||
"车资": "transport",
|
||||
"的士": "transport",
|
||||
"的士票": "transport",
|
||||
"停车费": "transport",
|
||||
"餐费": "meal",
|
||||
"用餐": "meal",
|
||||
"会务": "meeting",
|
||||
"招待费": "entertainment",
|
||||
"招待": "entertainment",
|
||||
"宴请": "entertainment",
|
||||
"办公费": "office",
|
||||
"办公用品": "office",
|
||||
"文具": "office",
|
||||
"耗材": "office",
|
||||
"办公耗材": "office",
|
||||
"打印纸": "office",
|
||||
"办公设备": "office",
|
||||
"培训费": "training",
|
||||
"培训": "training",
|
||||
"通讯费": "communication",
|
||||
"话费": "communication",
|
||||
"福利费": "welfare",
|
||||
"团建": "welfare",
|
||||
}
|
||||
EXPENSE_TYPE_KEYWORDS = build_expense_type_keyword_map()
|
||||
|
||||
EXPENSE_NARRATIVE_KEYWORDS = (
|
||||
"报销",
|
||||
|
||||
@@ -74,16 +74,16 @@ EXPENSE_RISK_LEVEL_LABELS = {
|
||||
"medium": "中风险",
|
||||
"warning": "中风险",
|
||||
"low": "低风险",
|
||||
"info": "低风险",
|
||||
"info": "提示",
|
||||
}
|
||||
EXPENSE_TYPE_LABELS = {
|
||||
"travel": "差旅费",
|
||||
"hotel": "住宿费",
|
||||
"transport": "交通费",
|
||||
"meal": "餐费",
|
||||
"meal": "业务招待费",
|
||||
"meeting": "会务费",
|
||||
"entertainment": "业务招待费",
|
||||
"office": "办公费",
|
||||
"office": "办公用品费",
|
||||
"training": "培训费",
|
||||
"communication": "通讯费",
|
||||
"welfare": "福利费",
|
||||
|
||||
@@ -35,10 +35,10 @@ EXPENSE_TYPE_LABELS = {
|
||||
"travel": "差旅费",
|
||||
"hotel": "住宿费",
|
||||
"transport": "交通费",
|
||||
"meal": "餐费",
|
||||
"meal": "业务招待费",
|
||||
"meeting": "会务费",
|
||||
"entertainment": "业务招待费",
|
||||
"office": "办公费",
|
||||
"office": "办公用品费",
|
||||
"training": "培训费",
|
||||
"communication": "通讯费",
|
||||
"welfare": "福利费",
|
||||
@@ -48,10 +48,10 @@ EXPENSE_TYPE_LABELS = {
|
||||
GROUP_SCENE_LABELS = {
|
||||
"travel": "差旅费",
|
||||
"entertainment": "业务招待费",
|
||||
"meal": "伙食费",
|
||||
"meal": "业务招待费",
|
||||
"transport": "交通费",
|
||||
"hotel": "住宿费",
|
||||
"office": "办公费",
|
||||
"office": "办公用品费",
|
||||
"training": "培训费",
|
||||
"communication": "通讯费",
|
||||
"welfare": "福利费",
|
||||
@@ -62,8 +62,12 @@ EXPENSE_SCENE_SELECTION_OPTIONS = (
|
||||
("travel", "差旅费", "出差、长途交通、住宿、差旅补贴等场景。"),
|
||||
("transport", "交通费", "市内打车、停车、过路费等日常交通场景。"),
|
||||
("hotel", "住宿费", "单独住宿、酒店发票等场景。"),
|
||||
("entertainment", "业务招待费", "客户接待、宴请、招待等场景。"),
|
||||
("office", "办公费", "办公用品、耗材、办公设备等采购场景。"),
|
||||
("meal", "业务招待费", "客户接待、工作餐、加班餐、餐饮票据等场景。"),
|
||||
("meeting", "会务费", "会议、论坛、会场、参会等场景。"),
|
||||
("office", "办公用品费", "办公用品、耗材、办公设备等采购场景。"),
|
||||
("training", "培训费", "培训课程、讲师费、教材、认证等场景。"),
|
||||
("communication", "通讯费", "话费、流量、宽带、网络等场景。"),
|
||||
("welfare", "福利费", "团建、体检、慰问、节日福利等场景。"),
|
||||
("other", "其他费用", "暂不属于以上分类的报销场景。"),
|
||||
)
|
||||
|
||||
@@ -130,10 +134,10 @@ INFERRED_REASON_LABELS = {
|
||||
"travel": "出差行程",
|
||||
"hotel": "住宿报销",
|
||||
"transport": "交通出行",
|
||||
"meal": "餐饮用餐",
|
||||
"meal": "业务招待",
|
||||
"meeting": "会务活动",
|
||||
"entertainment": "客户接待",
|
||||
"office": "办公采购",
|
||||
"office": "办公用品采购",
|
||||
"training": "培训学习",
|
||||
"communication": "通讯使用",
|
||||
"welfare": "员工福利",
|
||||
|
||||
@@ -9,16 +9,32 @@ from app.schemas.user_agent import UserAgentRequest, UserAgentReviewDocumentCard
|
||||
DEFAULT_GROUP_SCENE_LABELS = {
|
||||
"travel": "差旅费",
|
||||
"entertainment": "业务招待费",
|
||||
"meal": "伙食费",
|
||||
"meal": "业务招待费",
|
||||
"transport": "交通费",
|
||||
"hotel": "住宿费",
|
||||
"office": "办公费",
|
||||
"office": "办公用品费",
|
||||
"training": "培训费",
|
||||
"communication": "通讯费",
|
||||
"welfare": "福利费",
|
||||
"other": "其他费用",
|
||||
}
|
||||
|
||||
DOCUMENT_SCENE_LABELS = {
|
||||
"flight_itinerary": "机票/航班行程单",
|
||||
"train_ticket": "火车/高铁票",
|
||||
"ship_ticket": "轮船票",
|
||||
"travel_ticket": "交通出行票据",
|
||||
"hotel_invoice": "酒店住宿票据",
|
||||
"taxi_receipt": "出租车/网约车票据",
|
||||
"transport_receipt": "乘车票据",
|
||||
"parking_toll_receipt": "停车/通行费票据",
|
||||
"meal_receipt": "餐饮发票",
|
||||
"office_invoice": "文具/办公用品发票",
|
||||
"meeting_invoice": "会议/会务票据",
|
||||
"training_invoice": "培训票据",
|
||||
"other": "其他票据",
|
||||
}
|
||||
|
||||
DOCUMENT_DATE_TEXT_PATTERN = re.compile(
|
||||
r"(\d{4}[年/-]\d{1,2}[月/-]\d{1,2}日?(?:\s*[T ]?\s*(?:[01]?\d|2[0-3])[::][0-5]\d)?)"
|
||||
)
|
||||
@@ -48,55 +64,55 @@ class UserAgentDocumentService:
|
||||
provided_type = str(item.get("document_type") or "").strip().lower()
|
||||
normalized_expense_type = str(expense_type_code or "").strip().lower()
|
||||
if provided_type:
|
||||
if provided_type in {"flight_itinerary", "train_ticket"}:
|
||||
if provided_type in {"flight_itinerary", "train_ticket", "ship_ticket"}:
|
||||
return {
|
||||
"document_type": provided_type,
|
||||
"expense_type": "travel",
|
||||
"group_code": "travel",
|
||||
"scene_label": "差旅票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS.get(provided_type, "交通出行票据"),
|
||||
}
|
||||
if provided_type == "hotel_invoice":
|
||||
return {
|
||||
"document_type": provided_type,
|
||||
"expense_type": "hotel",
|
||||
"group_code": "travel",
|
||||
"scene_label": "住宿票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["hotel_invoice"],
|
||||
}
|
||||
if provided_type in {"taxi_receipt", "parking_toll_receipt"}:
|
||||
if provided_type in {"taxi_receipt", "transport_receipt", "parking_toll_receipt"}:
|
||||
return {
|
||||
"document_type": provided_type,
|
||||
"expense_type": "transport",
|
||||
"group_code": "travel",
|
||||
"scene_label": "交通票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS.get(provided_type, "乘车票据"),
|
||||
}
|
||||
if provided_type == "meal_receipt":
|
||||
group_code = "entertainment" if normalized_expense_type == "entertainment" or has_customer else "meal"
|
||||
group_code = "meal"
|
||||
return {
|
||||
"document_type": provided_type,
|
||||
"expense_type": group_code,
|
||||
"group_code": group_code,
|
||||
"scene_label": "餐饮票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["meal_receipt"],
|
||||
}
|
||||
if provided_type == "office_invoice":
|
||||
return {
|
||||
"document_type": provided_type,
|
||||
"expense_type": "office",
|
||||
"group_code": "office",
|
||||
"scene_label": "办公用品票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["office_invoice"],
|
||||
}
|
||||
if provided_type == "meeting_invoice":
|
||||
return {
|
||||
"document_type": provided_type,
|
||||
"expense_type": "meeting",
|
||||
"group_code": "meeting",
|
||||
"scene_label": "会务票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["meeting_invoice"],
|
||||
}
|
||||
if provided_type == "training_invoice":
|
||||
return {
|
||||
"document_type": provided_type,
|
||||
"expense_type": "training",
|
||||
"group_code": "training",
|
||||
"scene_label": "培训票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["training_invoice"],
|
||||
}
|
||||
|
||||
text = " ".join(
|
||||
@@ -108,41 +124,69 @@ class UserAgentDocumentService:
|
||||
).lower()
|
||||
compact = text.replace(" ", "")
|
||||
|
||||
if any(keyword in compact for keyword in ("机票", "航班", "火车", "高铁", "行程单")):
|
||||
if any(keyword in compact for keyword in ("火车", "高铁", "动车", "铁路", "车次")):
|
||||
return {
|
||||
"document_type": "travel_ticket",
|
||||
"document_type": "train_ticket",
|
||||
"expense_type": "travel",
|
||||
"group_code": "travel",
|
||||
"scene_label": "差旅票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["train_ticket"],
|
||||
}
|
||||
if any(keyword in compact for keyword in ("过路费", "停车", "通行费", "收费站")):
|
||||
return {
|
||||
"document_type": "parking_toll_receipt",
|
||||
"expense_type": "transport",
|
||||
"group_code": "travel",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["parking_toll_receipt"],
|
||||
}
|
||||
if any(keyword in compact for keyword in ("打车", "出租车", "滴滴", "网约车", "叫车", "车费", "车资", "的士")):
|
||||
return {
|
||||
"document_type": "taxi_receipt",
|
||||
"expense_type": "transport",
|
||||
"group_code": "travel",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["taxi_receipt"],
|
||||
}
|
||||
if any(keyword in compact for keyword in ("乘车", "用车")):
|
||||
return {
|
||||
"document_type": "transport_receipt",
|
||||
"expense_type": "transport",
|
||||
"group_code": "travel",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["transport_receipt"],
|
||||
}
|
||||
if any(keyword in compact for keyword in ("机票", "航班", "登机", "航空", "客票")):
|
||||
return {
|
||||
"document_type": "flight_itinerary",
|
||||
"expense_type": "travel",
|
||||
"group_code": "travel",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["flight_itinerary"],
|
||||
}
|
||||
if any(keyword in compact for keyword in ("轮船", "船票", "客轮", "渡轮", "航运")):
|
||||
return {
|
||||
"document_type": "ship_ticket",
|
||||
"expense_type": "travel",
|
||||
"group_code": "travel",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["ship_ticket"],
|
||||
}
|
||||
if any(keyword in compact for keyword in ("酒店", "住宿", "宾馆")):
|
||||
return {
|
||||
"document_type": "hotel_invoice",
|
||||
"expense_type": "hotel",
|
||||
"group_code": "travel",
|
||||
"scene_label": "住宿票据",
|
||||
}
|
||||
if any(keyword in compact for keyword in ("打车", "出租车", "滴滴", "网约车", "乘车", "用车", "叫车", "车费", "车资", "的士", "过路费", "停车")):
|
||||
return {
|
||||
"document_type": "transport_receipt",
|
||||
"expense_type": "transport",
|
||||
"group_code": "travel",
|
||||
"scene_label": "交通票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["hotel_invoice"],
|
||||
}
|
||||
if any(keyword in compact for keyword in ("餐", "饭店", "酒楼", "酒家", "餐饮", "meal")):
|
||||
group_code = "entertainment" if normalized_expense_type == "entertainment" or has_customer else "meal"
|
||||
group_code = "meal"
|
||||
return {
|
||||
"document_type": "meal_receipt",
|
||||
"expense_type": group_code,
|
||||
"group_code": group_code,
|
||||
"scene_label": "餐饮票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["meal_receipt"],
|
||||
}
|
||||
if any(keyword in compact for keyword in ("办公用品", "文具", "耗材", "办公耗材", "打印纸", "键盘", "鼠标", "白板", "墨盒", "硒鼓")):
|
||||
return {
|
||||
"document_type": "other",
|
||||
"document_type": "office_invoice",
|
||||
"expense_type": "office",
|
||||
"group_code": "office",
|
||||
"scene_label": "办公用品票据",
|
||||
"scene_label": DOCUMENT_SCENE_LABELS["office_invoice"],
|
||||
}
|
||||
return {
|
||||
"document_type": "other",
|
||||
|
||||
@@ -314,10 +314,7 @@ class UserAgentReviewCoreMixin:
|
||||
filename=str(item.get("filename") or f"document-{index}"),
|
||||
document_type=classified["document_type"],
|
||||
suggested_expense_type=classified["expense_type"],
|
||||
scene_label=GROUP_SCENE_LABELS.get(
|
||||
classified["group_code"],
|
||||
classified["scene_label"],
|
||||
),
|
||||
scene_label=self._resolve_review_document_scene_label(item, classified),
|
||||
summary=str(item.get("summary") or item.get("text") or "").strip(),
|
||||
avg_score=float(item.get("avg_score") or 0.0),
|
||||
preview_kind=str(item.get("preview_kind") or "").strip(),
|
||||
@@ -338,6 +335,25 @@ class UserAgentReviewCoreMixin:
|
||||
return cards
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _resolve_review_document_scene_label(item: dict[str, object], classified: dict[str, str]) -> str:
|
||||
provided_label = str(item.get("document_type_label") or "").strip()
|
||||
if provided_label and provided_label != "其他单据":
|
||||
return provided_label
|
||||
|
||||
classified_scene_label = str(classified.get("scene_label") or "").strip()
|
||||
if classified_scene_label:
|
||||
return classified_scene_label
|
||||
|
||||
document_type = str(classified.get("document_type") or item.get("document_type") or "").strip()
|
||||
document_type_label = resolve_document_type_label(document_type)
|
||||
if document_type_label and document_type_label not in {"其他单据", document_type}:
|
||||
return document_type_label
|
||||
|
||||
scene_label = str(item.get("scene_label") or "").strip()
|
||||
return scene_label or "其他票据"
|
||||
|
||||
|
||||
def _build_review_claim_groups(
|
||||
self,
|
||||
payload: UserAgentRequest,
|
||||
|
||||
@@ -59,6 +59,20 @@ class UserAgentReviewProfileMixin:
|
||||
manager_name = self._resolve_manager_name(employee)
|
||||
reason = slot_map.get("reason").value if slot_map.get("reason") else ""
|
||||
attachments = "、".join(self._resolve_attachment_names(payload))
|
||||
expense_type_code = str(slot_map.get("expense_type").normalized_value if slot_map.get("expense_type") else "").strip()
|
||||
customer_name = str(slot_map.get("customer_name").value if slot_map.get("customer_name") else "").strip()
|
||||
merchant_name = str(slot_map.get("merchant_name").value if slot_map.get("merchant_name") else "").strip()
|
||||
participants = str(slot_map.get("participants").value if slot_map.get("participants") else "").strip()
|
||||
customer_slot = slot_map.get("customer_name")
|
||||
participants_slot = slot_map.get("participants")
|
||||
customer_required = bool(
|
||||
customer_slot
|
||||
and (customer_slot.required or customer_slot.status == "missing")
|
||||
)
|
||||
participants_required = bool(
|
||||
participants_slot
|
||||
and (participants_slot.required or participants_slot.status == "missing")
|
||||
)
|
||||
|
||||
fields = [
|
||||
UserAgentReviewEditField(
|
||||
@@ -98,13 +112,20 @@ class UserAgentReviewProfileMixin:
|
||||
required=False,
|
||||
group="basic",
|
||||
),
|
||||
UserAgentReviewEditField(
|
||||
key="customer_name",
|
||||
label="客户名称",
|
||||
value=slot_map.get("customer_name").value if slot_map.get("customer_name") else "",
|
||||
placeholder="请输入客户名称",
|
||||
group="business",
|
||||
),
|
||||
]
|
||||
|
||||
if expense_type_code == "entertainment" or customer_required or customer_name:
|
||||
fields.append(
|
||||
UserAgentReviewEditField(
|
||||
key="customer_name",
|
||||
label="客户名称",
|
||||
value=customer_name,
|
||||
placeholder="请输入客户名称",
|
||||
group="business",
|
||||
)
|
||||
)
|
||||
|
||||
fields.append(
|
||||
UserAgentReviewEditField(
|
||||
key="business_location",
|
||||
label="业务地点",
|
||||
@@ -112,15 +133,22 @@ class UserAgentReviewProfileMixin:
|
||||
placeholder="例如:北京 / 客户现场",
|
||||
required=False,
|
||||
group="business",
|
||||
),
|
||||
UserAgentReviewEditField(
|
||||
key="merchant_name",
|
||||
label="酒店/商户",
|
||||
value=slot_map.get("merchant_name").value if slot_map.get("merchant_name") else "",
|
||||
placeholder="请输入酒店或商户名称",
|
||||
required=False,
|
||||
group="business",
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
if expense_type_code == "hotel" or merchant_name:
|
||||
fields.append(
|
||||
UserAgentReviewEditField(
|
||||
key="merchant_name",
|
||||
label="酒店/商户",
|
||||
value=merchant_name,
|
||||
placeholder="请输入酒店或商户名称",
|
||||
required=False,
|
||||
group="business",
|
||||
)
|
||||
)
|
||||
|
||||
fields.extend([
|
||||
UserAgentReviewEditField(
|
||||
key="amount",
|
||||
label="金额",
|
||||
@@ -128,13 +156,20 @@ class UserAgentReviewProfileMixin:
|
||||
placeholder="例如:200.00元",
|
||||
group="business",
|
||||
),
|
||||
UserAgentReviewEditField(
|
||||
key="participants",
|
||||
label="参与人员",
|
||||
value=slot_map.get("participants").value if slot_map.get("participants") else "",
|
||||
placeholder="例如:客户 2 人,我方 1 人",
|
||||
group="business",
|
||||
),
|
||||
])
|
||||
|
||||
if expense_type_code == "entertainment" or participants_required or participants:
|
||||
fields.append(
|
||||
UserAgentReviewEditField(
|
||||
key="participants",
|
||||
label="参与人员",
|
||||
value=participants,
|
||||
placeholder="例如:客户 2 人,我方 1 人",
|
||||
group="business",
|
||||
)
|
||||
)
|
||||
|
||||
fields.extend([
|
||||
UserAgentReviewEditField(
|
||||
key="reason",
|
||||
label="事由",
|
||||
@@ -152,7 +187,7 @@ class UserAgentReviewProfileMixin:
|
||||
field_type="textarea",
|
||||
group="attachments",
|
||||
),
|
||||
]
|
||||
])
|
||||
return fields
|
||||
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ from app.services.expense_claims import ExpenseClaimService
|
||||
from app.services.expense_rule_runtime import ExpenseRuleRuntimeService, RuntimeTravelPolicy, resolve_document_type_label
|
||||
from app.services.risk_ontology_bridge import resolve_rule_codes_for_risk_check
|
||||
from app.services.travel_reimbursement_calculator import TravelReimbursementCalculatorService
|
||||
from app.services.expense_type_keywords import resolve_expense_type_label_from_text
|
||||
from app.services.user_agent_constants import *
|
||||
|
||||
|
||||
@@ -568,27 +569,9 @@ class UserAgentReviewSlotMixin:
|
||||
|
||||
@staticmethod
|
||||
def _normalize_expense_type_input(value: str) -> tuple[str, str]:
|
||||
compact = str(value or "").replace(" ", "")
|
||||
if "招待" in compact or ("客户" in compact and any(keyword in compact for keyword in ("吃饭", "用餐", "宴请", "请客"))):
|
||||
return "entertainment", "业务招待费"
|
||||
if any(keyword in compact for keyword in ("差旅", "出差", "机票", "行程")):
|
||||
return "travel", "差旅费"
|
||||
if any(keyword in compact for keyword in ("住宿", "酒店", "宾馆")):
|
||||
return "hotel", "住宿费"
|
||||
if any(keyword in compact for keyword in ("交通", "打车", "网约车", "出租车", "乘车", "用车", "叫车", "车费", "车资", "的士", "停车")):
|
||||
return "transport", "交通费"
|
||||
if any(keyword in compact for keyword in ("餐费", "用餐", "午餐", "晚餐", "早餐", "伙食")):
|
||||
return "meal", "餐费"
|
||||
if "会务" in compact:
|
||||
return "meeting", "会务费"
|
||||
if any(keyword in compact for keyword in ("办公费", "办公用品", "文具", "耗材", "办公耗材", "打印纸", "办公设备", "键盘", "鼠标", "白板")):
|
||||
return "office", "办公费"
|
||||
if any(keyword in compact for keyword in ("培训费", "培训", "讲师费", "课时费", "课程费")):
|
||||
return "training", "培训费"
|
||||
if any(keyword in compact for keyword in ("通讯费", "话费", "流量费", "宽带费")):
|
||||
return "communication", "通讯费"
|
||||
if any(keyword in compact for keyword in ("福利费", "团建", "慰问", "节日福利", "体检费")):
|
||||
return "welfare", "福利费"
|
||||
resolved = resolve_expense_type_label_from_text(value)
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
return "other", str(value or "").strip() or "其他费用"
|
||||
|
||||
|
||||
|
||||
@@ -137,14 +137,13 @@ class UserAgentReviewTravelPolicyMixin:
|
||||
continue
|
||||
night_count = self._extract_review_hotel_night_count(card)
|
||||
nightly_amount = (amount / Decimal(max(night_count, 1))).quantize(Decimal("0.01"))
|
||||
if nightly_amount <= cap:
|
||||
continue
|
||||
amount_measurement_lines.append(
|
||||
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元,"
|
||||
f"按 {night_count} 晚折算 {nightly_amount:.2f} 元/晚;"
|
||||
f"适用标准为 {band_label}{city_tier_label} {cap:.2f} 元/晚,"
|
||||
f"{'超出标准' if nightly_amount > cap else '测算通过'}。"
|
||||
f"适用标准为 {band_label}{city_tier_label} {cap:.2f} 元/晚,超出标准。"
|
||||
)
|
||||
if nightly_amount <= cap:
|
||||
continue
|
||||
|
||||
basis = (
|
||||
f"依据《{standard_rule_name}》({standard_rule_version}),{band_label} 在{city_tier_label}"
|
||||
@@ -200,12 +199,11 @@ class UserAgentReviewTravelPolicyMixin:
|
||||
)
|
||||
continue
|
||||
|
||||
amount_measurement_lines.append(
|
||||
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元;"
|
||||
f"适用《{standard_rule_name}》{region_label}伙食补助标准 {standard_amount:.2f} 元/天,"
|
||||
f"{'超出标准' if amount > standard_amount else '测算通过'}。"
|
||||
)
|
||||
if amount > standard_amount:
|
||||
amount_measurement_lines.append(
|
||||
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元;"
|
||||
f"适用《{standard_rule_name}》{region_label}伙食补助标准 {standard_amount:.2f} 元/天,超出标准。"
|
||||
)
|
||||
append_once(
|
||||
f"travel-meal-allowance-over-limit-{card.index}",
|
||||
UserAgentReviewRiskBrief(
|
||||
@@ -251,13 +249,6 @@ class UserAgentReviewTravelPolicyMixin:
|
||||
)
|
||||
continue
|
||||
|
||||
if standard_amount is not None:
|
||||
amount_measurement_lines.append(
|
||||
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元;"
|
||||
f"适用《{scene_policy.rule_name}》{metric_label}标准 {standard_amount:.2f} 元,"
|
||||
f"{'超出标准' if amount > standard_amount else '测算通过'}。"
|
||||
)
|
||||
|
||||
amount_risk = self._evaluate_review_scene_amount(
|
||||
amount=amount,
|
||||
limit_config=scene_limit,
|
||||
@@ -265,6 +256,11 @@ class UserAgentReviewTravelPolicyMixin:
|
||||
)
|
||||
if amount_risk is not None:
|
||||
severity, threshold = amount_risk
|
||||
if standard_amount is not None:
|
||||
amount_measurement_lines.append(
|
||||
f"{card.filename}:识别为{document_type_label},金额 {amount:.2f} 元;"
|
||||
f"适用《{scene_policy.rule_name}》{metric_label}标准 {standard_amount:.2f} 元,超出标准。"
|
||||
)
|
||||
append_once(
|
||||
f"{scene_code}-amount-over-limit-{card.index}",
|
||||
UserAgentReviewRiskBrief(
|
||||
@@ -348,11 +344,11 @@ class UserAgentReviewTravelPolicyMixin:
|
||||
briefs.insert(
|
||||
0,
|
||||
UserAgentReviewRiskBrief(
|
||||
title="附件金额测算结果",
|
||||
level="info",
|
||||
content="系统已根据首轮上传附件识别金额,并匹配当前可执行的报销标准进行测算。",
|
||||
title="附件金额测算异常",
|
||||
level="warning",
|
||||
content="系统根据首轮上传附件识别金额后,发现有需要进一步核查或说明的测算结果。",
|
||||
detail=";".join(dict.fromkeys(amount_measurement_lines)),
|
||||
suggestion="如测算结果超标,请补充超标说明、调整金额或更正票据类型后再继续。",
|
||||
suggestion="请补充超标说明、调整金额或更正票据类型后再继续。",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user