feat: 重构知识库系统,移除Hermes集成,增强RAG和同步功能

主要变更:
- 移除Hermes智能体及相关回调服务
- 新增知识库RAG、同步、调度、规范化和索引任务服务
- 重构orchestrator服务,增强运行时聊天功能
- 更新前端聊天、政策制度、设置等页面样式和逻辑
- 更新expense_claims和document_intelligence服务
- 删除llm_wiki相关服务和测试文件
- 更新docker-compose配置和启动脚本
This commit is contained in:
caoxiaozhu
2026-05-17 08:38:41 +00:00
parent 212c935308
commit 68f663f2f4
308 changed files with 83729 additions and 13588 deletions

View File

@@ -9,8 +9,6 @@ from typing import Any
from pydantic import BaseModel, Field, ValidationError
from sqlalchemy.orm import Session
from app.services.runtime_chat import RuntimeChatService
@dataclass(frozen=True, slots=True)
class DocumentField:
@@ -198,7 +196,7 @@ MERCHANT_PATTERNS = (
class DocumentIntelligenceService:
# Constructor: takes an optional SQLAlchemy session (`db`), which defaults to None.
# NOTE(review): this text appears to come from a diff view with the +/- markers
# and indentation stripped; the two assignments below may be the before/after
# versions of a single statement rather than two live lines — confirm against
# the real file before relying on both attributes existing.
def __init__(self, db: Session | None = None) -> None:
# A RuntimeChatService is built only when a session is supplied; otherwise the
# attribute is None.
self.runtime_chat_service = RuntimeChatService(db) if db is not None else None
# Keeps the (possibly None) session on the instance.
self.db = db
def build_document_insight(
self,
@@ -254,95 +252,6 @@ class DocumentIntelligenceService:
rule_insight: DocumentInsight,
fields: tuple[DocumentField, ...],
) -> tuple[str, LlmDocumentClassification] | None:
if self.runtime_chat_service is None:
return None
trimmed_text = text.strip()
if not trimmed_text and not summary.strip():
return None
facts = {
"filename": filename,
"summary": summary[:300],
"ocr_text_excerpt": trimmed_text[:2000],
"rule_candidate": {
"document_type": rule_insight.document_type,
"document_type_label": rule_insight.document_type_label,
"scene_code": rule_insight.scene_code,
"scene_label": rule_insight.scene_label,
"expense_type": rule_insight.expense_type,
"confidence": round(rule_insight.classification_confidence, 2),
"evidence": list(rule_insight.evidence),
},
"extracted_fields": [
{"key": field.key, "label": field.label, "value": field.value}
for field in fields
],
"allowed_document_types": list(SUPPORTED_DOCUMENT_TYPES),
}
system_prompt = (
"你是企业报销票据识别复核器。"
"你的任务不是 OCR而是在已有 OCR 文本和票据预览基础上判断票据类型,并尽量复核关键字段。"
"只输出 JSON 对象,不要输出 Markdown、解释或代码块。"
"document_type 只能是:"
f"{', '.join(SUPPORTED_DOCUMENT_TYPES)}"
"如果证据不足,返回 other。"
"严禁编造 OCR 中不存在的商户、酒店、航司、路线或金额。"
"如果 OCR 出现冲突碎片,应优先依据票据主体信息,而不是单个噪声词。"
"例如滴滴行程单/网约车发票,即使 OCR 混入酒店名称,也不能直接判成酒店票据。"
"如果能从 OCR 或图片中明确确认字段,可在 fields 中返回。"
"fields 只允许包含 key, label, valuekey 只能是 amount, date, merchant_name, invoice_number, "
"invoice_code, trip_no, route。无法确认就不要返回该字段。"
"输出字段document_type, scene_code, scene_label, expense_type, confidence, evidence, fields。"
)
user_prompt = (
"请根据以下票据事实给出最终分类 JSON\n"
f"{json.dumps(facts, ensure_ascii=False, indent=2)}\n\n"
"示例输出:\n"
"{\n"
' "document_type": "taxi_receipt",\n'
' "scene_code": "transport",\n'
' "scene_label": "交通票据",\n'
' "expense_type": "transport",\n'
' "confidence": 0.86,\n'
' "evidence": ["OCR 中出现 滴滴出行、订单号、上车/下车 等交通特征"],\n'
' "fields": [{"key": "amount", "label": "金额", "value": "32.5"}]\n'
"}"
)
if preview_data_url:
response_text = self.runtime_chat_service.complete(
[
{"role": "system", "content": system_prompt},
{
"role": "user",
"content": [
{"type": "text", "text": user_prompt},
{"type": "image_url", "image_url": {"url": preview_data_url}},
],
},
],
slot_priority=("vlm",),
max_tokens=320,
temperature=0.0,
)
parsed = self._parse_llm_payload(response_text)
if parsed is not None:
return "llm_vision", parsed
response_text = self.runtime_chat_service.complete(
[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
slot_priority=("main", "backup"),
max_tokens=320,
temperature=0.0,
)
parsed = self._parse_llm_payload(response_text)
if parsed is not None:
return "llm_text", parsed
return None
@staticmethod