feat: 集成Hermes智能体系统,增强聊天和差旅报销功能

This commit is contained in:
caoxiaozhu
2026-05-16 06:14:08 +00:00
parent 763afa0ee2
commit 212c935308
46 changed files with 8802 additions and 5372 deletions

View File

@@ -72,6 +72,23 @@ STRUCTURED_PREVIEW_EXTENSIONS = {"docx", "xlsx", "pptx"} | TEXT_EXTENSIONS
# Union of "pdf" and every extension in IMAGE_EXTENSIONS (defined elsewhere in this module).
INLINE_PREVIEW_EXTENSIONS = {"pdf"} | IMAGE_EXTENSIONS
# NOTE(review): presumably the office formats that can be opened in the OnlyOffice editor — confirm at the call sites.
ONLYOFFICE_EDITABLE_EXTENSIONS = {"docx", "xlsx", "pptx"}
# NOTE(review): presumably the age in seconds after which an ingest sync counts as stale — usage is not visible here.
KNOWLEDGE_INGEST_SYNC_STALE_SECONDS = 90
# Default value of the `limit` argument of KnowledgeService.search_llm_wiki.
KNOWLEDGE_SEARCH_RESULT_LIMIT = 3
# Chinese filler / question words that KnowledgeService._extract_search_terms never emits
# as search terms (checked both for whole segments and for the generated n-grams).
KNOWLEDGE_SEARCH_STOP_TERMS = {
"什么",
"怎么",
"如何",
"多少",
"是否",
"可以",
"一下",
"请问",
"帮我",
"一下子",
"这个",
"那个",
"哪些",
"一下吧",
}
# Numeric ingest status codes. NOTE(review): meanings are inferred from the names only — confirm against the index schema.
KNOWLEDGE_INGEST_STATUS_PUBLISHED = 1
KNOWLEDGE_INGEST_STATUS_SYNCING = 2
@@ -346,6 +363,156 @@ class KnowledgeService:
self.ensure_library_ready()
return self.llm_wiki_root
def search_llm_wiki(self, query: str, *, limit: int = KNOWLEDGE_SEARCH_RESULT_LIMIT) -> dict[str, Any]:
    """Search the knowledge candidates stored under the LLM Wiki for *query*.

    Every wiki document that has a matching library entry, a matching wiki
    artifact and a quality status other than ``"failed"`` contributes the
    entries of its ``knowledge_candidates.json``. Each candidate is scored
    with ``_score_knowledge_search_match``; candidates scoring above zero
    become hits, which are boosted by ``_boost_title_family_hits`` and then
    ranked by score (descending), ``"formal"`` quality first, then title.

    Args:
        query: Free-text question typed by the user.
        limit: Maximum number of hits to return; values below 1 are
            treated as 1.

    Returns:
        A dict with the keys ``result_type`` (always ``"knowledge_search"``),
        ``query``, ``record_count``, ``hits``, ``references`` (the hit
        codes) and a user-facing ``message``.
    """
    self.ensure_library_ready()
    normalized_query = self._normalize_search_text(query)
    if not normalized_query:
        # Nothing searchable is left once punctuation and whitespace are removed.
        return {
            "result_type": "knowledge_search",
            "query": "",
            "record_count": 0,
            "hits": [],
            "references": [],
            "message": "请先输入要检索的制度或规则问题。",
        }
    display_query = str(query).strip()

    index = self._load_index()
    if self._reconcile_document_ingest_statuses(index):
        # Persist the index only when reconciliation reports a change.
        self._save_index(index)
    entry_by_id: dict[str, Any] = {}
    for item in index.get("documents") or []:
        entry_id = str(item.get("id") or "").strip()
        if entry_id:
            entry_by_id[entry_id] = item

    wiki_index = self._load_llm_wiki_index()
    query_terms = self._extract_search_terms(query)
    hits: list[dict[str, Any]] = []
    for wiki_document in wiki_index.get("documents") or []:
        document_id = str(wiki_document.get("document_id") or "").strip()
        if not document_id:
            continue
        entry = entry_by_id.get(document_id)
        if entry is None or not self._has_matching_llm_wiki_artifact(entry, wiki_document):
            continue
        quality_status = str(wiki_document.get("quality_status") or "").strip()
        if quality_status == "failed":
            continue
        document_name = str(wiki_document.get("document_name") or entry.get("original_name") or "").strip()
        document_dir = self.llm_wiki_documents_root / document_id
        candidates = self._load_json_file(document_dir / "knowledge_candidates.json", default=[])
        if not isinstance(candidates, list):
            # A file holding null / a number / an object has no candidate
            # entries; skip it instead of failing inside enumerate().
            continue
        # Per-document values shared by every hit of this document.
        document_version = str(wiki_document.get("document_version") or "").strip() or None
        document_updated_at = self._format_search_timestamp(wiki_document.get("updated_at"))
        # `position` (not `index`) so the library index loaded above is not shadowed.
        for position, candidate in enumerate(candidates, start=1):
            if not isinstance(candidate, dict):
                continue
            title = str(candidate.get("title") or "").strip()
            content = str(candidate.get("content") or "").strip()
            tags = [str(item).strip() for item in candidate.get("tags") or [] if str(item).strip()]
            evidence = [str(item).strip() for item in candidate.get("evidence") or [] if str(item).strip()]
            score, matched_terms = self._score_knowledge_search_match(
                query_text=normalized_query,
                query_terms=query_terms,
                title=title,
                content=content,
                tags=tags,
                document_name=document_name,
                evidence=evidence,
            )
            if score <= 0:
                continue
            # Fall back to a positional id when the candidate carries none.
            candidate_id = str(candidate.get("candidate_id") or f"candidate_{position}").strip()
            hits.append(
                {
                    "code": f"knowledge.{document_id}.{candidate_id}",
                    "candidate_id": candidate_id,
                    "title": title or document_name or "制度知识条目",
                    "content": content,
                    "excerpt": self._build_search_excerpt(content or title, query_terms),
                    "document_id": document_id,
                    "document_name": document_name,
                    "version": document_version,
                    "updated_at": document_updated_at,
                    "quality_status": quality_status,
                    "tags": tags,
                    "evidence": evidence,
                    "score": score,
                    "matched_terms": matched_terms,
                }
            )

    self._boost_title_family_hits(hits)
    ranked_hits = sorted(
        hits,
        key=lambda item: (
            -int(item.get("score") or 0),
            str(item.get("quality_status") or "") != "formal",
            str(item.get("title") or ""),
        ),
    )[: max(1, limit)]

    if ranked_hits:
        # Separate the (at most two) leading titles so they stay readable
        # instead of running together as one string.
        titles = "、".join(
            str(item.get("title") or "") for item in ranked_hits[:2] if str(item.get("title") or "").strip()
        )
        priority_note = f"优先参考:{titles}" if titles else ""
        return {
            "result_type": "knowledge_search",
            "query": display_query,
            "record_count": len(ranked_hits),
            "hits": ranked_hits,
            "references": [
                str(item.get("code") or "").strip() for item in ranked_hits if str(item.get("code") or "").strip()
            ],
            "message": f"已从已归纳制度知识中检索到 {len(ranked_hits)} 条相关内容。{priority_note}",
        }
    return {
        "result_type": "knowledge_search",
        "query": display_query,
        "record_count": 0,
        "hits": [],
        "references": [],
        "message": (
            f"当前未在已归纳制度知识中检索到与“{display_query}”直接匹配的内容。"
            "知识问答仅基于 LLM Wiki 已形成的知识条目回答;当前依据不足,不能继续扩展回答。"
        ),
    }
@staticmethod
def _boost_title_family_hits(hits: list[dict[str, Any]]) -> None:
if len(hits) < 2:
return
preliminary = sorted(
hits,
key=lambda item: (
-int(item.get("score") or 0),
str(item.get("quality_status") or "") != "formal",
str(item.get("title") or ""),
),
)
primary = preliminary[0]
primary_title = str(primary.get("title") or "").strip()
primary_document_id = str(primary.get("document_id") or "").strip()
if len(primary_title) < 3 or not primary_document_id:
return
family_key = primary_title[:3]
family_hits = [
item
for item in hits
if str(item.get("document_id") or "").strip() == primary_document_id
and str(item.get("title") or "").strip().startswith(family_key)
]
if len(family_hits) < 2:
return
for item in family_hits:
item["score"] = int(item.get("score") or 0) + 20
def extract_document_text(self, document_id: str) -> str:
self.ensure_library_ready()
entry = self.get_document_entry(document_id)
@@ -830,6 +997,151 @@ class KnowledgeService:
if str(item.get("document_id") or "").strip()
}
@staticmethod
def _load_json_file(path: Path, *, default: Any) -> Any:
try:
return json.loads(path.read_text(encoding="utf-8"))
except (FileNotFoundError, json.JSONDecodeError):
return default
@staticmethod
def _load_text_file(path: Path) -> str:
try:
return path.read_text(encoding="utf-8").strip()
except FileNotFoundError:
return ""
@staticmethod
def _normalize_search_text(value: Any) -> str:
text = str(value or "").strip().lower()
return re.sub(r"[^0-9a-z\u4e00-\u9fff]+", "", text)
@staticmethod
def _extract_search_terms(query: str) -> list[str]:
    """Split *query* into search terms, longest first.

    The query is normalized, then cut into runs of ASCII alphanumerics and
    runs of CJK characters. Runs shorter than two characters are ignored.
    Each remaining run is a term unless it is a stop term; CJK runs also
    contribute all of their 2- to 4-character substrings, again minus stop
    terms. The result is sorted by descending length, then alphabetically.
    """
    cleaned = KnowledgeService._normalize_search_text(query)
    if not cleaned:
        return []

    collected: set[str] = set()
    for segment in re.findall(r"[0-9a-z]+|[\u4e00-\u9fff]+", cleaned):
        segment_length = len(segment)
        if segment_length < 2:
            continue
        if segment not in KNOWLEDGE_SEARCH_STOP_TERMS:
            collected.add(segment)
        # Only pure CJK runs are expanded into n-grams.
        if re.fullmatch(r"[\u4e00-\u9fff]+", segment) is None:
            continue
        for width in range(2, min(4, segment_length) + 1):
            windows = (segment[start : start + width] for start in range(segment_length - width + 1))
            collected.update(window for window in windows if window not in KNOWLEDGE_SEARCH_STOP_TERMS)

    return sorted(collected, key=lambda term: (-len(term), term))
@staticmethod
def _score_knowledge_search_match(
    *,
    query_text: str,
    query_terms: list[str],
    title: str,
    content: str,
    tags: list[str],
    document_name: str,
    evidence: list[str],
) -> tuple[int, list[str]]:
    """Score one knowledge candidate against a query.

    Three contributions are summed, all on normalized text:

    1. The whole query contained in the title (140), else in a tag (120),
       else in the content (88).
    2. For the title, each tag and the document name: the phrase contained
       in the query (24 plus up to 18 by length, and recorded as a match),
       else the query contained in the phrase (16).
    3. For each query term, the first field that contains it: title, tags,
       content, then document name or evidence, with larger scores for
       terms of four or more characters.

    A bonus of 4 per distinct match (at most 24) is added at the end.

    Returns:
        ``(score, matches)`` where ``matches`` holds up to six distinct
        matched strings in first-seen order, or ``(0, [])`` when nothing
        matched.
    """
    normalize = KnowledgeService._normalize_search_text
    title_key = normalize(title)
    content_key = normalize(content)
    tag_keys = [normalize(tag) for tag in tags]
    document_key = normalize(document_name)
    evidence_keys = [normalize(line) for line in evidence]

    total = 0
    matches: list[str] = []

    # 1) Whole-query containment; only the strongest field counts.
    if query_text:
        if query_text in title_key:
            total += 140
        elif any(query_text in tag_key for tag_key in tag_keys):
            total += 120
        elif query_text in content_key:
            total += 88

    # 2) Field phrases quoted inside the query, or the query inside a phrase.
    for label in (title_key, *tag_keys, document_key):
        if not label:
            continue
        if label in query_text:
            total += 24 + min(18, len(label) * 2)
            matches.append(label)
        elif query_text and query_text in label:
            total += 16

    # 3) Individual terms; the first field that contains the term decides its weight.
    for term in query_terms:
        if len(term) < 2:
            continue
        is_long_term = len(term) >= 4
        if term in title_key:
            gained = 18 if is_long_term else 14
        elif any(term in tag_key for tag_key in tag_keys):
            gained = 16 if is_long_term else 12
        elif term in content_key:
            gained = 10 if is_long_term else 8
        elif term in document_key or any(term in evidence_key for evidence_key in evidence_keys):
            gained = 6
        else:
            continue
        total += gained
        matches.append(term)

    if total <= 0:
        return 0, []

    # Order-preserving de-duplication of the non-empty matches.
    unique_matches = list(dict.fromkeys(match for match in matches if match))
    total += min(24, len(unique_matches) * 4)
    return total, unique_matches[:6]
@staticmethod
def _build_search_excerpt(text: str, query_terms: list[str], *, max_length: int = 140) -> str:
    """Build a short plain-text excerpt of *text* around the first matching term.

    Markdown punctuation (``# * _ ` > - [ ]``) is replaced by spaces and
    whitespace is collapsed. The terms are tried in order; for the first
    one that can be located in the plain text the excerpt starts up to 36
    characters before the match and is marked with ``...`` on each side
    that was cut. When no term can be located, the head of the text is
    returned, truncated to ``max_length`` characters.

    Args:
        text: Source text, possibly containing Markdown markup.
        query_terms: Lower-case search terms, in order of preference.
        max_length: Target excerpt length before ellipses are added.

    Returns:
        The excerpt, or an empty string when *text* has no visible content.
    """
    plain_text = re.sub(r"[#*_`>\-\[\]]+", " ", str(text or ""))
    plain_text = re.sub(r"\s+", " ", plain_text).strip()
    if not plain_text:
        return ""
    normalized_text = KnowledgeService._normalize_search_text(plain_text)
    # Terms are lower-case, so locate them in a lower-cased copy; otherwise
    # "Reimbursement" in the text is never found for the term "reimbursement".
    # str.lower() can lengthen a few code points, which would shift indices,
    # so the copy is only used when it lines up with the original.
    searchable_text = plain_text.lower()
    if len(searchable_text) != len(plain_text):
        searchable_text = plain_text
    for term in query_terms:
        if not term or term not in normalized_text:
            continue
        raw_index = searchable_text.find(term)
        if raw_index == -1:
            # The term spans characters that normalization removed (spaces,
            # punctuation), so it has no contiguous position in the text.
            continue
        start = max(0, raw_index - 36)
        end = min(len(plain_text), raw_index + max_length - 36)
        snippet = plain_text[start:end].strip(" ,。;:")
        if start > 0:
            snippet = f"...{snippet}"
        if end < len(plain_text):
            snippet = f"{snippet}..."
        return snippet
    if len(plain_text) <= max_length:
        return plain_text
    return f"{plain_text[: max_length - 3].rstrip()}..."
@staticmethod
def _format_search_timestamp(value: Any) -> str | None:
raw_value = str(value or "").strip()
if not raw_value:
return None
try:
parsed = datetime.fromisoformat(raw_value)
except ValueError:
return raw_value or None
if parsed.tzinfo is None:
parsed = parsed.replace(tzinfo=UTC)
return parsed.astimezone(UTC).date().isoformat()
def _has_ingested_llm_wiki_document(
self,
entry: dict[str, Any],