feat: 集成Hermes智能体系统,增强聊天和差旅报销功能
This commit is contained in:
@@ -72,6 +72,23 @@ STRUCTURED_PREVIEW_EXTENSIONS = {"docx", "xlsx", "pptx"} | TEXT_EXTENSIONS
|
||||
INLINE_PREVIEW_EXTENSIONS = {"pdf"} | IMAGE_EXTENSIONS
|
||||
ONLYOFFICE_EDITABLE_EXTENSIONS = {"docx", "xlsx", "pptx"}
|
||||
KNOWLEDGE_INGEST_SYNC_STALE_SECONDS = 90
|
||||
KNOWLEDGE_SEARCH_RESULT_LIMIT = 3
|
||||
# Filler words that carry no retrieval signal. _extract_search_terms skips any
# query segment or n-gram that is exactly equal to one of these entries.
KNOWLEDGE_SEARCH_STOP_TERMS = {
    # Interrogatives
    "什么", "怎么", "如何", "多少", "是否", "哪些",
    # Request / politeness fillers
    "可以", "请问", "帮我",
    # Softeners
    "一下", "一下子", "一下吧",
    # Demonstratives
    "这个", "那个",
}
|
||||
|
||||
KNOWLEDGE_INGEST_STATUS_PUBLISHED = 1
|
||||
KNOWLEDGE_INGEST_STATUS_SYNCING = 2
|
||||
@@ -346,6 +363,156 @@ class KnowledgeService:
|
||||
self.ensure_library_ready()
|
||||
return self.llm_wiki_root
|
||||
|
||||
def search_llm_wiki(self, query: str, *, limit: int = KNOWLEDGE_SEARCH_RESULT_LIMIT) -> dict[str, Any]:
    """Search the LLM Wiki knowledge candidates for entries matching ``query``.

    The library index is reconciled first (and saved when
    ``_reconcile_document_ingest_statuses`` reports a change). Every wiki
    document that has a library entry, passes
    ``_has_matching_llm_wiki_artifact`` and whose ``quality_status`` is not
    ``"failed"`` contributes the entries of its ``knowledge_candidates.json``.
    Candidates are scored with ``_score_knowledge_search_match``; those scoring
    zero or less are dropped. Remaining hits are ordered by score (descending),
    then ``formal`` quality first, then title.

    Args:
        query: Free-text search query.
        limit: Maximum number of hits to return; values below 1 are treated as 1.

    Returns:
        A dict with the keys ``result_type`` (always ``"knowledge_search"``),
        ``query``, ``record_count``, ``hits``, ``references`` (hit codes) and
        ``message``. When the query normalises to an empty string, ``query`` is
        ``""`` and the message asks for input.
    """
    self.ensure_library_ready()
    normalized_query = self._normalize_search_text(query)
    if not normalized_query:
        return {
            "result_type": "knowledge_search",
            "query": "",
            "record_count": 0,
            "hits": [],
            "references": [],
            "message": "请先输入要检索的制度或规则问题。",
        }

    display_query = str(query).strip()

    index = self._load_index()
    if self._reconcile_document_ingest_statuses(index):
        self._save_index(index)
    entry_by_id = {
        str(item.get("id") or "").strip(): item
        for item in list(index.get("documents") or [])
        if str(item.get("id") or "").strip()
    }
    wiki_index = self._load_llm_wiki_index()
    query_terms = self._extract_search_terms(query)
    hits: list[dict[str, Any]] = []

    for wiki_document in list(wiki_index.get("documents") or []):
        document_id = str(wiki_document.get("document_id") or "").strip()
        if not document_id:
            continue
        entry = entry_by_id.get(document_id)
        if entry is None or not self._has_matching_llm_wiki_artifact(entry, wiki_document):
            continue

        quality_status = str(wiki_document.get("quality_status") or "").strip()
        if quality_status == "failed":
            continue

        document_name = str(wiki_document.get("document_name") or entry.get("original_name") or "").strip()
        # Per-document values are identical for every candidate, so compute them once.
        document_version = str(wiki_document.get("document_version") or "").strip() or None
        document_updated_at = self._format_search_timestamp(wiki_document.get("updated_at"))

        document_dir = self.llm_wiki_documents_root / document_id
        candidates = self._load_json_file(document_dir / "knowledge_candidates.json", default=[])
        if not isinstance(candidates, list):
            # A candidates file whose top level is not a list (object, number,
            # string, ...) holds nothing searchable; skip it instead of failing.
            continue

        # ``position`` deliberately does not reuse the name ``index``, which
        # holds the library index loaded above.
        for position, candidate in enumerate(candidates, start=1):
            if not isinstance(candidate, dict):
                continue
            title = str(candidate.get("title") or "").strip()
            content = str(candidate.get("content") or "").strip()
            tags = [str(item).strip() for item in list(candidate.get("tags") or []) if str(item).strip()]
            evidence = [
                str(item).strip() for item in list(candidate.get("evidence") or []) if str(item).strip()
            ]
            score, matched_terms = self._score_knowledge_search_match(
                query_text=normalized_query,
                query_terms=query_terms,
                title=title,
                content=content,
                tags=tags,
                document_name=document_name,
                evidence=evidence,
            )
            if score <= 0:
                continue

            # Fall back to a positional id when the stored id is missing or
            # blank, so the hit code never ends in an empty segment.
            candidate_id = str(candidate.get("candidate_id") or "").strip() or f"candidate_{position}"
            hits.append(
                {
                    "code": f"knowledge.{document_id}.{candidate_id}",
                    "candidate_id": candidate_id,
                    "title": title or document_name or "制度知识条目",
                    "content": content,
                    "excerpt": self._build_search_excerpt(content or title, query_terms),
                    "document_id": document_id,
                    "document_name": document_name,
                    "version": document_version,
                    "updated_at": document_updated_at,
                    "quality_status": quality_status,
                    "tags": tags,
                    "evidence": evidence,
                    "score": score,
                    "matched_terms": matched_terms,
                }
            )

    self._boost_title_family_hits(hits)
    ranked_hits = sorted(
        hits,
        key=lambda item: (
            -int(item.get("score") or 0),
            str(item.get("quality_status") or "") != "formal",
            str(item.get("title") or ""),
        ),
    )[: max(1, limit)]

    if not ranked_hits:
        return {
            "result_type": "knowledge_search",
            "query": display_query,
            "record_count": 0,
            "hits": [],
            "references": [],
            "message": (
                f"当前未在已归纳制度知识中检索到与“{display_query}”直接匹配的内容。"
                "知识问答仅基于 LLM Wiki 已形成的知识条目回答;当前依据不足,不能继续扩展回答。"
            ),
        }

    # The message names at most the two best-ranked titles.
    titles = "、".join(
        str(item.get("title") or "") for item in ranked_hits[:2] if str(item.get("title") or "").strip()
    )
    priority_hint = f"优先参考:{titles}。" if titles else ""
    return {
        "result_type": "knowledge_search",
        "query": display_query,
        "record_count": len(ranked_hits),
        "hits": ranked_hits,
        "references": [
            str(item.get("code") or "").strip() for item in ranked_hits if str(item.get("code") or "").strip()
        ],
        "message": f"已从已归纳制度知识中检索到 {len(ranked_hits)} 条相关内容。{priority_hint}",
    }
|
||||
|
||||
@staticmethod
|
||||
def _boost_title_family_hits(hits: list[dict[str, Any]]) -> None:
|
||||
if len(hits) < 2:
|
||||
return
|
||||
preliminary = sorted(
|
||||
hits,
|
||||
key=lambda item: (
|
||||
-int(item.get("score") or 0),
|
||||
str(item.get("quality_status") or "") != "formal",
|
||||
str(item.get("title") or ""),
|
||||
),
|
||||
)
|
||||
primary = preliminary[0]
|
||||
primary_title = str(primary.get("title") or "").strip()
|
||||
primary_document_id = str(primary.get("document_id") or "").strip()
|
||||
if len(primary_title) < 3 or not primary_document_id:
|
||||
return
|
||||
|
||||
family_key = primary_title[:3]
|
||||
family_hits = [
|
||||
item
|
||||
for item in hits
|
||||
if str(item.get("document_id") or "").strip() == primary_document_id
|
||||
and str(item.get("title") or "").strip().startswith(family_key)
|
||||
]
|
||||
if len(family_hits) < 2:
|
||||
return
|
||||
for item in family_hits:
|
||||
item["score"] = int(item.get("score") or 0) + 20
|
||||
|
||||
def extract_document_text(self, document_id: str) -> str:
|
||||
self.ensure_library_ready()
|
||||
entry = self.get_document_entry(document_id)
|
||||
@@ -830,6 +997,151 @@ class KnowledgeService:
|
||||
if str(item.get("document_id") or "").strip()
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _load_json_file(path: Path, *, default: Any) -> Any:
|
||||
try:
|
||||
return json.loads(path.read_text(encoding="utf-8"))
|
||||
except (FileNotFoundError, json.JSONDecodeError):
|
||||
return default
|
||||
|
||||
@staticmethod
|
||||
def _load_text_file(path: Path) -> str:
|
||||
try:
|
||||
return path.read_text(encoding="utf-8").strip()
|
||||
except FileNotFoundError:
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _normalize_search_text(value: Any) -> str:
|
||||
text = str(value or "").strip().lower()
|
||||
return re.sub(r"[^0-9a-z\u4e00-\u9fff]+", "", text)
|
||||
|
||||
@staticmethod
def _extract_search_terms(query: str) -> list[str]:
    """Derive the set of search terms for ``query``.

    The query is normalised and split into runs of ASCII alphanumerics and
    runs of CJK ideographs. Runs of a single character are ignored. Each
    remaining run becomes a term unless it is a stop term; CJK runs also
    contribute every 2- to 4-character substring that is not a stop term.

    Returns:
        Unique terms ordered longest first, ties broken lexicographically.
    """
    normalized = KnowledgeService._normalize_search_text(query)
    if not normalized:
        return []

    collected: set[str] = set()
    for segment in re.findall(r"[0-9a-z]+|[\u4e00-\u9fff]+", normalized):
        segment_length = len(segment)
        if segment_length < 2:
            continue
        if segment not in KNOWLEDGE_SEARCH_STOP_TERMS:
            collected.add(segment)
        if re.fullmatch(r"[\u4e00-\u9fff]+", segment) is None:
            # Alphanumeric runs are kept whole; only CJK runs are split into n-grams.
            continue
        for width in range(2, min(4, segment_length) + 1):
            grams = (segment[start : start + width] for start in range(segment_length - width + 1))
            collected.update(gram for gram in grams if gram not in KNOWLEDGE_SEARCH_STOP_TERMS)

    return sorted(collected, key=lambda term: (-len(term), term))
|
||||
|
||||
@staticmethod
def _score_knowledge_search_match(
    *,
    query_text: str,
    query_terms: list[str],
    title: str,
    content: str,
    tags: list[str],
    document_name: str,
    evidence: list[str],
) -> tuple[int, list[str]]:
    """Score how well one knowledge candidate matches a query.

    All candidate fields are normalised with ``_normalize_search_text`` before
    comparison. The score is the sum of:

    * a whole-query bonus: 140 if the query is inside the title, else 120 if
      inside any tag, else 88 if inside the content;
    * for each of title, tags and document name: ``24 + min(18, 2 * len)``
      when the field is contained in the query, else 16 when the query is
      contained in the field;
    * for each query term longer than one character, the first that applies:
      title (18 if the term has 4+ characters, else 14), tag (16 / 12),
      content (10 / 8), document name or evidence (6);
    * 4 points per distinct matched term, capped at 24.

    Args:
        query_text: Normalised query string.
        query_terms: Terms extracted from the query.
        title, content, tags, document_name, evidence: Raw candidate fields.

    Returns:
        ``(score, matched_terms)`` where ``matched_terms`` holds at most six
        distinct matches in first-seen order, or ``(0, [])`` when nothing matched.
    """
    normalize = KnowledgeService._normalize_search_text
    title_key = normalize(title)
    content_key = normalize(content)
    tag_keys = [normalize(tag) for tag in tags]
    document_key = normalize(document_name)
    evidence_keys = [normalize(snippet) for snippet in evidence]

    total = 0
    matched: list[str] = []

    # Whole-query containment: only the strongest field counts.
    if query_text:
        if query_text in title_key:
            total += 140
        elif any(query_text in tag for tag in tag_keys):
            total += 120
        elif query_text in content_key:
            total += 88

    # Field-level containment in either direction.
    for phrase in (title_key, *tag_keys, document_key):
        if not phrase:
            continue
        if phrase in query_text:
            total += 24 + min(18, len(phrase) * 2)
            matched.append(phrase)
        elif query_text and query_text in phrase:
            total += 16

    # Per-term containment; each term scores once, in its strongest field.
    for term in query_terms:
        if len(term) <= 1:
            continue
        is_long = len(term) >= 4
        if term in title_key:
            bonus = 18 if is_long else 14
        elif any(term in tag for tag in tag_keys):
            bonus = 16 if is_long else 12
        elif term in content_key:
            bonus = 10 if is_long else 8
        elif term in document_key or any(term in snippet for snippet in evidence_keys):
            bonus = 6
        else:
            continue
        total += bonus
        matched.append(term)

    if total <= 0:
        return 0, []

    # dict.fromkeys removes duplicates while keeping first-seen order.
    distinct = list(dict.fromkeys(item for item in matched if item))
    total += min(24, len(distinct) * 4)
    return total, distinct[:6]
|
||||
|
||||
@staticmethod
|
||||
def _build_search_excerpt(text: str, query_terms: list[str], *, max_length: int = 140) -> str:
|
||||
plain_text = re.sub(r"[#*_`>\-\[\]]+", " ", str(text or ""))
|
||||
plain_text = re.sub(r"\s+", " ", plain_text).strip()
|
||||
if not plain_text:
|
||||
return ""
|
||||
|
||||
normalized_text = KnowledgeService._normalize_search_text(plain_text)
|
||||
for term in query_terms:
|
||||
if not term or term not in normalized_text:
|
||||
continue
|
||||
raw_index = plain_text.find(term)
|
||||
if raw_index == -1:
|
||||
continue
|
||||
start = max(0, raw_index - 36)
|
||||
end = min(len(plain_text), raw_index + max_length - 36)
|
||||
snippet = plain_text[start:end].strip(" ,。;:")
|
||||
if start > 0:
|
||||
snippet = f"...{snippet}"
|
||||
if end < len(plain_text):
|
||||
snippet = f"{snippet}..."
|
||||
return snippet
|
||||
|
||||
if len(plain_text) <= max_length:
|
||||
return plain_text
|
||||
return f"{plain_text[: max_length - 3].rstrip()}..."
|
||||
|
||||
@staticmethod
|
||||
def _format_search_timestamp(value: Any) -> str | None:
|
||||
raw_value = str(value or "").strip()
|
||||
if not raw_value:
|
||||
return None
|
||||
try:
|
||||
parsed = datetime.fromisoformat(raw_value)
|
||||
except ValueError:
|
||||
return raw_value or None
|
||||
if parsed.tzinfo is None:
|
||||
parsed = parsed.replace(tzinfo=UTC)
|
||||
return parsed.astimezone(UTC).date().isoformat()
|
||||
|
||||
def _has_ingested_llm_wiki_document(
|
||||
self,
|
||||
entry: dict[str, Any],
|
||||
|
||||
Reference in New Issue
Block a user