feat(server): 重构知识库服务和路由配置,优化LLM维基知识管理接口,增强知识检索能力

This commit is contained in:
caoxiaozhu
2026-05-15 09:33:59 +00:00
parent 7a3feb14a0
commit 6793b6f832
7 changed files with 812 additions and 123 deletions

View File

@@ -16,18 +16,19 @@ from app.schemas.knowledge import (
KnowledgeActionResponse, KnowledgeActionResponse,
KnowledgeDocumentDetailRead, KnowledgeDocumentDetailRead,
KnowledgeLibraryRead, KnowledgeLibraryRead,
LlmWikiDocumentDetailRead,
LlmWikiIndexRead,
LlmWikiSummaryUpdateWrite,
KnowledgeOnlyOfficeCallbackRead, KnowledgeOnlyOfficeCallbackRead,
KnowledgeOnlyOfficeCallbackWrite, KnowledgeOnlyOfficeCallbackWrite,
KnowledgeOnlyOfficeConfigRead, KnowledgeOnlyOfficeConfigRead,
LlmWikiSyncRead, LlmWikiDocumentDetailRead,
LlmWikiIndexRead,
LlmWikiSyncTaskRead,
LlmWikiSyncWrite, LlmWikiSyncWrite,
LlmWikiSummaryUpdateWrite,
) )
from app.services.agent_runs import AgentRunService from app.services.agent_runs import AgentRunService
from app.services.knowledge import KnowledgeService from app.services.knowledge import KNOWLEDGE_INGEST_STATUS_SYNCING, KnowledgeService
from app.services.llm_wiki import LlmWikiService from app.services.llm_wiki import LlmWikiService
from app.services.llm_wiki_tasks import llm_wiki_task_manager
router = APIRouter(prefix="/knowledge") router = APIRouter(prefix="/knowledge")
@@ -46,8 +47,9 @@ router = APIRouter(prefix="/knowledge")
) )
def get_knowledge_library( def get_knowledge_library(
_: Annotated[CurrentUserContext, Depends(get_current_user)], _: Annotated[CurrentUserContext, Depends(get_current_user)],
db: Annotated[Session, Depends(get_db)],
) -> KnowledgeLibraryRead: ) -> KnowledgeLibraryRead:
return KnowledgeService().list_library() return KnowledgeService(db=db).list_library()
@router.get( @router.get(
@@ -140,9 +142,9 @@ def update_llm_wiki_document_summary(
@router.post( @router.post(
"/llm-wiki/sync", "/llm-wiki/sync",
response_model=LlmWikiSyncRead, response_model=LlmWikiSyncTaskRead,
summary="触发 Hermes 形成 LLM Wiki 与规则草稿", summary="异步触发 Hermes 形成 LLM Wiki 与规则草稿",
description="按知识库文档变化情况增量触发系统 Hermes,形成知识候选和规则草稿", description="按知识库文档变化情况系统 Hermes 归纳任务放入后台执行,并返回可追踪的 AgentRun 编号",
responses={ responses={
status.HTTP_401_UNAUTHORIZED: { status.HTTP_401_UNAUTHORIZED: {
"model": ErrorResponse, "model": ErrorResponse,
@@ -158,8 +160,15 @@ def sync_llm_wiki(
payload: LlmWikiSyncWrite, payload: LlmWikiSyncWrite,
current_user: Annotated[CurrentUserContext, Depends(require_admin_user)], current_user: Annotated[CurrentUserContext, Depends(require_admin_user)],
db: Annotated[Session, Depends(get_db)], db: Annotated[Session, Depends(get_db)],
) -> LlmWikiSyncRead: ) -> LlmWikiSyncTaskRead:
run_service = AgentRunService(db) run_service = AgentRunService(db)
knowledge_service = KnowledgeService(db=db)
requested_ids = {str(item).strip() for item in payload.document_ids if str(item).strip()}
target_document_ids = [
str(item.get("id") or "").strip()
for item in knowledge_service.list_folder_documents(folder=payload.folder)
if str(item.get("id") or "").strip() and (not requested_ids or str(item.get("id") or "").strip() in requested_ids)
]
task_asset = db.scalar( task_asset = db.scalar(
select(AgentAsset).where(AgentAsset.code == "task.hermes.llm_wiki_rule_formation") select(AgentAsset).where(AgentAsset.code == "task.hermes.llm_wiki_rule_formation")
) )
@@ -170,47 +179,52 @@ def sync_llm_wiki(
task_id=task_asset.id if task_asset is not None else None, task_id=task_asset.id if task_asset is not None else None,
permission_level=AgentPermissionLevel.READ.value, permission_level=AgentPermissionLevel.READ.value,
status=AgentRunStatus.RUNNING.value, status=AgentRunStatus.RUNNING.value,
result_summary="Hermes 正在形成 LLM Wiki 与规则草稿", result_summary="Hermes 归纳任务已入队,等待后台执行",
route_json={
"job_type": "llm_wiki_sync",
"phase": "queued",
"folder": payload.folder,
"force": payload.force,
"requested_document_ids": target_document_ids,
"progress": {
"total_documents": len(target_document_ids),
"completed_documents": 0,
"failed_documents": 0,
"skipped_documents": 0,
"percent": 0,
},
},
) )
try: try:
result = LlmWikiService(db).sync_folder( if target_document_ids:
knowledge_service.set_document_ingest_statuses(
target_document_ids,
status_code=KNOWLEDGE_INGEST_STATUS_SYNCING,
agent_run_id=run.run_id,
)
llm_wiki_task_manager.submit_sync(
agent_run_id=run.run_id,
folder=payload.folder, folder=payload.folder,
current_user=current_user, current_user=current_user,
document_ids=payload.document_ids, document_ids=target_document_ids,
force=payload.force, force=payload.force,
) )
run_service.record_tool_call( return LlmWikiSyncTaskRead(
run_id=run.run_id, ok=True,
tool_type="llm", agent_run_id=run.run_id,
tool_name="system_hermes_llm_wiki_sync", folder=payload.folder,
request_json=payload.model_dump(), document_ids=target_document_ids,
response_json=result.model_dump(), queued_at=run.started_at,
status="succeeded", status=run.status,
duration_ms=0, summary="Hermes 已进入后台归纳,可在日志管理查看进度。",
) )
run_service.update_run(
run.run_id,
status=AgentRunStatus.SUCCEEDED.value,
result_summary=result.summary,
finished_at=datetime.now(UTC),
)
return result
except Exception as exc: except Exception as exc:
run_service.record_tool_call(
run_id=run.run_id,
tool_type="llm",
tool_name="system_hermes_llm_wiki_sync",
request_json=payload.model_dump(),
response_json={"error": str(exc)},
status="failed",
duration_ms=0,
error_message=str(exc),
)
run_service.update_run( run_service.update_run(
run.run_id, run.run_id,
status=AgentRunStatus.FAILED.value, status=AgentRunStatus.FAILED.value,
error_message=str(exc), error_message=str(exc),
result_summary=str(exc),
finished_at=datetime.now(UTC), finished_at=datetime.now(UTC),
) )
if isinstance(exc, ValueError): if isinstance(exc, ValueError):
@@ -239,9 +253,10 @@ def sync_llm_wiki(
def get_knowledge_document( def get_knowledge_document(
document_id: str, document_id: str,
_: Annotated[CurrentUserContext, Depends(get_current_user)], _: Annotated[CurrentUserContext, Depends(get_current_user)],
db: Annotated[Session, Depends(get_db)],
) -> KnowledgeDocumentDetailRead: ) -> KnowledgeDocumentDetailRead:
try: try:
return KnowledgeService().get_document_detail(document_id) return KnowledgeService(db=db).get_document_detail(document_id)
except FileNotFoundError as exc: except FileNotFoundError as exc:
raise HTTPException( raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND, status_code=status.HTTP_404_NOT_FOUND,

View File

@@ -13,6 +13,7 @@ from app.api.v1.endpoints.ontology import router as ontology_router
from app.api.v1.endpoints.orchestrator import router as orchestrator_router from app.api.v1.endpoints.orchestrator import router as orchestrator_router
from app.api.v1.endpoints.reimbursements import router as reimbursements_router from app.api.v1.endpoints.reimbursements import router as reimbursements_router
from app.api.v1.endpoints.settings import router as settings_router from app.api.v1.endpoints.settings import router as settings_router
from app.api.v1.endpoints.system_logs import router as system_logs_router
router = APIRouter() router = APIRouter()
router.include_router(health_router, tags=["health"]) router.include_router(health_router, tags=["health"])
@@ -28,3 +29,4 @@ router.include_router(orchestrator_router, tags=["orchestrator"])
router.include_router(employees_router, prefix="/employees", tags=["employees"]) router.include_router(employees_router, prefix="/employees", tags=["employees"])
router.include_router(reimbursements_router, prefix="/reimbursements", tags=["reimbursements"]) router.include_router(reimbursements_router, prefix="/reimbursements", tags=["reimbursements"])
router.include_router(settings_router, tags=["settings"]) router.include_router(settings_router, tags=["settings"])
router.include_router(system_logs_router, tags=["system-logs"])

View File

@@ -15,6 +15,7 @@ from app.schemas.common import RootStatusRead
from app.services.agent_foundation import prepare_agent_foundation from app.services.agent_foundation import prepare_agent_foundation
from app.services.employee import prepare_employee_directory from app.services.employee import prepare_employee_directory
from app.services.knowledge import prepare_knowledge_library from app.services.knowledge import prepare_knowledge_library
from app.services.llm_wiki_tasks import llm_wiki_task_manager
@asynccontextmanager @asynccontextmanager
@@ -32,6 +33,7 @@ async def lifespan(_: FastAPI) -> AsyncIterator[None]:
settings.api_v1_prefix, settings.api_v1_prefix,
) )
yield yield
llm_wiki_task_manager.shutdown()
def create_app() -> FastAPI: def create_app() -> FastAPI:

View File

@@ -193,5 +193,15 @@ class LlmWikiSyncRead(BaseModel):
summary: str = "" summary: str = ""
class LlmWikiSyncTaskRead(BaseModel):
ok: bool = True
agent_run_id: str
folder: str
document_ids: list[str] = Field(default_factory=list)
queued_at: datetime
status: str = "running"
summary: str = ""
class LlmWikiSummaryUpdateWrite(BaseModel): class LlmWikiSummaryUpdateWrite(BaseModel):
knowledge_summary_markdown: str = Field(min_length=1) knowledge_summary_markdown: str = Field(min_length=1)

View File

@@ -16,10 +16,14 @@ from xml.etree import ElementTree
from zipfile import BadZipFile, ZipFile from zipfile import BadZipFile, ZipFile
import jwt import jwt
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.api.deps import CurrentUserContext from app.api.deps import CurrentUserContext
from app.core.agent_enums import AgentRunStatus
from app.core.config import get_settings from app.core.config import get_settings
from app.core.logging import get_logger from app.core.logging import get_logger
from app.models.agent_run import AgentRun
from app.schemas.knowledge import ( from app.schemas.knowledge import (
KnowledgeDocumentDetailRead, KnowledgeDocumentDetailRead,
KnowledgeDocumentRead, KnowledgeDocumentRead,
@@ -94,8 +98,9 @@ def prepare_knowledge_library() -> None:
class KnowledgeService: class KnowledgeService:
def __init__(self, storage_root: Path | None = None) -> None: def __init__(self, storage_root: Path | None = None, db: Session | None = None) -> None:
settings = get_settings() settings = get_settings()
self.db = db
self.storage_root = Path(storage_root or settings.resolved_storage_root_dir) self.storage_root = Path(storage_root or settings.resolved_storage_root_dir)
self.library_root = self.storage_root / "knowledge" self.library_root = self.storage_root / "knowledge"
self.index_path = self.library_root / ".index.json" self.index_path = self.library_root / ".index.json"
@@ -147,7 +152,8 @@ class KnowledgeService:
self._save_index(index) self._save_index(index)
entry = self._require_entry(index, document_id) entry = self._require_entry(index, document_id)
preview_kind, preview_pages = self._build_preview(entry) preview_kind, preview_pages = self._build_preview(entry)
document = self._serialize_document(entry) wiki_document = self._build_wiki_document_map().get(str(document_id).strip())
document = self._serialize_document(entry, wiki_document=wiki_document)
return KnowledgeDocumentDetailRead( return KnowledgeDocumentDetailRead(
**document.model_dump(), **document.model_dump(),
previewKind=preview_kind, previewKind=preview_kind,
@@ -210,6 +216,7 @@ class KnowledgeService:
"uploaded_by": current_user.name, "uploaded_by": current_user.name,
"version_number": 1, "version_number": 1,
"ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED, "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
"ingest_agent_run_id": "",
} }
index["documents"].append(entry) index["documents"].append(entry)
logger.info( logger.info(
@@ -231,6 +238,7 @@ class KnowledgeService:
"uploaded_by": current_user.name, "uploaded_by": current_user.name,
"version_number": int(existing_entry.get("version_number", 1)) + 1, "version_number": int(existing_entry.get("version_number", 1)) + 1,
"ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED, "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
"ingest_agent_run_id": "",
} }
) )
entry = existing_entry entry = existing_entry
@@ -286,7 +294,13 @@ class KnowledgeService:
self._save_index(index) self._save_index(index)
return dict(self._require_entry(index, document_id)) return dict(self._require_entry(index, document_id))
def set_document_ingest_statuses(self, document_ids: list[str], status_code: int) -> None: def set_document_ingest_statuses(
self,
document_ids: list[str],
status_code: int,
*,
agent_run_id: str | None = None,
) -> None:
self.ensure_library_ready() self.ensure_library_ready()
normalized_ids = {str(item).strip() for item in document_ids if str(item).strip()} normalized_ids = {str(item).strip() for item in document_ids if str(item).strip()}
if not normalized_ids: if not normalized_ids:
@@ -299,9 +313,15 @@ class KnowledgeService:
if str(entry.get("id") or "").strip() not in normalized_ids: if str(entry.get("id") or "").strip() not in normalized_ids:
continue continue
if self._normalize_ingest_status_code(entry.get("ingest_status")) == status_code: if self._normalize_ingest_status_code(entry.get("ingest_status")) == status_code:
if agent_run_id is not None and entry.get("ingest_agent_run_id") != agent_run_id:
entry["ingest_agent_run_id"] = agent_run_id
entry["ingest_status_updated_at"] = updated_at
changed = True
continue continue
entry["ingest_status"] = status_code entry["ingest_status"] = status_code
entry["ingest_status_updated_at"] = updated_at entry["ingest_status_updated_at"] = updated_at
if agent_run_id is not None:
entry["ingest_agent_run_id"] = agent_run_id
changed = True changed = True
if changed: if changed:
@@ -703,6 +723,9 @@ class KnowledgeService:
if item.get("ingest_status") != normalized_status: if item.get("ingest_status") != normalized_status:
item["ingest_status"] = normalized_status item["ingest_status"] = normalized_status
changed = True changed = True
if "ingest_agent_run_id" not in item:
item["ingest_agent_run_id"] = ""
changed = True
existing_items.append(item) existing_items.append(item)
else: else:
changed = True changed = True
@@ -735,6 +758,7 @@ class KnowledgeService:
"uploaded_by": "系统导入", "uploaded_by": "系统导入",
"version_number": 1, "version_number": 1,
"ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED, "ingest_status": KNOWLEDGE_INGEST_STATUS_PUBLISHED,
"ingest_agent_run_id": "",
} }
) )
changed = True changed = True
@@ -773,7 +797,7 @@ class KnowledgeService:
if ( if (
current_status == KNOWLEDGE_INGEST_STATUS_SYNCING current_status == KNOWLEDGE_INGEST_STATUS_SYNCING
and preserve_syncing and preserve_syncing
and not self._is_syncing_status_stale(entry) and self._should_preserve_syncing_status(entry)
): ):
continue continue
@@ -881,6 +905,24 @@ class KnowledgeService:
age_seconds = (datetime.now(UTC) - updated_at.astimezone(UTC)).total_seconds() age_seconds = (datetime.now(UTC) - updated_at.astimezone(UTC)).total_seconds()
return age_seconds >= KNOWLEDGE_INGEST_SYNC_STALE_SECONDS return age_seconds >= KNOWLEDGE_INGEST_SYNC_STALE_SECONDS
def _should_preserve_syncing_status(self, entry: dict[str, Any]) -> bool:
agent_run_id = str(entry.get("ingest_agent_run_id") or "").strip()
if not agent_run_id or self.db is None:
return not self._is_syncing_status_stale(entry)
run = self.db.scalar(select(AgentRun).where(AgentRun.run_id == agent_run_id))
if run is None:
return not self._is_syncing_status_stale(entry)
if run.status != AgentRunStatus.RUNNING.value:
return False
heartbeat_at = str((run.route_json or {}).get("heartbeat_at") or "").strip()
if heartbeat_at:
probe_entry = {"ingest_status_updated_at": heartbeat_at}
return not self._is_syncing_status_stale(probe_entry)
return not self._is_syncing_status_stale(entry)
def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]: def _require_entry(self, index: dict[str, Any], document_id: str) -> dict[str, Any]:
for entry in index["documents"]: for entry in index["documents"]:
if entry["id"] == document_id: if entry["id"] == document_id:

View File

@@ -8,7 +8,7 @@ from dataclasses import dataclass
from datetime import UTC, datetime from datetime import UTC, datetime
from decimal import Decimal from decimal import Decimal
from pathlib import Path from pathlib import Path
from typing import Any, Literal from typing import Any, Callable, Literal
from uuid import uuid4 from uuid import uuid4
from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator
@@ -47,7 +47,8 @@ from app.services.system_hermes import SystemHermesService
logger = get_logger("app.services.llm_wiki") logger = get_logger("app.services.llm_wiki")
HERMES_CANDIDATE_MODEL_TIMEOUT_SECONDS = 10 HERMES_CANDIDATE_MODEL_TIMEOUT_SECONDS = 10
HERMES_CANDIDATE_GROUP_SIZE = 3 HERMES_CANDIDATE_GROUP_SIZE = 2
HERMES_CANDIDATE_CONTENT_LIMIT = 520
LOW_SIGNAL_DOTTED_LINE_PATTERN = re.compile(r"[..。·•]{6,}\s*[0-9]{0,3}$") LOW_SIGNAL_DOTTED_LINE_PATTERN = re.compile(r"[..。·•]{6,}\s*[0-9]{0,3}$")
PAGE_FOOTER_PATTERN = re.compile(r"^第\s*\d+\s*页\s*共\s*\d+\s*页$") PAGE_FOOTER_PATTERN = re.compile(r"^第\s*\d+\s*页\s*共\s*\d+\s*页$")
POLICY_SUBSTANCE_KEYWORDS = ( POLICY_SUBSTANCE_KEYWORDS = (
@@ -412,6 +413,8 @@ class LlmWikiService:
current_user: CurrentUserContext, current_user: CurrentUserContext,
document_ids: list[str] | None = None, document_ids: list[str] | None = None,
force: bool = False, force: bool = False,
agent_run_id: str | None = None,
progress_callback: Callable[[dict[str, Any], str], None] | None = None,
) -> LlmWikiSyncRead: ) -> LlmWikiSyncRead:
self.knowledge_service.ensure_library_ready() self.knowledge_service.ensure_library_ready()
documents = self.knowledge_service.list_folder_documents(folder=folder) documents = self.knowledge_service.list_folder_documents(folder=folder)
@@ -427,6 +430,7 @@ class LlmWikiService:
self.knowledge_service.set_document_ingest_statuses( self.knowledge_service.set_document_ingest_statuses(
target_document_ids, target_document_ids,
status_code=KNOWLEDGE_INGEST_STATUS_SYNCING, status_code=KNOWLEDGE_INGEST_STATUS_SYNCING,
agent_run_id=agent_run_id,
) )
try: try:
@@ -441,9 +445,27 @@ class LlmWikiService:
rule_candidate_count = 0 rule_candidate_count = 0
generated_rule_asset_ids: list[str] = [] generated_rule_asset_ids: list[str] = []
changed_document_count = 0 changed_document_count = 0
skipped_document_count = 0
sync_summaries: list[str] = [] sync_summaries: list[str] = []
failed_document_ids: list[str] = []
total_documents = len(documents)
for entry in documents: self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": 0,
"failed_documents": 0,
"skipped_documents": 0,
"percent": 0,
},
},
f"Hermes 已开始归纳,待处理文档 {total_documents} 个。",
)
for index_value, entry in enumerate(documents, start=1):
document_id = str(entry.get("id") or "").strip() document_id = str(entry.get("id") or "").strip()
if not document_id: if not document_id:
continue continue
@@ -451,15 +473,85 @@ class LlmWikiService:
existing = existing_by_id.get(document_id) existing = existing_by_id.get(document_id)
sync_reason = self._resolve_sync_reason(entry=entry, existing=existing, force=force) sync_reason = self._resolve_sync_reason(entry=entry, existing=existing, force=force)
if sync_reason == "unchanged_skipped": if sync_reason == "unchanged_skipped":
skipped_document_count += 1
sync_summaries.append(f"{entry['original_name']}:未变化,跳过。") sync_summaries.append(f"{entry['original_name']}:未变化,跳过。")
self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": changed_document_count,
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"current_document_index": index_value,
"current_document_id": document_id,
"current_document_name": entry["original_name"],
"current_stage": "skipped",
"percent": self._calculate_progress_percent(
completed_documents=changed_document_count,
skipped_documents=skipped_document_count,
total_documents=total_documents,
),
},
},
f"{entry['original_name']}》未变化,跳过本次归纳。",
)
continue continue
self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": changed_document_count,
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"current_document_index": index_value,
"current_document_id": document_id,
"current_document_name": entry["original_name"],
"current_stage": "document_started",
"percent": self._calculate_progress_percent(
completed_documents=changed_document_count,
skipped_documents=skipped_document_count,
total_documents=total_documents,
),
},
},
f"Hermes 正在归纳《{entry['original_name']}》。",
)
changed_document_count += 1 changed_document_count += 1
document_payload = self._sync_single_document( document_payload = self._sync_single_document(
entry=entry, entry=entry,
folder=folder, folder=folder,
current_user=current_user, current_user=current_user,
sync_reason=sync_reason, sync_reason=sync_reason,
progress_callback=lambda payload, summary, *, document_id=document_id, document_name=entry["original_name"], document_index=index_value: self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": max(changed_document_count - 1, 0),
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"current_document_index": document_index,
"current_document_id": document_id,
"current_document_name": document_name,
**payload,
"percent": self._calculate_progress_percent(
completed_documents=max(changed_document_count - 1, 0),
skipped_documents=skipped_document_count,
total_documents=total_documents,
group_count=int(payload.get("group_count") or 0),
current_group_index=int(payload.get("current_group_index") or 0),
),
},
},
summary,
),
) )
existing_by_id[document_id] = document_payload["document"] existing_by_id[document_id] = document_payload["document"]
knowledge_candidate_count += len(document_payload["knowledge_candidates"]) knowledge_candidate_count += len(document_payload["knowledge_candidates"])
@@ -471,13 +563,47 @@ class LlmWikiService:
if str(item.get("generated_asset_id") or "").strip() if str(item.get("generated_asset_id") or "").strip()
] ]
) )
if document_payload["document"].get("quality_status") in {"fallback_only", "runtime_only", "failed"}:
failed_document_ids.append(document_id)
sync_summaries.append( sync_summaries.append(
f"{entry['original_name']}{sync_reason},知识候选 {len(document_payload['knowledge_candidates'])} 条," f"{entry['original_name']}{sync_reason},知识候选 {len(document_payload['knowledge_candidates'])} 条,"
f"规则候选 {len(document_payload['rule_candidates'])}" f"规则候选 {len(document_payload['rule_candidates'])}"
f"归纳质量 {document_payload['document'].get('quality_status') or 'formal'}"
)
self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": changed_document_count,
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"current_document_index": index_value,
"current_document_id": document_id,
"current_document_name": entry["original_name"],
"current_stage": "document_completed",
"knowledge_candidate_count": len(document_payload["knowledge_candidates"]),
"rule_candidate_count": len(document_payload["rule_candidates"]),
"quality_status": document_payload["document"].get("quality_status") or "formal",
"percent": self._calculate_progress_percent(
completed_documents=changed_document_count,
skipped_documents=skipped_document_count,
total_documents=total_documents,
),
},
},
f"{entry['original_name']}》归纳完成,质量状态为 {document_payload['document'].get('quality_status') or 'formal'}",
) )
index["documents"] = list(existing_by_id.values()) index["documents"] = list(existing_by_id.values())
self._write_json_file(self.knowledge_service.llm_wiki_index_path, index) self._write_json_file(self.knowledge_service.llm_wiki_index_path, index)
if failed_document_ids:
self.knowledge_service.set_document_ingest_statuses(
failed_document_ids,
status_code=KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
)
sync_runs.setdefault("runs", []) sync_runs.setdefault("runs", [])
sync_runs["runs"].append( sync_runs["runs"].append(
@@ -502,6 +628,22 @@ class LlmWikiService:
generated_rule_ids = list(dict.fromkeys(generated_rule_asset_ids)) generated_rule_ids = list(dict.fromkeys(generated_rule_asset_ids))
summary = "".join(sync_summaries) if sync_summaries else "未发现需要同步的知识文档。" summary = "".join(sync_summaries) if sync_summaries else "未发现需要同步的知识文档。"
self._emit_progress(
progress_callback,
{
"phase": "running",
"progress": {
"total_documents": total_documents,
"completed_documents": changed_document_count,
"failed_documents": len(failed_document_ids),
"skipped_documents": skipped_document_count,
"knowledge_candidate_count": knowledge_candidate_count,
"rule_candidate_count": rule_candidate_count,
"percent": 100,
},
},
summary,
)
return LlmWikiSyncRead( return LlmWikiSyncRead(
ok=True, ok=True,
run_id=run_id, run_id=run_id,
@@ -518,6 +660,7 @@ class LlmWikiService:
self.knowledge_service.set_document_ingest_statuses( self.knowledge_service.set_document_ingest_statuses(
target_document_ids, target_document_ids,
status_code=KNOWLEDGE_INGEST_STATUS_FAILED, status_code=KNOWLEDGE_INGEST_STATUS_FAILED,
agent_run_id=agent_run_id,
) )
raise raise
@@ -528,6 +671,7 @@ class LlmWikiService:
folder: str, folder: str,
current_user: CurrentUserContext, current_user: CurrentUserContext,
sync_reason: str, sync_reason: str,
progress_callback: Callable[[dict[str, Any], str], None] | None = None,
) -> dict[str, Any]: ) -> dict[str, Any]:
document_id = str(entry["id"]) document_id = str(entry["id"])
document_name = str(entry["original_name"]) document_name = str(entry["original_name"])
@@ -539,10 +683,19 @@ class LlmWikiService:
text_path.write_text(extracted_text, encoding="utf-8") text_path.write_text(extracted_text, encoding="utf-8")
chunks = self._build_chunks(document_id=document_id, text=extracted_text) chunks = self._build_chunks(document_id=document_id, text=extracted_text)
knowledge_candidates, rule_candidates = self._extract_candidates( self._emit_progress(
progress_callback,
{
"current_stage": "text_extracted",
"chunk_count": len(chunks),
},
f"{document_name}》文本提取完成,共形成 {len(chunks)} 个分块。",
)
knowledge_candidates, rule_candidates, extraction_stats = self._extract_candidates(
entry=entry, entry=entry,
chunks=chunks, chunks=chunks,
current_user=current_user, current_user=current_user,
progress_callback=progress_callback,
) )
generated_candidates: list[dict[str, Any]] = [] generated_candidates: list[dict[str, Any]] = []
@@ -563,8 +716,17 @@ class LlmWikiService:
"checksum": str(entry.get("sha256") or ""), "checksum": str(entry.get("sha256") or ""),
"extracted_text_path": str(text_path), "extracted_text_path": str(text_path),
"chunk_count": len(chunks), "chunk_count": len(chunks),
"candidate_chunk_count": extraction_stats.candidate_chunk_count,
"filtered_chunk_count": extraction_stats.filtered_chunk_count,
"group_count": extraction_stats.group_count,
"successful_group_count": extraction_stats.successful_group_count,
"failed_group_count": extraction_stats.failed_group_count,
"knowledge_candidate_count": len(knowledge_candidates), "knowledge_candidate_count": len(knowledge_candidates),
"formal_knowledge_candidate_count": extraction_stats.formal_knowledge_candidate_count,
"fallback_knowledge_candidate_count": extraction_stats.fallback_knowledge_candidate_count,
"rule_candidate_count": len(generated_candidates), "rule_candidate_count": len(generated_candidates),
"quality_status": extraction_stats.quality_status,
"quality_note": extraction_stats.quality_note,
"updated_at": datetime.now(UTC).isoformat(), "updated_at": datetime.now(UTC).isoformat(),
"signature": self._build_document_signature(entry), "signature": self._build_document_signature(entry),
"sync_reason": sync_reason, "sync_reason": sync_reason,
@@ -593,49 +755,147 @@ class LlmWikiService:
entry: dict[str, Any], entry: dict[str, Any],
chunks: list[dict[str, Any]], chunks: list[dict[str, Any]],
current_user: CurrentUserContext, current_user: CurrentUserContext,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: progress_callback: Callable[[dict[str, Any], str], None] | None = None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]], CandidateExtractionStats]:
stats = CandidateExtractionStats(raw_chunk_count=len(chunks))
if not chunks: if not chunks:
return [], [] stats.quality_status = "failed"
stats.quality_note = "文档未提取到可用分块,无法形成 LLM Wiki。"
return [], [], stats
candidate_chunks = self._select_candidate_chunks(chunks)
stats.candidate_chunk_count = len(candidate_chunks)
stats.filtered_chunk_count = max(0, len(chunks) - len(candidate_chunks))
if not candidate_chunks:
stats.quality_status = "failed"
stats.quality_note = "正文条款分块为空,当前仅识别到封面、目录或低信息量内容,未形成正式归纳。"
return [], [], stats
projected_group_count = len(self._group_chunks(candidate_chunks, size=HERMES_CANDIDATE_GROUP_SIZE))
self._emit_progress(
progress_callback,
{
"current_stage": "candidate_chunks_selected",
"candidate_chunk_count": stats.candidate_chunk_count,
"filtered_chunk_count": stats.filtered_chunk_count,
"group_count": projected_group_count,
"current_group_index": 0,
"successful_group_count": 0,
"failed_group_count": 0,
},
f"{entry['original_name']}》已筛出 {stats.candidate_chunk_count} 个有效正文分块,准备分 {projected_group_count} 组归纳。",
)
knowledge_candidates: list[dict[str, Any]] = [] knowledge_candidates: list[dict[str, Any]] = []
rule_candidates: list[dict[str, Any]] = [] rule_candidates: list[dict[str, Any]] = []
seen_knowledge_keys: set[str] = set() seen_knowledge_keys: set[str] = set()
seen_rule_keys: set[str] = set() seen_rule_keys: set[str] = set()
for chunk_group in self._group_chunks(chunks, size=4): for chunk_group in self._group_chunks(candidate_chunks, size=HERMES_CANDIDATE_GROUP_SIZE):
payload = self._call_candidate_model(entry=entry, chunk_group=chunk_group) stats.group_count += 1
attempt = self._call_candidate_model(entry=entry, chunk_group=chunk_group)
if isinstance(attempt, dict):
attempt = CandidateModelAttempt(payload=attempt, source="hermes", ok=True)
if not attempt.ok:
stats.failed_group_count += 1
self._emit_progress(
progress_callback,
{
"current_stage": "extracting_candidates",
"group_count": projected_group_count,
"current_group_index": stats.group_count,
"successful_group_count": stats.successful_group_count,
"failed_group_count": stats.failed_group_count,
},
f"{entry['original_name']}》第 {stats.group_count}/{projected_group_count} 组归纳失败,继续处理下一组。",
)
continue
stats.successful_group_count += 1
batch_knowledge = self._normalize_knowledge_candidates( batch_knowledge = self._normalize_knowledge_candidates(
raw_items=list(payload.get("knowledge_candidates") or []), raw_items=list(attempt.payload.get("knowledge_candidates") or []),
entry=entry, entry=entry,
chunk_group=chunk_group, chunk_group=chunk_group,
seen_keys=seen_knowledge_keys, seen_keys=seen_knowledge_keys,
extraction_mode=attempt.source,
) )
batch_rules = self._normalize_rule_candidates( batch_rules: list[dict[str, Any]] = []
raw_items=list(payload.get("rule_candidates") or []), if attempt.source == "hermes":
entry=entry, batch_rules = self._normalize_rule_candidates(
chunk_group=chunk_group, raw_items=list(attempt.payload.get("rule_candidates") or []),
current_user=current_user, entry=entry,
seen_keys=seen_rule_keys, chunk_group=chunk_group,
) current_user=current_user,
seen_keys=seen_rule_keys,
)
knowledge_candidates.extend(batch_knowledge) knowledge_candidates.extend(batch_knowledge)
rule_candidates.extend(batch_rules) rule_candidates.extend(batch_rules)
self._emit_progress(
progress_callback,
{
"current_stage": "extracting_candidates",
"group_count": projected_group_count,
"current_group_index": stats.group_count,
"successful_group_count": stats.successful_group_count,
"failed_group_count": stats.failed_group_count,
"knowledge_candidate_count": len(knowledge_candidates),
"rule_candidate_count": len(rule_candidates),
},
f"{entry['original_name']}》已完成第 {stats.group_count}/{projected_group_count} 组归纳。",
)
if not knowledge_candidates: formal_knowledge_candidate_count = sum(
fallback = self._build_fallback_knowledge_candidate(entry=entry, chunks=chunks) 1 for item in knowledge_candidates if str(item.get("extraction_mode") or "hermes") == "hermes"
)
if formal_knowledge_candidate_count <= 0:
fallback = self._build_fallback_knowledge_candidate(
entry=entry,
chunks=candidate_chunks,
reason=(
"Hermes 未能从正文条款中形成正式知识候选。当前结果仅为降级兜底预览,不能视为正式归纳。"
),
)
if fallback is not None: if fallback is not None:
knowledge_candidates.append(fallback) knowledge_candidates.append(fallback)
return knowledge_candidates[:12], rule_candidates[:12] truncated_knowledge_candidates = knowledge_candidates[:12]
truncated_rule_candidates = rule_candidates[:12]
stats.formal_knowledge_candidate_count = sum(
1 for item in truncated_knowledge_candidates if str(item.get("extraction_mode") or "hermes") == "hermes"
)
stats.fallback_knowledge_candidate_count = max(
0,
len(truncated_knowledge_candidates) - stats.formal_knowledge_candidate_count,
)
stats.quality_status, stats.quality_note = self._resolve_quality_status(
stats=stats,
knowledge_candidates=truncated_knowledge_candidates,
)
self._emit_progress(
progress_callback,
{
"current_stage": "candidate_extraction_completed",
"group_count": projected_group_count,
"current_group_index": projected_group_count,
"successful_group_count": stats.successful_group_count,
"failed_group_count": stats.failed_group_count,
"knowledge_candidate_count": len(truncated_knowledge_candidates),
"formal_knowledge_candidate_count": stats.formal_knowledge_candidate_count,
"fallback_knowledge_candidate_count": stats.fallback_knowledge_candidate_count,
"rule_candidate_count": len(truncated_rule_candidates),
"quality_status": stats.quality_status,
},
f"{entry['original_name']}》候选提炼完成,质量状态为 {stats.quality_status}",
)
return truncated_knowledge_candidates, truncated_rule_candidates, stats
def _call_candidate_model( def _call_candidate_model(
self, self,
*, *,
entry: dict[str, Any], entry: dict[str, Any],
chunk_group: list[dict[str, Any]], chunk_group: list[dict[str, Any]],
) -> dict[str, Any]: ) -> CandidateModelAttempt:
if self._candidate_model_disabled:
return {}
facts = { facts = {
"document_id": entry["id"], "document_id": entry["id"],
"document_name": entry["original_name"], "document_name": entry["original_name"],
@@ -653,7 +913,7 @@ class LlmWikiService:
{ {
"chunk_id": item["chunk_id"], "chunk_id": item["chunk_id"],
"title": item["title"], "title": item["title"],
"content": item["content"][:900], "content": item["content"][:HERMES_CANDIDATE_CONTENT_LIMIT],
"source_page": item.get("source_page"), "source_page": item.get("source_page"),
"tags": item.get("tags", []), "tags": item.get("tags", []),
} }
@@ -663,6 +923,8 @@ class LlmWikiService:
system_prompt = ( system_prompt = (
"你是企业财务制度知识库的 Hermes 规则形成器。" "你是企业财务制度知识库的 Hermes 规则形成器。"
"你只能基于提供的制度条款生成结构化知识候选和规则候选,不能自由发散。" "你只能基于提供的制度条款生成结构化知识候选和规则候选,不能自由发散。"
"封面、目录、通知、页眉页脚、密级说明、印发信息不属于知识候选,必须忽略。"
"只提炼具有执行意义、审核意义、报销约束意义的条款。"
"规则候选必须从允许模板中选 template_key严禁自创模板。" "规则候选必须从允许模板中选 template_key严禁自创模板。"
"runtime_rule 必须严格遵守 runtime_rule_contracts 中对应模板的字段结构和允许值。" "runtime_rule 必须严格遵守 runtime_rule_contracts 中对应模板的字段结构和允许值。"
"如果条款不适合自动规则化,可以只返回 knowledge_candidates。" "如果条款不适合自动规则化,可以只返回 knowledge_candidates。"
@@ -675,6 +937,7 @@ class LlmWikiService:
) )
user_prompt = ( user_prompt = (
"请根据以下制度分块生成候选。" "请根据以下制度分块生成候选。"
"每组最多提炼 3 条高价值 knowledge_candidates优先保留可直接供报销审核、附件校验、审批判断使用的知识。"
"只返回 JSON 对象,不要输出解释,不要调用工具,不要追加任何其他文本。\n" "只返回 JSON 对象,不要输出解释,不要调用工具,不要追加任何其他文本。\n"
f"{json.dumps(facts, ensure_ascii=False, indent=2)}" f"{json.dumps(facts, ensure_ascii=False, indent=2)}"
) )
@@ -693,29 +956,44 @@ class LlmWikiService:
timeout_seconds=HERMES_CANDIDATE_MODEL_TIMEOUT_SECONDS, timeout_seconds=HERMES_CANDIDATE_MODEL_TIMEOUT_SECONDS,
) )
payload = self._extract_json_payload(cli_result.response_text) payload = self._extract_json_payload(cli_result.response_text)
if payload: if payload is not None:
return payload return CandidateModelAttempt(payload=payload, source="hermes", ok=True)
self._candidate_model_disabled = True
logger.warning( logger.warning(
"System Hermes returned no parseable JSON for LLM Wiki doc=%s; using fallback candidates.", "System Hermes returned no parseable JSON for LLM Wiki doc=%s chunk_group=%s.",
entry.get("id"), entry.get("id"),
",".join(item.get("chunk_id", "") for item in chunk_group),
)
return CandidateModelAttempt(
payload={},
source="hermes",
ok=False,
failure_reason="system_hermes_no_json",
) )
return {}
except TimeoutExpired: except TimeoutExpired:
self._candidate_model_disabled = True
logger.warning( logger.warning(
"System Hermes timed out during LLM Wiki candidate extraction doc=%s; using fallback candidates.", "System Hermes timed out during LLM Wiki candidate extraction doc=%s chunk_group=%s.",
entry.get("id"), entry.get("id"),
",".join(item.get("chunk_id", "") for item in chunk_group),
)
return CandidateModelAttempt(
payload={},
source="hermes",
ok=False,
failure_reason="system_hermes_timeout",
) )
return {}
except Exception as exc: except Exception as exc:
self._candidate_model_disabled = True
logger.warning( logger.warning(
"System Hermes failed during LLM Wiki candidate extraction doc=%s: %s", "System Hermes failed during LLM Wiki candidate extraction doc=%s chunk_group=%s: %s",
entry.get("id"), entry.get("id"),
",".join(item.get("chunk_id", "") for item in chunk_group),
exc, exc,
) )
return {} return CandidateModelAttempt(
payload={},
source="hermes",
ok=False,
failure_reason=str(exc) or "system_hermes_failed",
)
response_text = self.runtime_chat_service.complete( response_text = self.runtime_chat_service.complete(
[ [
@@ -727,10 +1005,129 @@ class LlmWikiService:
temperature=0.0, temperature=0.0,
) )
payload = self._extract_json_payload(response_text) payload = self._extract_json_payload(response_text)
if not payload: if payload is None:
self._candidate_model_disabled = True return CandidateModelAttempt(
return {} payload={},
return payload source="runtime",
ok=False,
failure_reason="runtime_no_json",
)
return CandidateModelAttempt(
payload=payload,
source="runtime",
ok=True,
failure_reason="system_hermes_unavailable",
)
def _select_candidate_chunks(self, chunks: list[dict[str, Any]]) -> list[dict[str, Any]]:
selected: list[dict[str, Any]] = []
for chunk in chunks:
if self._is_low_signal_chunk(chunk):
continue
selected.append(chunk)
return selected
def _is_low_signal_chunk(self, chunk: dict[str, Any]) -> bool:
title = str(chunk.get("title") or "").strip()
content = str(chunk.get("content") or "").strip()
page = int(chunk.get("source_page") or 0)
if not content:
return True
if self._looks_like_table_of_contents(title=title, content=content):
return True
if self._looks_like_cover_notice(title=title, content=content, source_page=page):
return True
compact_content = re.sub(r"\s+", "", content)
if len(compact_content) < 24 and not self._has_policy_substance(content):
return True
if title.startswith("附件") and len(compact_content) < 40:
return True
return False
def _is_low_signal_candidate(self, *, title: str, content: str) -> bool:
compact_content = re.sub(r"\s+", "", content)
if len(compact_content) < 24 and not self._has_policy_substance(content):
return True
if self._looks_like_table_of_contents(title=title, content=content):
return True
if self._looks_like_cover_notice(title=title, content=content, source_page=0):
return True
return False
@staticmethod
def _has_policy_substance(text: str) -> bool:
sample = str(text or "")
return any(keyword in sample for keyword in POLICY_SUBSTANCE_KEYWORDS)
@staticmethod
def _looks_like_table_of_contents(*, title: str, content: str) -> bool:
title_text = str(title or "").strip()
content_text = str(content or "").strip()
if title_text == "目录" or content_text == "目录":
return True
lines = [line.strip() for line in content_text.splitlines() if line.strip()]
if lines and sum(1 for line in lines if LOW_SIGNAL_DOTTED_LINE_PATTERN.search(line)) >= max(2, len(lines) // 2):
return True
if LOW_SIGNAL_DOTTED_LINE_PATTERN.search(content_text) and "" in title_text:
return True
return False
def _looks_like_cover_notice(self, *, title: str, content: str, source_page: int) -> bool:
text = f"{title}\n{content}"
if PAGE_FOOTER_PATTERN.fullmatch(str(content or "").strip()):
return True
cover_keywords = ("关于颁布", "特此通知", "印发", "商密", "制度〔", "有限公司文件", "通知")
if source_page == 1 and any(keyword in text for keyword in cover_keywords):
return True
if source_page > 2:
return False
if any(keyword in text for keyword in cover_keywords):
if "必须" not in text and "应当" not in text and "不得" not in text:
return True
return False
def _resolve_quality_status(
self,
*,
stats: CandidateExtractionStats,
knowledge_candidates: list[dict[str, Any]],
) -> tuple[str, str]:
if stats.formal_knowledge_candidate_count <= 0:
runtime_count = sum(
1 for item in knowledge_candidates if str(item.get("extraction_mode") or "") == "runtime"
)
if runtime_count > 0:
return (
"runtime_only",
"当前知识候选来自运行时模型而非系统 Hermes仅供人工参考不计入正式归纳。",
)
if stats.fallback_knowledge_candidate_count > 0:
return (
"fallback_only",
"Hermes 未形成正式知识候选,当前仅保留降级兜底预览,不能作为正式知识上线。",
)
return (
"failed",
"Hermes 未能从当前文档提炼出可用知识候选,请调整文档内容或重新归纳。",
)
if stats.failed_group_count > 0:
return (
"partial_degraded",
f"Hermes 成功处理 {stats.successful_group_count}/{stats.group_count} 个分组,"
f"仍有 {stats.failed_group_count} 个分组未成功,请人工复核后再使用。",
)
if stats.filtered_chunk_count > 0:
return (
"formal",
f"已自动过滤 {stats.filtered_chunk_count} 个封面、目录或低信息量分块,当前结果来自正文条款。",
)
return ("formal", "Hermes 已基于正文条款完成正式归纳。")
def _normalize_knowledge_candidates( def _normalize_knowledge_candidates(
self, self,
@@ -739,6 +1136,7 @@ class LlmWikiService:
entry: dict[str, Any], entry: dict[str, Any],
chunk_group: list[dict[str, Any]], chunk_group: list[dict[str, Any]],
seen_keys: set[str], seen_keys: set[str],
extraction_mode: str,
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
normalized: list[dict[str, Any]] = [] normalized: list[dict[str, Any]] = []
default_chunk_ids = [item["chunk_id"] for item in chunk_group] default_chunk_ids = [item["chunk_id"] for item in chunk_group]
@@ -750,10 +1148,17 @@ class LlmWikiService:
content = str(item.get("content") or "").strip() content = str(item.get("content") or "").strip()
if not title or not content: if not title or not content:
continue continue
if self._is_low_signal_candidate(title=title, content=content):
continue
candidate_key = f"{title.casefold()}::{content[:80].casefold()}" candidate_key = f"{title.casefold()}::{content[:80].casefold()}"
if candidate_key in seen_keys: if candidate_key in seen_keys:
continue continue
seen_keys.add(candidate_key) seen_keys.add(candidate_key)
quality_flags: list[str] = []
fallback_reason = ""
if extraction_mode != "hermes":
quality_flags.append("non_hermes_source")
fallback_reason = "当前知识候选不是由系统 Hermes 正式提炼,不能视为正式归纳。"
normalized.append( normalized.append(
{ {
"candidate_id": f"kc_{uuid4().hex[:12]}", "candidate_id": f"kc_{uuid4().hex[:12]}",
@@ -775,6 +1180,9 @@ class LlmWikiService:
"status": "draft", "status": "draft",
"created_by": "hermes", "created_by": "hermes",
"created_at": datetime.now(UTC).isoformat(), "created_at": datetime.now(UTC).isoformat(),
"extraction_mode": extraction_mode,
"quality_flags": quality_flags,
"fallback_reason": fallback_reason,
} }
) )
return normalized return normalized
@@ -1218,6 +1626,7 @@ class LlmWikiService:
*, *,
entry: dict[str, Any], entry: dict[str, Any],
chunks: list[dict[str, Any]], chunks: list[dict[str, Any]],
reason: str,
) -> dict[str, Any] | None: ) -> dict[str, Any] | None:
first_chunk = next((item for item in chunks if str(item.get("content") or "").strip()), None) first_chunk = next((item for item in chunks if str(item.get("content") or "").strip()), None)
if first_chunk is None: if first_chunk is None:
@@ -1244,6 +1653,9 @@ class LlmWikiService:
"status": "draft", "status": "draft",
"created_by": "hermes", "created_by": "hermes",
"created_at": datetime.now(UTC).isoformat(), "created_at": datetime.now(UTC).isoformat(),
"extraction_mode": "fallback",
"quality_flags": ["fallback_only", "not_formal_ingest"],
"fallback_reason": reason,
} }
@staticmethod @staticmethod
@@ -1266,10 +1678,38 @@ class LlmWikiService:
f"- 来源文档:{document_name}", f"- 来源文档:{document_name}",
f"- 知识条目数:{len(knowledge_candidates)}", f"- 知识条目数:{len(knowledge_candidates)}",
"", "",
"## 核心知识",
"",
] ]
quality_status = str(entry.get("quality_status") or "formal").strip() or "formal"
quality_note = str(entry.get("quality_note") or "").strip()
if quality_status != "formal":
lines.extend(
[
"## 归纳状态",
"",
f"- 质量状态:{quality_status}",
f"- 说明:{quality_note or '当前结果不是正式 Hermes 归纳。'}",
"",
]
)
elif quality_note:
lines.extend(
[
"## 归纳状态",
"",
f"- 质量状态:{quality_status}",
f"- 说明:{quality_note}",
"",
]
)
lines.extend(
[
"## 核心知识",
"",
]
)
if not knowledge_candidates: if not knowledge_candidates:
lines.extend( lines.extend(
[ [
@@ -1624,6 +2064,10 @@ class LlmWikiService:
def _clean_line(line: str) -> str: def _clean_line(line: str) -> str:
cleaned = str(line or "").replace("\u3000", " ").strip() cleaned = str(line or "").replace("\u3000", " ").strip()
cleaned = re.sub(r"\s+", " ", cleaned) cleaned = re.sub(r"\s+", " ", cleaned)
if PAGE_FOOTER_PATTERN.fullmatch(cleaned):
return ""
if cleaned in {"商密【中】", "商密【高】", "商密【低】"}:
return ""
return cleaned return cleaned
@staticmethod @staticmethod
@@ -1701,6 +2145,11 @@ class LlmWikiService:
return "forced_rebuild" return "forced_rebuild"
if existing is None: if existing is None:
return "initial_build" return "initial_build"
existing_quality_status = str(existing.get("quality_status") or "").strip()
if existing_quality_status and existing_quality_status != "formal":
return f"quality_{existing_quality_status}_rebuild"
if int(existing.get("formal_knowledge_candidate_count") or 0) <= 0:
return "formal_candidate_missing_rebuild"
previous_signature = existing.get("signature") previous_signature = existing.get("signature")
if not isinstance(previous_signature, dict): if not isinstance(previous_signature, dict):
@@ -1723,6 +2172,35 @@ class LlmWikiService:
return "unchanged_skipped" return "unchanged_skipped"
return ",".join(reasons) return ",".join(reasons)
@staticmethod
def _emit_progress(
progress_callback: Callable[[dict[str, Any], str], None] | None,
payload: dict[str, Any],
summary: str,
) -> None:
if progress_callback is None:
return
progress_callback(payload, summary)
@staticmethod
def _calculate_progress_percent(
*,
completed_documents: int,
skipped_documents: int,
total_documents: int,
group_count: int = 0,
current_group_index: int = 0,
) -> int:
if total_documents <= 0:
return 100
completed_units = completed_documents + skipped_documents
if group_count > 0 and current_group_index > 0:
completed_units += min(current_group_index, group_count) / group_count
percent = round((completed_units / total_documents) * 100)
return max(0, min(percent, 100))
@staticmethod @staticmethod
def _build_document_signature(entry: dict[str, Any]) -> dict[str, Any]: def _build_document_signature(entry: dict[str, Any]) -> dict[str, Any]:
return { return {

View File

@@ -16,7 +16,7 @@ from app.core.agent_enums import AgentReviewStatus, AgentRunSource, AgentRunStat
from app.db.base import Base from app.db.base import Base
from app.main import create_app from app.main import create_app
from app.schemas.agent_asset import AgentAssetReviewCreate from app.schemas.agent_asset import AgentAssetReviewCreate
from app.schemas.knowledge import LlmWikiSummaryUpdateWrite, LlmWikiSyncRead from app.schemas.knowledge import LlmWikiSummaryUpdateWrite
from app.services.agent_assets import AgentAssetService from app.services.agent_assets import AgentAssetService
from app.services.agent_runs import AgentRunService from app.services.agent_runs import AgentRunService
from app.services.knowledge import ( from app.services.knowledge import (
@@ -25,7 +25,7 @@ from app.services.knowledge import (
KNOWLEDGE_INGEST_STATUS_PUBLISHED, KNOWLEDGE_INGEST_STATUS_PUBLISHED,
KnowledgeService, KnowledgeService,
) )
from app.services.llm_wiki import LlmWikiService from app.services.llm_wiki import CandidateModelAttempt, LlmWikiService
def build_session() -> Session: def build_session() -> Session:
@@ -86,6 +86,36 @@ def upload_policy_document(storage_root: Path, *, filename: str = "公司差旅
return document.id return document.id
def upload_multipage_policy_document(storage_root: Path, *, filename: str = "公司支出管理办法.txt") -> str:
service = KnowledgeService(storage_root=storage_root)
service.ensure_library_ready()
document = service.upload_document(
folder="报销制度",
filename=filename,
content=(
"商密【中】\n"
"关于颁布《公司支出管理办法》的通知\n"
"特此通知。\n"
"\f"
"目录\n"
"第一章 总则................................4\n"
"第二章 报销审批................................7\n"
"\f"
"第一条 报销申请\n"
"员工提交报销申请时,应附发票、行程单和审批说明。\n"
"第二条 报销审批\n"
"住宿费超过制度标准时,必须升级至总经理审批。\n"
"第三条 附件补充\n"
"缺少附件时不得提交报销。\n"
"\f"
"第四条 财务复核\n"
"财务复核时应校验预算、发票真伪和审批链完整性。\n"
).encode("utf-8"),
current_user=build_admin_user(),
)
return document.id
def build_candidate_payload(chunk_id: str, *, summary: str = "住宿费超过标准时必须升级审批。") -> dict[str, object]: def build_candidate_payload(chunk_id: str, *, summary: str = "住宿费超过标准时必须升级审批。") -> dict[str, object]:
return { return {
"knowledge_candidates": [ "knowledge_candidates": [
@@ -222,10 +252,14 @@ def test_llm_wiki_sync_creates_artifacts_and_draft_rule(tmp_path, monkeypatch) -
document_payload = json.loads((document_dir / "document.json").read_text(encoding="utf-8")) document_payload = json.loads((document_dir / "document.json").read_text(encoding="utf-8"))
assert document_payload["sync_reason"] == "initial_build" assert document_payload["sync_reason"] == "initial_build"
assert document_payload["quality_status"] == "formal"
assert document_payload["formal_knowledge_candidate_count"] == 1
assert document_payload["fallback_knowledge_candidate_count"] == 0
detail = service.get_document_detail(document_id) detail = service.get_document_detail(document_id)
assert "公司差旅报销制度.txt 知识总结" in detail.knowledge_summary_markdown assert "公司差旅报销制度.txt 知识总结" in detail.knowledge_summary_markdown
assert "住宿费升级审批要求" in detail.knowledge_summary_markdown assert "住宿费升级审批要求" in detail.knowledge_summary_markdown
assert detail.quality_status == "formal"
asset = AgentAssetService(db).get_asset(result.generated_rule_asset_ids[0]) asset = AgentAssetService(db).get_asset(result.generated_rule_asset_ids[0])
assert asset is not None assert asset is not None
@@ -386,9 +420,91 @@ def test_llm_wiki_sync_uses_fallback_candidates_when_system_hermes_times_out(
assert result.knowledge_candidate_count >= 1 assert result.knowledge_candidate_count >= 1
assert runtime_called["count"] == 0 assert runtime_called["count"] == 0
detail = KnowledgeService(storage_root=tmp_path).get_document_detail(document_id) knowledge_service = KnowledgeService(storage_root=tmp_path)
assert detail.stateCode == KNOWLEDGE_INGEST_STATUS_INGESTED detail = knowledge_service.get_document_detail(document_id)
assert detail.state == "已归纳" assert detail.stateCode == KNOWLEDGE_INGEST_STATUS_FAILED
assert detail.state == "归纳失败"
assert detail.llmWikiAvailable is True
assert detail.llmWikiQualityStatus == "fallback_only"
document_payload = json.loads(
(
tmp_path
/ "knowledge"
/ ".llm_wiki"
/ "documents"
/ document_id
/ "document.json"
).read_text(encoding="utf-8")
)
assert document_payload["quality_status"] == "fallback_only"
assert document_payload["formal_knowledge_candidate_count"] == 0
assert document_payload["fallback_knowledge_candidate_count"] == 1
candidates_payload = json.loads(
(
tmp_path
/ "knowledge"
/ ".llm_wiki"
/ "documents"
/ document_id
/ "knowledge_candidates.json"
).read_text(encoding="utf-8")
)
assert candidates_payload[0]["extraction_mode"] == "fallback"
assert "fallback_only" in candidates_payload[0]["quality_flags"]
def test_llm_wiki_sync_continues_after_single_group_failure(tmp_path, monkeypatch) -> None:
document_id = upload_multipage_policy_document(tmp_path, filename="多页支出制度.txt")
call_count = {"count": 0}
def fake_call_candidate_model(self, *, entry, chunk_group):
call_count["count"] += 1
if call_count["count"] == 1:
return CandidateModelAttempt(
payload={},
source="hermes",
ok=False,
failure_reason="simulated_timeout",
)
return build_candidate_payload(chunk_group[0]["chunk_id"])
monkeypatch.setattr(LlmWikiService, "_call_candidate_model", fake_call_candidate_model)
with build_session() as db:
service = LlmWikiService(db, storage_root=tmp_path)
result = service.sync_folder(
folder="报销制度",
current_user=build_admin_user(),
document_ids=[document_id],
)
detail = service.get_document_detail(document_id)
assert result.document_count == 1
assert call_count["count"] >= 2
assert detail.quality_status == "partial_degraded"
assert detail.successful_group_count >= 1
assert detail.failed_group_count >= 1
assert detail.formal_knowledge_candidate_count >= 1
knowledge_detail = KnowledgeService(storage_root=tmp_path).get_document_detail(document_id)
assert knowledge_detail.stateCode == KNOWLEDGE_INGEST_STATUS_INGESTED
assert knowledge_detail.llmWikiQualityStatus == "partial_degraded"
def test_llm_wiki_filters_cover_and_catalog_chunks_before_candidate_extraction(tmp_path) -> None:
document_id = upload_multipage_policy_document(tmp_path, filename="封面目录过滤测试.txt")
with build_session() as db:
service = LlmWikiService(db, storage_root=tmp_path)
text = service.knowledge_service.extract_document_text(document_id)
chunks = service._build_chunks(document_id=document_id, text=text)
candidate_chunks = service._select_candidate_chunks(chunks)
assert len(chunks) > len(candidate_chunks)
assert candidate_chunks
assert all(int(item.get("source_page") or 0) >= 3 for item in candidate_chunks)
def test_llm_wiki_sync_skips_unchanged_and_rebuilds_on_updated_at_change(tmp_path, monkeypatch) -> None: def test_llm_wiki_sync_skips_unchanged_and_rebuilds_on_updated_at_change(tmp_path, monkeypatch) -> None:
@@ -475,22 +591,46 @@ def test_llm_wiki_sync_does_not_overwrite_active_rule(tmp_path, monkeypatch) ->
def test_llm_wiki_sync_endpoint_records_agent_run(monkeypatch) -> None: def test_llm_wiki_sync_endpoint_records_agent_run(monkeypatch) -> None:
def fake_sync_folder(self, *, folder="报销制度", current_user, document_ids=None, force=False):
return LlmWikiSyncRead(
ok=True,
run_id="wiki_test_sync",
folder=folder,
document_count=1,
knowledge_candidate_count=2,
rule_candidate_count=1,
generated_rule_count=1,
generated_rule_asset_ids=["asset-rule-1"],
summary="已完成 Hermes LLM Wiki 同步。",
)
monkeypatch.setattr(LlmWikiService, "sync_folder", fake_sync_folder)
client, session_factory = build_client() client, session_factory = build_client()
def fake_submit_sync(*, agent_run_id, folder, current_user, document_ids=None, force=False):
with session_factory() as db:
service = AgentRunService(db)
service.record_tool_call(
run_id=agent_run_id,
tool_type="llm",
tool_name="system_hermes_llm_wiki_sync",
request_json={
"folder": folder,
"document_ids": list(document_ids or []),
"force": force,
},
response_json={"run_id": "wiki_test_sync"},
status="succeeded",
duration_ms=0,
)
service.merge_route_json(
agent_run_id,
{
"phase": "succeeded",
"sync_run_id": "wiki_test_sync",
"progress": {
"total_documents": len(document_ids or []),
"completed_documents": len(document_ids or []),
"failed_documents": 0,
"skipped_documents": 0,
"percent": 100,
},
},
status=AgentRunStatus.SUCCEEDED.value,
result_summary="已完成 Hermes LLM Wiki 同步。",
)
monkeypatch.setattr(
"app.services.llm_wiki_tasks.llm_wiki_task_manager.submit_sync",
fake_submit_sync,
)
with session_factory() as db: with session_factory() as db:
before_count = len(AgentRunService(db).list_runs(limit=100)) before_count = len(AgentRunService(db).list_runs(limit=100))
@@ -506,8 +646,8 @@ def test_llm_wiki_sync_endpoint_records_agent_run(monkeypatch) -> None:
assert response.status_code == 200 assert response.status_code == 200
payload = response.json() payload = response.json()
assert payload["run_id"] == "wiki_test_sync" assert payload["agent_run_id"].startswith("run_")
assert payload["generated_rule_count"] == 1 assert payload["status"] == AgentRunStatus.RUNNING.value
with session_factory() as db: with session_factory() as db:
service = AgentRunService(db) service = AgentRunService(db)
@@ -521,4 +661,4 @@ def test_llm_wiki_sync_endpoint_records_agent_run(monkeypatch) -> None:
assert latest_run.tool_calls assert latest_run.tool_calls
assert latest_run.tool_calls[0].tool_name == "system_hermes_llm_wiki_sync" assert latest_run.tool_calls[0].tool_name == "system_hermes_llm_wiki_sync"
assert latest_run.tool_calls[0].status == "succeeded" assert latest_run.tool_calls[0].status == "succeeded"
assert latest_run.tool_calls[0].response_json["run_id"] == "wiki_test_sync" assert latest_run.route_json["sync_run_id"] == "wiki_test_sync"