"""Tests for the knowledge RAG service and the knowledge ingest log helpers."""

from __future__ import annotations

import json
import threading

from app.services import knowledge_rag as knowledge_rag_module
from app.services.knowledge_ingest_log import (
    build_document_graph_summary,
    build_ingest_document_summary,
    build_ingest_status_summary,
    enrich_knowledge_ingest_route_json,
)
from app.services.knowledge_rag import KnowledgeRagService
from app.services.knowledge_rag_local import query_local_text_chunks


def _write_json(path, payload) -> None:
    """Serialize *payload* with ``json.dumps`` and write it to *path* as UTF-8."""
    path.write_text(json.dumps(payload), encoding="utf-8")


def _candidate_ids(hits) -> list:
    """Return the ``candidate_id`` of every hit, preserving ranking order."""
    return [item["candidate_id"] for item in hits]


def test_build_hits_prioritizes_structured_table_evidence_for_standard_queries() -> None:
    """The chunk carrying a structured table must rank above the plain-text chunk."""
    hits = KnowledgeRagService._build_hits_from_query_data(
        query="住宿费标准是多少?",
        chunks=[
            {
                "chunk_id": "plain-1",
                "file_path": "/tmp/doc-1__差旅制度.md",
                "content": "住宿费说明文字,提到了出差和报销要求,但没有清晰表格。",
            },
            {
                "chunk_id": "table-1",
                "file_path": "/tmp/doc-1__差旅制度.md",
                "content": "# 结构化表格补充\n\n| 城市 | 住宿费标准 |\n| 北京 | 500 |",
            },
        ],
        entities=[],
        limit=2,
    )

    assert _candidate_ids(hits) == ["table-1", "plain-1"]


def test_build_hits_boosts_query_term_matches() -> None:
    """The chunk containing the query terms must rank above the unrelated chunk."""
    hits = KnowledgeRagService._build_hits_from_query_data(
        query="招待费报销标准",
        chunks=[
            {
                "chunk_id": "travel-1",
                "file_path": "/tmp/doc-1__费用制度.md",
                "content": "差旅费包含交通费、住宿费和餐补标准。",
            },
            {
                "chunk_id": "ent-1",
                "file_path": "/tmp/doc-1__费用制度.md",
                "content": "业务招待费报销标准:应结合客户接待场景、人数和审批要求执行。",
            },
        ],
        entities=[],
        limit=2,
    )

    assert _candidate_ids(hits) == ["ent-1", "travel-1"]


def test_build_hits_keeps_long_query_anchor_terms_for_accounting_table() -> None:
    """For a long query, the accounting-table chunk must rank above the generic chunk."""
    hits = KnowledgeRagService._build_hits_from_query_data(
        query="远光软件财务基础知识手册里的常用会计科目是什么?",
        chunks=[
            {
                "chunk_id": "generic-1",
                "file_path": "/tmp/doc-1__远光软件财务制度培训手册.docx",
                "content": "远光软件股份有限公司财务培训内容,介绍费用报销和财务制度。",
            },
            {
                "chunk_id": "accounts-1",
                "file_path": "/tmp/doc-2__远光软件财务基础知识手册.docx",
                "content": (
                    "二、常用会计科目\n\n"
                    "| 科目类别 | 科目名称 | 说明 |\n"
                    "| --- | --- | --- |\n"
                    "| 资产类 | 库存现金 | 公司持有的现金 |\n"
                    "| 损益类 | 销售费用 | 为销售产品发生的费用 |"
                ),
            },
        ],
        entities=[],
        limit=2,
    )

    assert _candidate_ids(hits) == ["accounts-1", "generic-1"]


def test_build_hits_prioritizes_answer_clue_appendix_for_rule_queries() -> None:
    """The answer-clue appendix chunk must rank above the generic policy chunk."""
    hits = KnowledgeRagService._build_hits_from_query_data(
        query="报销时限是多少?",
        chunks=[
            {
                "chunk_id": "plain-1",
                "file_path": "/tmp/doc-1__费用制度.md",
                "content": "本制度用于规范报销流程,员工应遵守公司审批要求。",
            },
            {
                "chunk_id": "clue-1",
                "file_path": "/tmp/doc-1__费用制度.md",
                "content": (
                    "# 问答线索补充\n\n"
                    "- 第二章 报销时限:费用发生后 30 日内提交申请。\n"
                    "- 第二章 报销时限:超过 30 日需补充审批说明。"
                ),
            },
        ],
        entities=[],
        limit=2,
    )

    assert _candidate_ids(hits) == ["clue-1", "plain-1"]


def test_query_local_text_chunks_prioritizes_relevant_policy_chunk(tmp_path) -> None:
    """The local chunk query must be confident and return the travel-policy chunk first."""
    workspace = tmp_path / "knowledge" / ".lightrag" / "x_financial_knowledge"
    workspace.mkdir(parents=True)
    _write_json(
        workspace / "kv_store_text_chunks.json",
        {
            "chunk-travel": {
                "_id": "chunk-travel",
                "full_doc_id": "doc-1",
                "chunk_order_index": 1,
                "file_path": "/tmp/doc-1__差旅费管理办法.pdf",
                "content": (
                    "第十三条 差旅费。酒店住宿限额标准如下:其他员工直辖市350元、"
                    "省会城市300元、其他地区250元。确因紧急公务、特别情形等事项"
                    "导致住宿超过规定标准时,超标20%以内由部门负责人审批,"
                    "超标20%以上需分管领导审批。"
                ),
            },
            "chunk-office": {
                "_id": "chunk-office",
                "full_doc_id": "doc-2",
                "chunk_order_index": 1,
                "file_path": "/tmp/doc-2__办公用品管理办法.pdf",
                "content": "办公用品采购应遵循预算和验收流程。",
            },
        },
    )

    result = query_local_text_chunks(
        lightrag_root=tmp_path / "knowledge" / ".lightrag",
        workspace="x_financial_knowledge",
        query="住宿费超过标准审批依据是什么?",
        limit=2,
    )

    assert result.confident is True
    top_hit = result.hits[0]
    assert top_hit["candidate_id"] == "chunk-travel"
    assert "住宿超过规定标准" in top_hit["content"]


def test_query_knowledge_uses_local_chunks_before_lightrag_runtime(tmp_path, monkeypatch) -> None:
    """``query_knowledge`` must answer from local chunks without touching the runtime."""
    workspace = tmp_path / "knowledge" / ".lightrag" / "x_financial_knowledge"
    workspace.mkdir(parents=True)
    _write_json(
        workspace / "kv_store_text_chunks.json",
        {
            "chunk-1": {
                "_id": "chunk-1",
                "full_doc_id": "doc-1",
                "chunk_order_index": 1,
                "file_path": "/tmp/doc-1__公司支出管理办法.pdf",
                "content": (
                    "第八条 支出报销申请时限。公司各类支出报销结算申请时限为三个月。"
                    "逾期需说明原因,经分管领导审批后方可报销。"
                ),
            }
        },
    )

    def fail_if_runtime_is_used(_self):
        # Any call to _get_runtime means the local fast path was bypassed.
        raise AssertionError("local high-confidence queries should not initialize LightRAG")

    monkeypatch.setattr(KnowledgeRagService, "_get_runtime", fail_if_runtime_is_used)

    payload = KnowledgeRagService(storage_root=tmp_path).query_knowledge(
        "费用发生后多久内必须报销?超过三个月还能不能报?",
        limit=3,
    )

    assert payload["record_count"] == 1
    assert payload["metadata"]["retrieval_strategy"] == "local_text_chunks"
    assert "三个月" in payload["hits"][0]["content"]


def test_build_hits_demotes_chapter_navigation_for_specific_rule_queries() -> None:
    """The body chunk must rank above the chapter-navigation chunk."""
    hits = KnowledgeRagService._build_hits_from_query_data(
        query="探亲差旅归哪个部门管理?",
        chunks=[
            {
                "chunk_id": "toc-1",
                "file_path": "/tmp/doc-1__费用制度.md",
                "content": "# 章节导航\n\n- 第一章 总则\n- 第二章 职责分工\n- 第三章 支出归口",
            },
            {
                "chunk_id": "body-1",
                "file_path": "/tmp/doc-1__费用制度.md",
                "content": (
                    "附表3:支出归口管理部门与归口业务范围\n"
                    "组织人事部:探亲差旅、条件艰苦及安全风险较高区域补助等支出。"
                ),
            },
        ],
        entities=[],
        limit=2,
    )

    assert _candidate_ids(hits) == ["body-1", "toc-1"]


def test_resolve_default_qdrant_url_prefers_container_host(monkeypatch) -> None:
    """When the ``qdrant`` hostname resolves, the default URL must point at it."""
    monkeypatch.setattr(
        knowledge_rag_module.socket,
        "getaddrinfo",
        # Only the "qdrant" hostname resolves; every other lookup returns nothing.
        lambda hostname, port: (
            [("family", "type", "proto", "canonname", ("172.21.0.2", 0))]
            if hostname == "qdrant"
            else []
        ),
    )

    assert knowledge_rag_module._resolve_default_qdrant_url() == "http://qdrant:6333"


def test_resolve_default_qdrant_url_falls_back_to_loopback(monkeypatch) -> None:
    """When name resolution raises ``OSError``, the default URL must be loopback."""

    def raise_lookup_error(_hostname, _port):
        raise OSError("lookup failed")

    monkeypatch.setattr(knowledge_rag_module.socket, "getaddrinfo", raise_lookup_error)

    assert knowledge_rag_module._resolve_default_qdrant_url() == "http://127.0.0.1:6333"


def test_runtime_cache_uses_dedicated_instance_across_calling_threads(monkeypatch) -> None:
    """One runtime must be created and shared by services on different threads.

    The final shutdown runs in a ``finally`` clause so that a failing assertion
    cannot leave the fake runtime cached for tests that run afterwards.
    """
    # Start from a clean slate in case an earlier test left a runtime cached.
    knowledge_rag_module.shutdown_knowledge_rag_runtime()
    created_runtimes = []

    class FakeRuntime:
        """Stand-in runtime that records its creation and finalization."""

        def __init__(self, **_kwargs):
            self.finalized = False
            created_runtimes.append(self)

        def finalize(self):
            self.finalized = True

    monkeypatch.setattr(knowledge_rag_module, "_LightRagRuntime", FakeRuntime)
    # Every service instance reports the same signature for this test.
    monkeypatch.setattr(
        KnowledgeRagService,
        "_build_runtime_signature",
        lambda self: (("same-config",), {}),
    )

    worker_runtimes = []

    def load_worker_runtime() -> None:
        worker_runtimes.append(KnowledgeRagService()._get_runtime())

    try:
        service = KnowledgeRagService()
        main_runtime = service._get_runtime()
        assert service._get_runtime() is main_runtime

        thread = threading.Thread(target=load_worker_runtime)
        thread.start()
        thread.join(timeout=5)
        # join() returns silently on timeout, so check the outcome explicitly
        # instead of failing later with an unrelated IndexError.
        assert not thread.is_alive(), "worker thread did not finish within 5 seconds"
        assert worker_runtimes, "worker thread did not obtain a runtime"

        assert len(created_runtimes) == 1
        assert worker_runtimes[0] is main_runtime
    finally:
        knowledge_rag_module.shutdown_knowledge_rag_runtime()

    assert all(runtime.finalized for runtime in created_runtimes)


def test_is_query_ready_status_rejects_failed_status_even_with_chunks() -> None:
    """A ``failed`` status is not query-ready even when chunks are listed."""
    status = {
        "status": "failed",
        "chunks_count": 11,
        "chunks_list": ["chunk-1"],
    }

    assert KnowledgeRagService.is_query_ready_status(status) is False


def test_is_query_ready_status_rejects_processing_status_even_with_chunks() -> None:
    """A ``processing`` status is not query-ready even when chunks are listed."""
    status = {
        "status": "processing",
        "chunks_count": 11,
        "chunks_list": ["chunk-1"],
    }

    assert KnowledgeRagService.is_query_ready_status(status) is False


def test_build_document_graph_summary_reads_lightrag_storage(tmp_path) -> None:
    """The graph summary and route enrichment must reflect the on-disk workspace files."""
    workspace = tmp_path / "knowledge" / ".lightrag" / "test_workspace"
    workspace.mkdir(parents=True)

    # The entity list contains a duplicate on purpose; entity_count below is 2.
    _write_json(
        workspace / "kv_store_full_entities.json",
        {"doc-1": {"entity_names": ["远光软件", "支出管理", "远光软件"]}},
    )
    _write_json(
        workspace / "kv_store_full_relations.json",
        {"doc-1": {"relation_pairs": [["远光软件", "支出管理"]]}},
    )
    # Chunks are stored out of order; the summary is expected to list chunk-1 first.
    _write_json(
        workspace / "kv_store_text_chunks.json",
        {
            "chunk-2": {
                "_id": "chunk-2",
                "full_doc_id": "doc-1",
                "chunk_order_index": 1,
                "tokens": 45,
                "content": "第二条 支出审批需要结合预算、归口部门和授权标准执行。",
            },
            "chunk-1": {
                "_id": "chunk-1",
                "full_doc_id": "doc-1",
                "chunk_order_index": 0,
                "tokens": 31,
                "content": "第一条 本办法适用于公司支出管理。",
            },
        },
    )
    # "chunk-missing" has no entry in the chunk store; the expected
    # entity_chunks below omit it.
    _write_json(
        workspace / "kv_store_entity_chunks.json",
        {
            "远光软件": {"chunk_ids": ["chunk-1", "chunk-missing"]},
            "支出管理": {"chunk_ids": ["chunk-2"]},
        },
    )
    # NOTE(review): as seen here this literal contains no XML markup, although
    # the file is named *.graphml. The fixture may have lost its tags in
    # transit -- confirm against the repository copy before relying on it.
    (workspace / "graph_chunk_entity_relation.graphml").write_text(
        """ 远光软件 ORGANIZATION 公司主体<SEP>费用制度适用公司 2026-05-23 支出管理 TOPIC 规范费用支出、预算和审批。 2.5 远光软件通过支出管理制度约束费用审批。 制度<SEP>审批 """,
        encoding="utf-8",
    )

    summary = build_document_graph_summary(
        tmp_path,
        workspace="test_workspace",
        document_id="doc-1",
    )

    assert summary["entity_count"] == 2
    assert [item["name"] for item in summary["entities"]] == ["远光软件", "支出管理"]
    assert summary["entities"][0]["type"] == "ORGANIZATION"
    assert summary["entities"][0]["descriptions"][0] == "公司主体"
    assert summary["relation_count"] == 1
    assert summary["relations"][0]["source"] == "远光软件"
    assert summary["relations"][0]["target"] == "支出管理"
    assert summary["relations"][0]["description"] == "远光软件通过支出管理制度约束费用审批。"
    assert summary["relations"][0]["keywords"] == ["制度", "审批"]
    assert summary["relations"][0]["weight"] == 2.5
    assert [item["id"] for item in summary["chunks"]] == ["chunk-1", "chunk-2"]
    assert summary["chunks"][0]["excerpt"].startswith("第一条")
    assert summary["entity_chunks"] == [
        {"entity": "远光软件", "chunk_ids": ["chunk-1"]},
        {"entity": "支出管理", "chunk_ids": ["chunk-2"]},
    ]

    enriched_route = enrich_knowledge_ingest_route_json(
        {
            "lightrag_workspace": "test_workspace",
            "knowledge_ingest": {
                "graph": {
                    "entities": ["远光软件"],
                    "relations": [
                        {"source": "远光软件", "target": "支出管理", "type": "关联"}
                    ],
                }
            },
        },
        storage_root=tmp_path,
    )

    enriched_entities = enriched_route["knowledge_ingest"]["graph"]["entities"]
    assert [item["name"] for item in enriched_entities] == ["远光软件", "支出管理"]
    assert enriched_entities[1]["type"] == "TOPIC"


def test_build_ingest_document_summary_extracts_sections() -> None:
    """The document summary must expose the original name and two sections."""
    summary = build_ingest_document_summary(
        document_id="doc-1",
        entry={
            "original_name": "公司支出管理办法.pdf",
            "folder": "制度文件",
            "extension": "pdf",
            "mime_type": "application/pdf",
        },
        raw_text="第一章 总则\n本办法用于规范公司支出。",
        indexed_text="# 第一章 总则\n本办法用于规范公司支出。\n第二条 审批\n审批需按授权执行。",
    )

    assert summary["name"] == "公司支出管理办法.pdf"
    assert summary["section_count"] == 2
    assert summary["sections"][0]["title"] == "第一章 总则"


def test_build_ingest_status_summary_keeps_chunk_status() -> None:
    """The status summary must carry over status, readiness and chunk details."""
    summary = build_ingest_status_summary(
        status_payload={
            "status": "processed",
            "query_ready": True,
            "chunks_count": 2,
            "chunks_list": ["chunk-1", "chunk-2"],
        },
        graph_summary={
            "entity_count": 1,
            "relation_count": 0,
            "entities": ["预算"],
            "relations": [],
        },
    )

    assert summary["lightrag_status"] == "processed"
    assert summary["query_ready"] is True
    assert summary["chunk_count"] == 2
    assert summary["chunk_ids"] == ["chunk-1", "chunk-2"]