Files
X-Financial/server/tests/test_knowledge_document_extractors.py
caoxiaozhu 88ff04bef8 feat: 新增归档中心页面并完善知识库与报销查询能力
新增前端归档中心视图及相关工具函数,扩充知识库文档分类和
提取器支持多种格式,增强编排器报销查询的多维度检索,优
化本体规则和用户代理审核消息,前端完善报销创建和审批详
情交互细节,补充单元测试覆盖。
2026-05-22 16:00:19 +08:00

97 lines
3.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations

from xml.sax.saxutils import escape
from zipfile import ZipFile

from app.services.knowledge_document_extractors import _extract_document_text_from_path
def test_extract_xlsx_document_text_builds_markdown_with_row_clues(tmp_path) -> None:
    """Check the text extracted from a minimal single-sheet .xlsx workbook.

    A three-row workbook (one header row, two data rows) is written into
    ``tmp_path`` and passed to ``_extract_document_text_from_path``. The
    returned text must contain the workbook heading, the sheet heading, the
    header row rendered as a Markdown table row, and one ``key=value`` clue
    line per data row.
    """
    file_path = tmp_path / "company-expense-rules.xlsx"
    _write_minimal_xlsx(
        file_path,
        sheet_name="报销标准",
        rows=[
            ["费用类型", "标准", "说明"],
            ["住宿费", "500", "超标准需事前审批"],
            ["交通费", "据实", "保留发票"],
        ],
    )

    text = _extract_document_text_from_path(
        file_path=file_path,
        original_name="公司支出管理办法.xlsx",
        mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    assert "# Excel 工作簿:公司支出管理办法.xlsx" in text
    # NOTE(review): no separator between the sheet index and the sheet name
    # here, unlike the workbook heading above — confirm against the
    # extractor's actual heading format.
    assert "## 工作表 1报销标准" in text
    assert "| 费用类型 | 标准 | 说明 |" in text
    # Every key=value pair is joined with the full-width semicolon, matching
    # the expectation for the second data row below.
    assert "费用类型=住宿费;标准=500;说明=超标准需事前审批" in text
    assert "费用类型=交通费;标准=据实;说明=保留发票" in text
def test_extract_pptx_document_text_builds_markdown_slides(tmp_path) -> None:
    """Check the text extracted from a one-slide .pptx archive.

    The fixture is a zip archive holding only ``ppt/slides/slide1.xml`` with
    two text shapes. The extracted text must contain the presentation
    heading, a heading for slide 1, and one bullet per text shape.
    """
    slide_markup = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp><p:txBody><a:p><a:r><a:t>差旅报销培训</a:t></a:r></a:p></p:txBody></p:sp>
<p:sp><p:txBody><a:p><a:r><a:t>发票、审批、预算三项要素必须齐全</a:t></a:r></a:p></p:txBody></p:sp>
</p:spTree>
</p:cSld>
</p:sld>
"""
    deck_path = tmp_path / "training.pptx"
    with ZipFile(deck_path, "w") as deck:
        deck.writestr("ppt/slides/slide1.xml", slide_markup)

    extracted = _extract_document_text_from_path(
        file_path=deck_path,
        original_name="报销培训.pptx",
        mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    # Checked in document order: title, slide heading, then the two bullets.
    expected_fragments = (
        "# PowerPoint 演示文稿:报销培训.pptx",
        "## 幻灯片 1",
        "- 差旅报销培训",
        "- 发票、审批、预算三项要素必须齐全",
    )
    for fragment in expected_fragments:
        assert fragment in extracted
def _write_minimal_xlsx(file_path, *, sheet_name: str, rows: list[list[str]]) -> None:
    """Write a bare-bones single-sheet .xlsx archive for extractor tests.

    Only three parts are written: ``xl/workbook.xml``,
    ``xl/_rels/workbook.xml.rels`` and ``xl/worksheets/sheet1.xml``. Every
    cell is stored as an inline string.

    Args:
        file_path: Destination of the zip archive; an existing file is
            overwritten.
        sheet_name: Name recorded for the only worksheet. XML-special
            characters are escaped.
        rows: Cell values, one inner list per row. Row numbers start at 1 and
            columns at ``A``. Values are converted with ``str`` and
            XML-escaped.
    """
    # The name sits inside a double-quoted attribute, so '"' must be escaped
    # in addition to the '&', '<' and '>' that escape() handles by default.
    safe_sheet_name = escape(sheet_name, {'"': "&quot;"})
    workbook_xml = f"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<sheets>
<sheet name="{safe_sheet_name}" sheetId="1" r:id="rId1"/>
</sheets>
</workbook>
"""
    rels_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1"
Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"
Target="worksheets/sheet1.xml"/>
</Relationships>
"""
    row_elements = []
    for row_index, row in enumerate(rows, start=1):
        cells_xml = "".join(
            f'<c r="{_column_letters(column_index)}{row_index}" t="inlineStr">'
            f"<is><t>{escape(str(cell))}</t></is></c>"
            for column_index, cell in enumerate(row)
        )
        row_elements.append(f'<row r="{row_index}">{cells_xml}</row>')
    row_xml = "\n".join(row_elements)
    sheet_xml = f"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
<sheetData>
{row_xml}
</sheetData>
</worksheet>
"""
    with ZipFile(file_path, "w") as archive:
        archive.writestr("xl/workbook.xml", workbook_xml)
        archive.writestr("xl/_rels/workbook.xml.rels", rels_xml)
        archive.writestr("xl/worksheets/sheet1.xml", sheet_xml)


def _column_letters(column_index: int) -> str:
    """Return the spreadsheet column letters for a zero-based column index."""
    letters = ""
    remaining = column_index + 1
    # Column names are bijective base-26: there is no zero digit, so shift by
    # one before each division (0 -> A, 25 -> Z, 26 -> AA).
    while remaining:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(ord("A") + offset) + letters
    return letters