Files
X-Financial/server/tests/test_knowledge_document_extractors.py
caoxiaozhu 50b1c3f9a9 feat: 增强规则资产管理与审计页面运行时调试
后端新增规则资产版本管理和规则文件 CRUD 接口,优化风险
规则生成模板执行和员工数据模型字段,知识库 RAG 增强本
地回退和文档提取能力,清理旧风险规则文件统一由生成引擎
管理,前端审计页面增加运行时调试面板和规则资产编辑交互,
补充单元测试覆盖。
2026-05-24 21:44:17 +08:00

162 lines
6.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations

from xml.sax.saxutils import escape, quoteattr
from zipfile import ZipFile

from app.services.knowledge_document_extractors import _extract_document_text_from_path
def test_extract_docx_document_text_preserves_tables_as_markdown(tmp_path) -> None:
    """Check that a DOCX table is extracted as Markdown rows with a row clue line.

    A minimal DOCX with three paragraphs and a three-column table is written to
    ``tmp_path`` and passed through the extractor; the assertions inspect the
    returned text.
    """
    docx_path = tmp_path / "financial-basic.docx"
    heading_paragraphs = [
        "远光软件股份有限公司",
        "财务基础知识手册",
        "二、常用会计科目",
    ]
    account_table = [
        ["科目类别", "科目名称", "说明"],
        ["资产类", "库存现金", "公司持有的现金"],
        ["负债类", "应付账款", "因购买商品或接受劳务应付的款项"],
        ["损益类", "销售费用", "为销售产品发生的费用"],
    ]
    _write_minimal_docx_with_table(
        docx_path,
        paragraphs=heading_paragraphs,
        table=account_table,
    )

    extracted = _extract_document_text_from_path(
        file_path=docx_path,
        original_name="远光软件财务基础知识手册.docx",
        mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    # Paragraph text, each table row as a Markdown pipe row, then the row clue.
    expected_fragments = (
        "二、常用会计科目",
        "| 科目类别 | 科目名称 | 说明 |",
        "| 资产类 | 库存现金 | 公司持有的现金 |",
        "| 负债类 | 应付账款 | 因购买商品或接受劳务应付的款项 |",
        "| 损益类 | 销售费用 | 为销售产品发生的费用 |",
        "表格第 2 行:科目类别=资产类;科目名称=库存现金;说明=公司持有的现金",
    )
    for fragment in expected_fragments:
        assert fragment in extracted

    # Header cells must not be flattened into one cell per line.
    assert "科目类别\n科目名称\n说明" not in extracted
def test_extract_xlsx_document_text_builds_markdown_with_row_clues(tmp_path) -> None:
    """Check that an XLSX sheet is extracted as Markdown with per-row clue lines.

    A minimal workbook with one sheet (a header row plus two data rows) is
    written to ``tmp_path`` and passed through the extractor; the assertions
    inspect the returned text.
    """
    file_path = tmp_path / "company-expense-rules.xlsx"
    _write_minimal_xlsx(
        file_path,
        sheet_name="报销标准",
        rows=[
            ["费用类型", "标准", "说明"],
            ["住宿费", "500", "超标准需事前审批"],
            ["交通费", "据实", "保留发票"],
        ],
    )

    text = _extract_document_text_from_path(
        file_path=file_path,
        original_name="公司支出管理办法.xlsx",
        mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    assert "# Excel 工作簿:公司支出管理办法.xlsx" in text
    # NOTE(review): there is no separator between the sheet index and the sheet
    # name here, unlike the "工作簿:" heading above — confirm against the
    # extractor's actual heading format.
    assert "## 工作表 1报销标准" in text
    assert "| 费用类型 | 标准 | 说明 |" in text
    # Every data row uses the same clue format: "key=value" fields joined by ";",
    # including the field that follows the numeric value "500".
    assert "费用类型=住宿费;标准=500;说明=超标准需事前审批" in text
    assert "费用类型=交通费;标准=据实;说明=保留发票" in text
def test_extract_pptx_document_text_builds_markdown_slides(tmp_path) -> None:
    """Check that PPTX slide text is extracted as a Markdown bullet list per slide.

    A package containing only ``ppt/slides/slide1.xml`` with two text shapes is
    written to ``tmp_path`` and passed through the extractor; the assertions
    inspect the returned text.
    """
    pptx_path = tmp_path / "training.pptx"
    # The XML declaration must be the very first bytes of the part, so the
    # literal starts immediately after the opening quotes.
    slide_markup = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp><p:txBody><a:p><a:r><a:t>差旅报销培训</a:t></a:r></a:p></p:txBody></p:sp>
<p:sp><p:txBody><a:p><a:r><a:t>发票、审批、预算三项要素必须齐全</a:t></a:r></a:p></p:txBody></p:sp>
</p:spTree>
</p:cSld>
</p:sld>
"""
    with ZipFile(pptx_path, "w") as package:
        package.writestr("ppt/slides/slide1.xml", slide_markup)

    markdown = _extract_document_text_from_path(
        file_path=pptx_path,
        original_name="报销培训.pptx",
        mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    # Document title, slide heading, then one bullet per text shape.
    expected_fragments = (
        "# PowerPoint 演示文稿:报销培训.pptx",
        "## 幻灯片 1",
        "- 差旅报销培训",
        "- 发票、审批、预算三项要素必须齐全",
    )
    for fragment in expected_fragments:
        assert fragment in markdown
def _write_minimal_docx_with_table(
    file_path,
    *,
    paragraphs: list[str],
    table: list[list[str]],
) -> None:
    """Write a DOCX package that contains only ``word/document.xml``.

    The document body holds one ``<w:p>`` per entry of *paragraphs*, followed
    by a single ``<w:tbl>`` with one ``<w:tr>`` per row of *table* and one
    ``<w:tc>`` per cell.

    Args:
        file_path: Destination path of the zip package.
        paragraphs: Text of the paragraphs placed before the table.
        table: Table content as a list of rows, each a list of cell texts.
    """
    paragraph_elements = [f"<w:p>{_docx_text_run(line)}</w:p>" for line in paragraphs]
    paragraph_xml = "\n".join(paragraph_elements)

    row_elements = []
    for row in table:
        cell_elements = [f"<w:tc><w:p>{_docx_text_run(cell)}</w:p></w:tc>" for cell in row]
        row_elements.append("<w:tr>" + "".join(cell_elements) + "</w:tr>")
    table_xml = "<w:tbl>" + "".join(row_elements) + "</w:tbl>"

    document_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
        '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">\n'
        "<w:body>\n"
        f"{paragraph_xml}\n"
        f"{table_xml}\n"
        "</w:body>\n"
        "</w:document>\n"
    )

    with ZipFile(file_path, "w") as package:
        package.writestr("word/document.xml", document_xml)
def _docx_text_run(text: str) -> str:
    """Return a WordprocessingML run (``<w:r><w:t>…</w:t></w:r>``) holding *text*.

    The text is XML-escaped so that ``&``, ``<`` and ``>`` in fixture content
    cannot produce a malformed ``document.xml``.

    Args:
        text: Plain text to place inside the run.

    Returns:
        The serialized ``<w:r>`` element.
    """
    return f"<w:r><w:t>{escape(text)}</w:t></w:r>"
def _write_minimal_xlsx(file_path, *, sheet_name: str, rows: list[list[str]]) -> None:
    """Write an XLSX package with a single worksheet of inline-string cells.

    Three parts are written: ``xl/workbook.xml`` (declaring one sheet),
    ``xl/_rels/workbook.xml.rels`` (pointing ``rId1`` at the sheet part) and
    ``xl/worksheets/sheet1.xml`` (the cell data).

    Args:
        file_path: Destination path of the zip package.
        sheet_name: Name of the sheet as declared in the workbook part.
        rows: Sheet content as a list of rows, each a list of cell texts.
    """
    # quoteattr() escapes the value and adds the surrounding quotes itself.
    workbook_xml = f"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<sheets>
<sheet name={quoteattr(sheet_name)} sheetId="1" r:id="rId1"/>
</sheets>
</workbook>
"""
    rels_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1"
Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"
Target="worksheets/sheet1.xml"/>
</Relationships>
"""
    row_elements = []
    for row_index, row in enumerate(rows, start=1):
        # Cell text is escaped so "&", "<" and ">" keep the part well-formed.
        cell_elements = "".join(
            f'<c r="{_xlsx_column_letters(column_index)}{row_index}" t="inlineStr">'
            f"<is><t>{escape(cell)}</t></is></c>"
            for column_index, cell in enumerate(row)
        )
        row_elements.append(f'<row r="{row_index}">{cell_elements}</row>')
    row_xml = "\n".join(row_elements)

    sheet_xml = f"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
<sheetData>
{row_xml}
</sheetData>
</worksheet>
"""
    with ZipFile(file_path, "w") as archive:
        archive.writestr("xl/workbook.xml", workbook_xml)
        archive.writestr("xl/_rels/workbook.xml.rels", rels_xml)
        archive.writestr("xl/worksheets/sheet1.xml", sheet_xml)


def _xlsx_column_letters(column_index: int) -> str:
    """Return the column label for a zero-based index (0 -> A, 25 -> Z, 26 -> AA)."""
    letters = ""
    remaining = column_index + 1
    # Column labels are bijective base-26: there is no zero digit, hence the -1.
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(ord("A") + offset) + letters
    return letters