2026-05-22 16:00:19 +08:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
from zipfile import ZipFile
|
|
|
|
|
|
|
|
|
|
|
|
from app.services.knowledge_document_extractors import _extract_document_text_from_path
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-24 21:44:17 +08:00
|
|
|
|
def test_extract_docx_document_text_preserves_tables_as_markdown(tmp_path) -> None:
    """Check that a DOCX table is extracted as Markdown rows, not loose lines.

    A fixture document with three paragraphs and a 4x3 table is written to
    ``tmp_path``. The extracted text must contain the last heading paragraph,
    one Markdown row per table row, and a ``header=value`` clue line for the
    first data row; it must not contain the header cells separated only by
    newlines.
    """
    docx_path = tmp_path / "financial-basic.docx"
    heading_paragraphs = [
        "远光软件股份有限公司",
        "财务基础知识手册",
        "二、常用会计科目",
    ]
    account_table = [
        ["科目类别", "科目名称", "说明"],
        ["资产类", "库存现金", "公司持有的现金"],
        ["负债类", "应付账款", "因购买商品或接受劳务应付的款项"],
        ["损益类", "销售费用", "为销售产品发生的费用"],
    ]
    _write_minimal_docx_with_table(
        docx_path,
        paragraphs=heading_paragraphs,
        table=account_table,
    )

    text = _extract_document_text_from_path(
        file_path=docx_path,
        original_name="远光软件财务基础知识手册.docx",
        mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    expected_fragments = (
        "二、常用会计科目",
        "| 科目类别 | 科目名称 | 说明 |",
        "| 资产类 | 库存现金 | 公司持有的现金 |",
        "| 负债类 | 应付账款 | 因购买商品或接受劳务应付的款项 |",
        "| 损益类 | 销售费用 | 为销售产品发生的费用 |",
        "表格第 2 行:科目类别=资产类;科目名称=库存现金;说明=公司持有的现金",
    )
    for fragment in expected_fragments:
        assert fragment in text
    # Header cells joined only by newlines would mean the table was flattened.
    assert "科目类别\n科目名称\n说明" not in text
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-22 16:00:19 +08:00
|
|
|
|
def test_extract_xlsx_document_text_builds_markdown_with_row_clues(tmp_path) -> None:
    """Check that an XLSX sheet is extracted as Markdown with per-row clues.

    A single-sheet workbook with one header row and two data rows is written
    to ``tmp_path``. The extracted text must contain the workbook heading, the
    sheet heading, the Markdown header row, and a ``header=value`` clue for
    each data row.
    """
    xlsx_path = tmp_path / "company-expense-rules.xlsx"
    expense_rows = [
        ["费用类型", "标准", "说明"],
        ["住宿费", "500", "超标准需事前审批"],
        ["交通费", "据实", "保留发票"],
    ]
    _write_minimal_xlsx(xlsx_path, sheet_name="报销标准", rows=expense_rows)

    text = _extract_document_text_from_path(
        file_path=xlsx_path,
        original_name="公司支出管理办法.xlsx",
        mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    expected_fragments = (
        "# Excel 工作簿:公司支出管理办法.xlsx",
        "## 工作表 1:报销标准",
        "| 费用类型 | 标准 | 说明 |",
        "费用类型=住宿费;标准=500;说明=超标准需事前审批",
        "费用类型=交通费;标准=据实;说明=保留发票",
    )
    for fragment in expected_fragments:
        assert fragment in text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_pptx_document_text_builds_markdown_slides(tmp_path) -> None:
    """Check that PPTX slide text is extracted as a Markdown bullet list.

    The fixture archive holds only ``ppt/slides/slide1.xml`` with two text
    shapes. The extracted text must contain the presentation heading, the
    slide heading, and one bullet per text shape.
    """
    pptx_path = tmp_path / "training.pptx"
    slide_markup = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp><p:txBody><a:p><a:r><a:t>差旅报销培训</a:t></a:r></a:p></p:txBody></p:sp>
<p:sp><p:txBody><a:p><a:r><a:t>发票、审批、预算三项要素必须齐全</a:t></a:r></a:p></p:txBody></p:sp>
</p:spTree>
</p:cSld>
</p:sld>
"""
    with ZipFile(pptx_path, "w") as package:
        package.writestr("ppt/slides/slide1.xml", slide_markup)

    text = _extract_document_text_from_path(
        file_path=pptx_path,
        original_name="报销培训.pptx",
        mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    expected_fragments = (
        "# PowerPoint 演示文稿:报销培训.pptx",
        "## 幻灯片 1",
        "- 差旅报销培训",
        "- 发票、审批、预算三项要素必须齐全",
    )
    for fragment in expected_fragments:
        assert fragment in text
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-24 21:44:17 +08:00
|
|
|
|
def _write_minimal_docx_with_table(
    file_path,
    *,
    paragraphs: list[str],
    table: list[list[str]],
) -> None:
    """Write a minimal DOCX archive: the paragraphs followed by one table.

    Only ``word/document.xml`` is stored in the zip; no other package parts
    are written.

    Args:
        file_path: Destination path of the archive.
        paragraphs: Text of each ``<w:p>`` emitted before the table.
        table: Rows of cell text; each cell becomes a ``<w:tc>`` wrapping a
            single paragraph.
    """
    body_paragraphs = "\n".join(
        f"<w:p>{_docx_text_run(paragraph)}</w:p>" for paragraph in paragraphs
    )

    row_chunks = []
    for cells in table:
        cell_markup = "".join(
            f"<w:tc><w:p>{_docx_text_run(cell)}</w:p></w:tc>" for cell in cells
        )
        row_chunks.append("<w:tr>" + cell_markup + "</w:tr>")
    table_markup = "<w:tbl>" + "".join(row_chunks) + "</w:tbl>"

    document_markup = f"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
{body_paragraphs}
{table_markup}
</w:body>
</w:document>
"""
    with ZipFile(file_path, "w") as package:
        package.writestr("word/document.xml", document_markup)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _docx_text_run(text: str) -> str:
    """Return a WordprocessingML run (``<w:r><w:t>…</w:t></w:r>``) for *text*.

    The text is XML-escaped so fixture strings containing ``&``, ``<`` or
    ``>`` still produce well-formed markup. Text without those characters is
    emitted unchanged.
    """
    from xml.sax.saxutils import escape

    return f"<w:r><w:t>{escape(text)}</w:t></w:r>"
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-05-22 16:00:19 +08:00
|
|
|
|
def _write_minimal_xlsx(file_path, *, sheet_name: str, rows: list[list[str]]) -> None:
    """Write a minimal single-sheet XLSX archive with inline-string cells.

    Three parts are stored in the zip: ``xl/workbook.xml``,
    ``xl/_rels/workbook.xml.rels`` and ``xl/worksheets/sheet1.xml``.

    Args:
        file_path: Destination path of the archive.
        sheet_name: Value of the ``name`` attribute on the single ``<sheet>``.
        rows: Cell text per row; row numbers start at 1 and columns at ``A``.
    """
    from xml.sax.saxutils import escape, quoteattr

    # quoteattr supplies the surrounding quotes and escapes markup characters,
    # so a sheet name containing & < > or " cannot break the attribute.
    workbook_xml = f"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<sheets>
<sheet name={quoteattr(sheet_name)} sheetId="1" r:id="rId1"/>
</sheets>
</workbook>
"""
    rels_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1"
Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"
Target="worksheets/sheet1.xml"/>
</Relationships>
"""
    row_xml = "\n".join(
        f'<row r="{row_index}">'
        + "".join(
            # Cell text is escaped so & and < in fixture data stay well-formed.
            f'<c r="{_xlsx_column_letters(column_index)}{row_index}" t="inlineStr">'
            f"<is><t>{escape(cell)}</t></is></c>"
            for column_index, cell in enumerate(row)
        )
        + "</row>"
        for row_index, row in enumerate(rows, start=1)
    )
    sheet_xml = f"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
<sheetData>
{row_xml}
</sheetData>
</worksheet>
"""
    with ZipFile(file_path, "w") as archive:
        archive.writestr("xl/workbook.xml", workbook_xml)
        archive.writestr("xl/_rels/workbook.xml.rels", rels_xml)
        archive.writestr("xl/worksheets/sheet1.xml", sheet_xml)


def _xlsx_column_letters(column_index: int) -> str:
    """Return the column name for a zero-based index (0 -> A, 25 -> Z, 26 -> AA)."""
    letters = ""
    remaining = column_index + 1
    # Column names are bijective base-26: there is no zero digit, hence the -1.
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(ord("A") + offset) + letters
    return letters
|