from __future__ import annotations

from pathlib import Path
from xml.sax.saxutils import escape, quoteattr
from zipfile import ZipFile

from app.services.knowledge_document_extractors import _extract_document_text_from_path

# NOTE(review): the XML markup in the fixtures below was reconstructed from the
# Office Open XML element names (w:p/w:r/w:t, w:tbl/w:tr/w:tc, sheetData/row/c,
# a:p/a:r/a:t) because the previous revision of this file contained no tags at
# all. Confirm against the extractor that these element names, namespaces and
# the inline-string cell form are what it actually reads.
_WORDPROCESSINGML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_SPREADSHEETML_NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
_PRESENTATIONML_NS = "http://schemas.openxmlformats.org/presentationml/2006/main"
_DRAWINGML_NS = "http://schemas.openxmlformats.org/drawingml/2006/main"
_OFFICE_RELATIONSHIPS_NS = (
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
)
_PACKAGE_RELATIONSHIPS_NS = (
    "http://schemas.openxmlformats.org/package/2006/relationships"
)


def test_extract_docx_document_text_preserves_tables_as_markdown(tmp_path: Path) -> None:
    """Check that a DOCX table is extracted as Markdown rows plus row clues.

    Writes a DOCX archive with three paragraphs and a 4x3 table, extracts it,
    and asserts that the heading paragraph, every Markdown table row and the
    ``key=value`` line for table row 2 are present, while the header cells do
    not appear as newline-separated plain text.
    """
    file_path = tmp_path / "financial-basic.docx"
    _write_minimal_docx_with_table(
        file_path,
        paragraphs=[
            "远光软件股份有限公司",
            "财务基础知识手册",
            "二、常用会计科目",
        ],
        table=[
            ["科目类别", "科目名称", "说明"],
            ["资产类", "库存现金", "公司持有的现金"],
            ["负债类", "应付账款", "因购买商品或接受劳务应付的款项"],
            ["损益类", "销售费用", "为销售产品发生的费用"],
        ],
    )

    text = _extract_document_text_from_path(
        file_path=file_path,
        original_name="远光软件财务基础知识手册.docx",
        mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    assert "二、常用会计科目" in text
    assert "| 科目类别 | 科目名称 | 说明 |" in text
    assert "| 资产类 | 库存现金 | 公司持有的现金 |" in text
    assert "| 负债类 | 应付账款 | 因购买商品或接受劳务应付的款项 |" in text
    assert "| 损益类 | 销售费用 | 为销售产品发生的费用 |" in text
    assert "表格第 2 行:科目类别=资产类;科目名称=库存现金;说明=公司持有的现金" in text
    # Header cells must not be flattened into one-cell-per-line plain text.
    assert "科目类别\n科目名称\n说明" not in text


def test_extract_xlsx_document_text_builds_markdown_with_row_clues(tmp_path: Path) -> None:
    """Check that an XLSX sheet is extracted as Markdown with per-row clues.

    Writes a one-sheet workbook with a header row and two data rows, extracts
    it, and asserts the workbook title, the sheet heading, the Markdown header
    row and the ``key=value`` line for each data row.
    """
    file_path = tmp_path / "company-expense-rules.xlsx"
    _write_minimal_xlsx(
        file_path,
        sheet_name="报销标准",
        rows=[
            ["费用类型", "标准", "说明"],
            ["住宿费", "500", "超标准需事前审批"],
            ["交通费", "据实", "保留发票"],
        ],
    )

    text = _extract_document_text_from_path(
        file_path=file_path,
        original_name="公司支出管理办法.xlsx",
        mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    assert "# Excel 工作簿:公司支出管理办法.xlsx" in text
    assert "## 工作表 1:报销标准" in text
    assert "| 费用类型 | 标准 | 说明 |" in text
    assert "费用类型=住宿费;标准=500;说明=超标准需事前审批" in text
    assert "费用类型=交通费;标准=据实;说明=保留发票" in text


def test_extract_pptx_document_text_builds_markdown_slides(tmp_path: Path) -> None:
    """Check that a PPTX slide is extracted as a Markdown heading with bullets.

    Writes an archive holding a single ``ppt/slides/slide1.xml`` with two text
    entries, extracts it, and asserts the presentation title, the slide heading
    and one bullet line per text entry.
    """
    file_path = tmp_path / "training.pptx"
    # Each text lives in its own shape, paragraph and run, so the fixture does
    # not depend on which of those levels the extractor maps to a bullet.
    slide_xml = (
        f'<p:sld xmlns:p="{_PRESENTATIONML_NS}" xmlns:a="{_DRAWINGML_NS}">'
        "<p:cSld><p:spTree>"
        "<p:sp><p:txBody><a:p><a:r>"
        "<a:t>差旅报销培训</a:t>"
        "</a:r></a:p></p:txBody></p:sp>"
        "<p:sp><p:txBody><a:p><a:r>"
        "<a:t>发票、审批、预算三项要素必须齐全</a:t>"
        "</a:r></a:p></p:txBody></p:sp>"
        "</p:spTree></p:cSld>"
        "</p:sld>"
    )
    with ZipFile(file_path, "w") as archive:
        archive.writestr("ppt/slides/slide1.xml", slide_xml)

    text = _extract_document_text_from_path(
        file_path=file_path,
        original_name="报销培训.pptx",
        mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    assert "# PowerPoint 演示文稿:报销培训.pptx" in text
    assert "## 幻灯片 1" in text
    assert "- 差旅报销培训" in text
    assert "- 发票、审批、预算三项要素必须齐全" in text


def _write_minimal_docx_with_table(
    file_path: Path,
    *,
    paragraphs: list[str],
    table: list[list[str]],
) -> None:
    """Write a DOCX-like zip containing only ``word/document.xml``.

    The document body holds one ``w:p`` per entry of ``paragraphs`` followed by
    a single ``w:tbl`` with one ``w:tr`` per row of ``table`` and one ``w:tc``
    per cell. No content-types or relationship parts are written.

    Args:
        file_path: Destination of the archive; an existing file is overwritten.
        paragraphs: Paragraph texts, in document order.
        table: Table rows, each a list of cell texts.
    """
    paragraph_xml = "\n".join(
        f"<w:p>{_docx_text_run(text)}</w:p>" for text in paragraphs
    )
    table_xml = (
        "<w:tbl>"
        + "".join(
            "<w:tr>"
            + "".join(
                f"<w:tc><w:p>{_docx_text_run(cell)}</w:p></w:tc>" for cell in row
            )
            + "</w:tr>"
            for row in table
        )
        + "</w:tbl>"
    )
    # No XML declaration: the payload is written as UTF-8, which is the default
    # encoding an XML parser assumes when no declaration is present.
    document_xml = (
        f'<w:document xmlns:w="{_WORDPROCESSINGML_NS}">'
        f"<w:body>\n{paragraph_xml}\n{table_xml}\n</w:body>"
        "</w:document>"
    )
    with ZipFile(file_path, "w") as archive:
        archive.writestr("word/document.xml", document_xml)


def _docx_text_run(text: str) -> str:
    """Return a WordprocessingML run (``w:r``/``w:t``) holding ``text``.

    ``text`` is XML-escaped so ``&``, ``<`` and ``>`` cannot break the markup.
    """
    return f"<w:r><w:t>{escape(text)}</w:t></w:r>"


def _xlsx_column_letters(column_index: int) -> str:
    """Convert a zero-based column index to letters (0 -> A, 25 -> Z, 26 -> AA)."""
    letters = ""
    remaining = column_index + 1
    while remaining:
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(ord("A") + offset) + letters
    return letters


def _write_minimal_xlsx(file_path: Path, *, sheet_name: str, rows: list[list[str]]) -> None:
    """Write an XLSX-like zip with one worksheet of inline-string cells.

    Three parts are written: ``xl/workbook.xml`` declaring one sheet named
    ``sheet_name``, ``xl/_rels/workbook.xml.rels`` pointing that sheet at
    ``worksheets/sheet1.xml``, and the worksheet itself. Cells use
    ``t="inlineStr"`` because no shared-strings part is written.

    Args:
        file_path: Destination of the archive; an existing file is overwritten.
        sheet_name: Name of the single worksheet.
        rows: Sheet rows, each a list of cell texts; row numbers start at 1.
    """
    workbook_xml = (
        f'<workbook xmlns="{_SPREADSHEETML_NS}" xmlns:r="{_OFFICE_RELATIONSHIPS_NS}">'
        f'<sheets><sheet name={quoteattr(sheet_name)} sheetId="1" r:id="rId1"/></sheets>'
        "</workbook>"
    )
    rels_xml = (
        f'<Relationships xmlns="{_PACKAGE_RELATIONSHIPS_NS}">'
        f'<Relationship Id="rId1" Type="{_OFFICE_RELATIONSHIPS_NS}/worksheet"'
        ' Target="worksheets/sheet1.xml"/>'
        "</Relationships>"
    )
    row_xml = "\n".join(
        f'<row r="{row_index}">'
        + "".join(
            f'<c r="{_xlsx_column_letters(column_index)}{row_index}" t="inlineStr">'
            f"<is><t>{escape(cell)}</t></is></c>"
            for column_index, cell in enumerate(row)
        )
        + "</row>"
        for row_index, row in enumerate(rows, start=1)
    )
    sheet_xml = (
        f'<worksheet xmlns="{_SPREADSHEETML_NS}">'
        f"<sheetData>\n{row_xml}\n</sheetData>"
        "</worksheet>"
    )
    with ZipFile(file_path, "w") as archive:
        archive.writestr("xl/workbook.xml", workbook_xml)
        archive.writestr("xl/_rels/workbook.xml.rels", rels_xml)
        archive.writestr("xl/worksheets/sheet1.xml", sheet_xml)