from __future__ import annotations from typing import Any from app.schemas.knowledge import ( KnowledgePreviewBlockRead, KnowledgePreviewPageRead, KnowledgePreviewStatRead, ) from app.services.knowledge_constants import IMAGE_EXTENSIONS, TEXT_EXTENSIONS from app.services.knowledge_document_extractors import ( _extract_docx_text, _extract_pptx_slides, _extract_xlsx_sheets, _read_text_preview, ) from app.services.knowledge_file_utils import extract_extension, format_size def build_preview( entry: dict[str, Any], *, resolve_document_path, ) -> tuple[str, list[KnowledgePreviewPageRead]]: extension = extract_extension(entry["original_name"]) file_path = resolve_document_path(entry) if extension == "pdf": return "pdf", [] if extension in IMAGE_EXTENSIONS: return "image", [] if extension in TEXT_EXTENSIONS: text = _read_text_preview(file_path) return "text", [_build_text_preview_page(entry, text)] if extension == "docx": text = _extract_docx_text(file_path) return "text", [_build_text_preview_page(entry, text)] if extension == "xlsx": return "table", _build_xlsx_preview_pages(entry, file_path) if extension == "pptx": return "slides", _build_pptx_preview_pages(entry, file_path) return ( "unsupported", [ KnowledgePreviewPageRead( title=entry["original_name"], subtitle="当前格式暂不支持在线解析预览。", stats=[ KnowledgePreviewStatRead(label="文件格式", value=extension.upper() or "FILE"), KnowledgePreviewStatRead( label="文件大小", value=format_size(entry["size_bytes"]) ), KnowledgePreviewStatRead(label="建议操作", value="下载后查看"), ], blocks=[ KnowledgePreviewBlockRead( heading="预览说明", lines=[ "当前系统已支持该文件的上传、下载和权限控制。", "如需在线预览,可后续接入专门的文档转换服务。", ], ) ], ) ], ) def _build_text_preview_page(entry: dict[str, Any], text: str) -> KnowledgePreviewPageRead: lines = [line.strip() for line in text.splitlines() if line.strip()] if not lines: lines = ["文件内容为空,或当前文档未提取到可展示文本。"] groups = [lines[index : index + 8] for index in range(0, min(len(lines), 24), 8)] blocks = [ KnowledgePreviewBlockRead(heading=f"内容片段 {index + 1}", lines=group) for index, group in enumerate(groups) ] return KnowledgePreviewPageRead( title=entry["original_name"], subtitle="文本提取预览", stats=[ KnowledgePreviewStatRead(label="文件格式", value=entry["extension"].upper() or "TEXT"), KnowledgePreviewStatRead(label="可见行数", value=str(len(lines))), KnowledgePreviewStatRead(label="文件大小", value=format_size(entry["size_bytes"])), ], blocks=blocks, ) def _build_xlsx_preview_pages(entry: dict[str, Any], file_path) -> list[KnowledgePreviewPageRead]: sheets = _extract_xlsx_sheets(file_path) if not sheets: sheets = [("Sheet 1", [["未提取到表格内容。"]])] preview_pages: list[KnowledgePreviewPageRead] = [] sheet_count = len(sheets) for sheet_name, rows in sheets[:8]: visible_rows = rows[:12] if rows else [["未提取到表格内容。"]] blocks = [ KnowledgePreviewBlockRead( heading=f"第 {index + 1} 行", lines=[" | ".join((cell or "") for cell in row)], ) for index, row in enumerate(visible_rows) ] preview_pages.append( KnowledgePreviewPageRead( title=sheet_name, subtitle="表格内容预览", stats=[ KnowledgePreviewStatRead(label="工作表数量", value=str(sheet_count)), KnowledgePreviewStatRead(label="预览行数", value=str(len(visible_rows))), KnowledgePreviewStatRead( label="文件大小", value=format_size(entry["size_bytes"]) ), ], blocks=blocks, ) ) return preview_pages def _build_pptx_preview_pages(entry: dict[str, Any], file_path) -> list[KnowledgePreviewPageRead]: slides = _extract_pptx_slides(file_path) if not slides: slides = [["未提取到幻灯片文本。"]] pages: list[KnowledgePreviewPageRead] = [] for index, slide_lines in enumerate(slides[:8]): pages.append( KnowledgePreviewPageRead( title=entry["original_name"], subtitle=f"幻灯片 {index + 1}", stats=[ KnowledgePreviewStatRead(label="页码", value=str(index + 1)), KnowledgePreviewStatRead(label="文本条数", value=str(len(slide_lines))), KnowledgePreviewStatRead(label="文件格式", value="PPTX"), ], blocks=[ KnowledgePreviewBlockRead( heading="幻灯片内容", lines=slide_lines or ["该页未提取到文本内容。"], ) ], ) ) return pages