158 lines
5.5 KiB
Python
158 lines
5.5 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
from app.schemas.knowledge import (
|
||
|
|
KnowledgePreviewBlockRead,
|
||
|
|
KnowledgePreviewPageRead,
|
||
|
|
KnowledgePreviewStatRead,
|
||
|
|
)
|
||
|
|
from app.services.knowledge_constants import IMAGE_EXTENSIONS, TEXT_EXTENSIONS
|
||
|
|
from app.services.knowledge_document_extractors import (
|
||
|
|
_extract_docx_text,
|
||
|
|
_extract_pptx_slides,
|
||
|
|
_extract_xlsx_sheets,
|
||
|
|
_read_text_preview,
|
||
|
|
)
|
||
|
|
from app.services.knowledge_file_utils import extract_extension, format_size
|
||
|
|
|
||
|
|
def build_preview(
    entry: dict[str, Any],
    *,
    resolve_document_path,
) -> tuple[str, list[KnowledgePreviewPageRead]]:
    """Pick a preview mode for a knowledge entry and build its preview pages.

    The mode is selected from the extension of ``entry["original_name"]``.

    Args:
        entry: Document record. ``original_name`` is always read here;
            ``size_bytes`` is read for the unsupported-format fallback.
            The page-building helpers read further keys of their own.
        resolve_document_path: Keyword-only callable invoked with ``entry``;
            its result is passed on to the content extractors.

    Returns:
        A ``(mode, pages)`` tuple. ``mode`` is one of ``"pdf"``, ``"image"``,
        ``"text"``, ``"table"``, ``"slides"`` or ``"unsupported"``. The
        ``"pdf"`` and ``"image"`` modes return an empty page list.
    """
    suffix = extract_extension(entry["original_name"])
    # Resolved unconditionally, before any branching, even for the modes
    # that never read the file.
    document_path = resolve_document_path(entry)

    if suffix == "pdf":
        return "pdf", []
    if suffix in IMAGE_EXTENSIONS:
        return "image", []

    # Plain-text formats and DOCX share the same page layout; only the
    # extractor differs. TEXT_EXTENSIONS membership takes precedence.
    is_plain_text = suffix in TEXT_EXTENSIONS
    if is_plain_text or suffix == "docx":
        extract_text = _read_text_preview if is_plain_text else _extract_docx_text
        text_page = _build_text_preview_page(entry, extract_text(document_path))
        return "text", [text_page]

    if suffix == "xlsx":
        return "table", _build_xlsx_preview_pages(entry, document_path)
    if suffix == "pptx":
        return "slides", _build_pptx_preview_pages(entry, document_path)

    # Anything else: a single informational page telling the user to download.
    fallback_stats = [
        KnowledgePreviewStatRead(label="文件格式", value=suffix.upper() or "FILE"),
        KnowledgePreviewStatRead(label="文件大小", value=format_size(entry["size_bytes"])),
        KnowledgePreviewStatRead(label="建议操作", value="下载后查看"),
    ]
    notice_block = KnowledgePreviewBlockRead(
        heading="预览说明",
        lines=[
            "当前系统已支持该文件的上传、下载和权限控制。",
            "如需在线预览,可后续接入专门的文档转换服务。",
        ],
    )
    fallback_page = KnowledgePreviewPageRead(
        title=entry["original_name"],
        subtitle="当前格式暂不支持在线解析预览。",
        stats=fallback_stats,
        blocks=[notice_block],
    )
    return "unsupported", [fallback_page]
|
||
|
|
|
||
|
|
def _build_text_preview_page(
    entry: dict[str, Any], text: str
) -> KnowledgePreviewPageRead:
    """Build a single preview page from extracted plain text.

    Blank lines are dropped and the remaining lines are stripped. At most the
    first 24 lines are shown, grouped into blocks of 8 lines each.

    Args:
        entry: Document record; reads ``original_name`` and ``size_bytes``,
            and ``extension`` when present.
        text: Extracted text content of the document.

    Returns:
        A page whose stats report the file format, the number of non-blank
        lines in ``text`` (the full count, not only the lines shown), and
        the formatted file size.
    """
    lines_per_block = 8
    max_preview_lines = 24

    lines = [line.strip() for line in text.splitlines() if line.strip()]
    if not lines:
        lines = ["文件内容为空,或当前文档未提取到可展示文本。"]

    shown = lines[:max_preview_lines]
    blocks = [
        KnowledgePreviewBlockRead(
            heading=f"内容片段 {number}",
            lines=shown[start : start + lines_per_block],
        )
        for number, start in enumerate(range(0, len(shown), lines_per_block), start=1)
    ]

    # Use the entry's own ``extension`` when it has one. Otherwise derive it
    # from the file name the same way build_preview does, so an entry lacking
    # the key does not raise.
    extension = entry.get("extension")
    if extension is None:
        extension = extract_extension(entry["original_name"])

    return KnowledgePreviewPageRead(
        title=entry["original_name"],
        subtitle="文本提取预览",
        stats=[
            KnowledgePreviewStatRead(label="文件格式", value=extension.upper() or "TEXT"),
            KnowledgePreviewStatRead(label="可见行数", value=str(len(lines))),
            KnowledgePreviewStatRead(label="文件大小", value=format_size(entry["size_bytes"])),
        ],
        blocks=blocks,
    )
|
||
|
|
|
||
|
|
def _build_xlsx_preview_pages(
    entry: dict[str, Any], file_path
) -> list[KnowledgePreviewPageRead]:
    """Build one preview page per worksheet of an XLSX document.

    At most the first 8 sheets are previewed, and at most the first 12 rows
    of each sheet. Each row becomes its own block, with cells joined by
    ``" | "``.

    Args:
        entry: Document record; reads ``size_bytes``.
        file_path: Location of the workbook, passed to the sheet extractor.

    Returns:
        The preview pages. When no sheets are extracted, a single placeholder
        sheet is previewed instead.
    """
    max_sheets = 8
    max_rows = 12
    empty_rows = [["未提取到表格内容。"]]

    # This is a module-level function, so the imported extractor is called
    # directly; there is no ``self`` in scope here.
    sheets = _extract_xlsx_sheets(file_path)
    if not sheets:
        sheets = [("Sheet 1", [["未提取到表格内容。"]])]

    # The stat reports the total number of sheets, not the capped count.
    sheet_count = len(sheets)
    size_label = format_size(entry["size_bytes"])

    preview_pages: list[KnowledgePreviewPageRead] = []
    for sheet_name, rows in sheets[:max_sheets]:
        visible_rows = rows[:max_rows] if rows else empty_rows
        row_blocks = [
            KnowledgePreviewBlockRead(
                heading=f"第 {row_number} 行",
                # Falsy cells (e.g. None) render as empty strings.
                lines=[" | ".join((cell or "") for cell in row)],
            )
            for row_number, row in enumerate(visible_rows, start=1)
        ]
        preview_pages.append(
            KnowledgePreviewPageRead(
                title=sheet_name,
                subtitle="表格内容预览",
                stats=[
                    KnowledgePreviewStatRead(label="工作表数量", value=str(sheet_count)),
                    KnowledgePreviewStatRead(label="预览行数", value=str(len(visible_rows))),
                    KnowledgePreviewStatRead(label="文件大小", value=size_label),
                ],
                blocks=row_blocks,
            )
        )

    return preview_pages
|
||
|
|
|
||
|
|
def _build_pptx_preview_pages(
    entry: dict[str, Any], file_path
) -> list[KnowledgePreviewPageRead]:
    """Build one preview page per slide of a PPTX document.

    At most the first 8 slides are previewed. Each page carries a single
    block holding the text lines extracted from that slide.

    Args:
        entry: Document record; reads ``original_name``.
        file_path: Location of the presentation, passed to the slide extractor.

    Returns:
        The preview pages. When no slides are extracted, a single placeholder
        slide is previewed instead.
    """
    max_slides = 8

    # This is a module-level function, so the imported extractor is called
    # directly; there is no ``self`` in scope here.
    slides = _extract_pptx_slides(file_path)
    if not slides:
        slides = [["未提取到幻灯片文本。"]]

    return [
        KnowledgePreviewPageRead(
            title=entry["original_name"],
            subtitle=f"幻灯片 {slide_number}",
            stats=[
                KnowledgePreviewStatRead(label="页码", value=str(slide_number)),
                KnowledgePreviewStatRead(label="文本条数", value=str(len(slide_lines))),
                KnowledgePreviewStatRead(label="文件格式", value="PPTX"),
            ],
            blocks=[
                KnowledgePreviewBlockRead(
                    heading="幻灯片内容",
                    # A slide with no extracted text gets a placeholder line.
                    lines=slide_lines or ["该页未提取到文本内容。"],
                )
            ],
        )
        for slide_number, slide_lines in enumerate(slides[:max_slides], start=1)
    ]
|
||
|
|
|