/** Prefix that every `href` inside a trusted HTML block must start with. */
export const DOCUMENT_DETAIL_HREF_PREFIX = '#ai-open-document-detail:'

/** Titles that are promoted to a `###` heading when a line reads `title:body`. */
const ALLOWED_COLON_HEADING_TITLES = new Set([
  '基础信息识别结果',
  '报销测算参考',
  '补充信息'
])

/** Field labels that are rewritten into a bold Markdown list item. */
const BUSINESS_FIELD_LABELS = new Set([
  '时间',
  '地点',
  '事由',
  '金额',
  '费用类型',
  '报销类型',
  '商户',
  '商户/开票方',
  '客户',
  '客户/项目对象',
  '附件',
  '附件/凭证',
  '出行方式'
])

// NOTE(review): this pattern contains no delimiter tokens, so it matches an
// empty string or a whitespace run at every position. With it,
// extractTrustedHtmlBlocks never captures a block and removes all whitespace
// from its input. The opening/closing markers that should surround the
// capture group look missing; restore them from whatever produces these
// messages. The pattern is deliberately left untouched here because the
// correct markers cannot be derived from this file.
const TRUSTED_HTML_BLOCK_RE = /\s*([\s\S]*?)\s*/g

const TRUSTED_HTML_PLACEHOLDER_PREFIX = 'AI_TRUSTED_HTML_BLOCK_'

const TRUSTED_HTML_ALLOWED_TAGS = new Set([
  'section',
  'article',
  'header',
  'footer',
  'div',
  'span',
  'strong',
  'a'
])

const TRUSTED_HTML_ALLOWED_ATTRS = new Set([
  'aria-label',
  'class',
  'data-ai-action',
  'href'
])

// The whole attribute section of a tag: zero or more whitespace-separated
// name="value" pairs (double quotes only) and an optional self-closing slash.
const TRUSTED_HTML_ATTRIBUTE_LIST_RE = /^(?:\s+[:@\w-]+\s*=\s*"[^"]*")*\s*\/?$/

// One name="value" pair out of an attribute section accepted by the list pattern.
const TRUSTED_HTML_ATTRIBUTE_RE = /\s+([:@\w-]+)\s*=\s*"([^"]*)"/g

/**
 * Escapes regular-expression metacharacters so `value` matches literally.
 *
 * @param {string} value
 * @returns {string}
 */
function escapeRegExp(value) {
  return String(value).replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}

/**
 * Turns `title:body` / `title：body` into a level-3 heading followed by the
 * body when the title is one of ALLOWED_COLON_HEADING_TITLES.
 *
 * Table rows, existing headings, blank lines and unknown titles are returned
 * unchanged as a single-element array.
 *
 * @param {string} line
 * @returns {string[]} Replacement lines for `line`.
 */
function splitColonHeadingLine(line) {
  const rawLine = String(line || '')
  const trimmed = rawLine.trim()

  if (!trimmed || trimmed.startsWith('|') || /^#{1,6}\s/.test(trimmed)) {
    return [rawLine]
  }

  // Both the full-width (U+FF1A) and the ASCII colon count as separators.
  const chineseColonIndex = trimmed.indexOf('：')
  const asciiColonIndex = trimmed.indexOf(':')
  const colonIndexes = [chineseColonIndex, asciiColonIndex].filter((index) => index > 0)

  if (!colonIndexes.length) {
    return [rawLine]
  }

  const colonIndex = Math.min(...colonIndexes)
  const title = trimmed.slice(0, colonIndex)
  const body = trimmed.slice(colonIndex + 1).trim()

  if (!ALLOWED_COLON_HEADING_TITLES.has(title)) {
    return [rawLine]
  }

  return body ? [`### ${title}`, '', body] : [`### ${title}`]
}

/**
 * Rewrites `label:value` / `label：value` into a bold Markdown list item when
 * the label is one of BUSINESS_FIELD_LABELS and the value is not empty.
 *
 * Table rows, list items, headings and blank lines are returned unchanged.
 *
 * @param {string} line
 * @returns {string}
 */
function normalizeBusinessFieldLine(line) {
  const rawLine = String(line || '')
  const trimmed = rawLine.trim()

  if (
    !trimmed ||
    trimmed.startsWith('|') ||
    /^[-*+]\s/.test(trimmed) ||
    /^#{1,6}\s/.test(trimmed)
  ) {
    return rawLine
  }

  // Label: 1-16 characters up to the first full-width or ASCII colon.
  const match = trimmed.match(/^([^：:\n]{1,16})[：:]\s*(.+)$/u)
  if (!match) {
    return rawLine
  }

  const label = match[1].trim()
  const value = match[2].trim()

  if (!BUSINESS_FIELD_LABELS.has(label) || !value) {
    return rawLine
  }

  return `- **${label}**:${value}`
}

/**
 * Validates the attribute section of a single tag.
 *
 * Accepts only whitespace-separated `name="value"` pairs whose names are in
 * TRUSTED_HTML_ALLOWED_ATTRS. Anything else (single-quoted or unquoted values,
 * valueless attributes, attributes glued to a previous value or separated by
 * `/`) is rejected, because browsers would still parse those forms as
 * attributes. Every `href` value must start with DOCUMENT_DETAIL_HREF_PREFIX.
 *
 * @param {string} attrText Text between the tag name and the closing `>`.
 * @returns {boolean}
 */
function hasOnlyTrustedHtmlAttributes(attrText) {
  const text = String(attrText || '')

  if (!TRUSTED_HTML_ATTRIBUTE_LIST_RE.test(text)) {
    return false
  }

  for (const [, rawName, rawValue] of text.matchAll(TRUSTED_HTML_ATTRIBUTE_RE)) {
    const attrName = rawName.toLowerCase()

    if (!TRUSTED_HTML_ALLOWED_ATTRS.has(attrName)) {
      return false
    }
    if (attrName === 'href' && !rawValue.trim().startsWith(DOCUMENT_DETAIL_HREF_PREFIX)) {
      return false
    }
  }

  return true
}

/**
 * Checks that `html` consists solely of text and well-formed tags whose names
 * are in TRUSTED_HTML_ALLOWED_TAGS and whose attributes pass
 * hasOnlyTrustedHtmlAttributes.
 *
 * Text between tags must not contain `<`: a stray `<` (comment, unterminated
 * tag, unknown construct) could be completed into a tag by whatever markup
 * follows once the block is inserted into a page.
 *
 * @param {string} [html='']
 * @returns {boolean}
 */
function hasOnlyTrustedHtmlTags(html = '') {
  const source = String(html || '')
  let textStart = 0

  for (const match of source.matchAll(/<\/?([a-z][\w-]*)([^>]*)>/gi)) {
    if (source.slice(textStart, match.index).includes('<')) {
      return false
    }
    textStart = match.index + match[0].length

    if (!TRUSTED_HTML_ALLOWED_TAGS.has(match[1].toLowerCase())) {
      return false
    }
    if (!hasOnlyTrustedHtmlAttributes(match[2])) {
      return false
    }
  }

  return !source.slice(textStart).includes('<')
}

/**
 * Returns the trimmed block when it is an acceptable document-card snippet,
 * otherwise an empty string.
 *
 * A block is accepted only when it carries the `ai-document-card-list` class
 * marker, contains no forbidden element, inline event handler or
 * `javascript:` text, and passes hasOnlyTrustedHtmlTags (which also enforces
 * the href prefix).
 *
 * @param {string} [html='']
 * @returns {string}
 */
function sanitizeTrustedHtmlBlock(html = '') {
  const value = String(html || '').trim()

  if (!value || !value.includes('class="ai-document-card-list"')) {
    return ''
  }
  if (/<(?:script|style|iframe|object|embed|link|meta|form|input|button|textarea|select)\b/i.test(value)) {
    return ''
  }
  // Redundant with the attribute allow-list below; kept as defense in depth.
  if (/\son[a-z]+\s*=/i.test(value) || /javascript\s*:/i.test(value)) {
    return ''
  }
  if (!hasOnlyTrustedHtmlTags(value)) {
    return ''
  }

  return value
}

/**
 * Normalizes a conversation message line by line: known `title:body` lines
 * become `###` headings (with a blank line inserted before the heading when
 * the previous output line is not blank), known business fields become bold
 * list items, and runs of three or more newlines collapse to two.
 *
 * Lines inside fenced code blocks are copied verbatim.
 *
 * @param {string} [text='']
 * @param {{ trim?: boolean }} [options={}] `trim` strips the final result.
 * @returns {string}
 */
export function normalizeConversationText(text = '', options = {}) {
  const shouldTrim = Boolean(options?.trim)
  const lines = String(text || '').replace(/\r\n?/g, '\n').split('\n')
  const normalizedLines = []
  // Marker (``` or ~~~) of the fence currently open, '' when outside a fence.
  let openFence = ''

  for (const line of lines) {
    const fenceMatch = line.match(/^\s*(```|~~~)/)

    if (fenceMatch) {
      if (!openFence) {
        openFence = fenceMatch[1]
      } else if (openFence === fenceMatch[1]) {
        // Only the marker that opened the fence may close it, so a ~~~ line
        // inside a ``` block is treated as code.
        openFence = ''
      }
      normalizedLines.push(line)
      continue
    }

    if (openFence) {
      normalizedLines.push(line)
      continue
    }

    const nextLines = splitColonHeadingLine(line)

    if (nextLines[0]?.startsWith('### ') && normalizedLines.length) {
      const previousLine = normalizedLines[normalizedLines.length - 1]
      if (String(previousLine || '').trim()) {
        normalizedLines.push('')
      }
    }

    normalizedLines.push(...nextLines.map((nextLine) => normalizeBusinessFieldLine(nextLine)))
  }

  const normalized = normalizedLines.join('\n').replace(/\n{3,}/g, '\n\n')
  return shouldTrim ? normalized.trim() : normalized
}

/**
 * Replaces every TRUSTED_HTML_BLOCK_RE match in `text` with a numbered
 * placeholder and collects the sanitized blocks. Matches that fail
 * sanitization are removed from the content.
 *
 * See the NOTE(review) on TRUSTED_HTML_BLOCK_RE: with the pattern as it
 * stands this function cannot capture any block.
 *
 * @param {string} [text='']
 * @returns {{ content: string, trustedHtmlBlocks: string[] }}
 */
export function extractTrustedHtmlBlocks(text = '') {
  const trustedHtmlBlocks = []

  const content = String(text || '').replace(TRUSTED_HTML_BLOCK_RE, (_match, html) => {
    const sanitizedHtml = sanitizeTrustedHtmlBlock(html)
    if (!sanitizedHtml) {
      return ''
    }

    const placeholder = `${TRUSTED_HTML_PLACEHOLDER_PREFIX}${trustedHtmlBlocks.length}`
    trustedHtmlBlocks.push(sanitizedHtml)
    return `\n\n${placeholder}\n\n`
  })

  return { content, trustedHtmlBlocks }
}

/**
 * Substitutes the numbered placeholders in rendered `html` with the matching
 * entry of `trustedHtmlBlocks`.
 *
 * A placeholder that is the sole content of a paragraph is replaced together
 * with the paragraph wrapper (and one trailing newline). Placeholders whose
 * index has no block are left untouched. Replacement happens in a single
 * pass, the full index is read (`_10` is never mistaken for `_1`), and blocks
 * are inserted verbatim (`$&`, `$$` and similar are not interpreted).
 *
 * NOTE(review): the wrapper is assumed to be exactly `<p>` or, when
 * `options.paragraphClass` is set, `<p class="…">` — confirm against the
 * Markdown renderer in use.
 *
 * @param {string} [html='']
 * @param {string[]} [trustedHtmlBlocks=[]]
 * @param {{ paragraphClass?: string }} [options={}]
 * @returns {string}
 */
export function restoreTrustedHtmlBlocks(html = '', trustedHtmlBlocks = [], options = {}) {
  const source = String(html ?? '')
  const blocks = Array.isArray(trustedHtmlBlocks) ? trustedHtmlBlocks : []

  if (!blocks.length) {
    return source
  }

  const paragraphClass = String(options?.paragraphClass || '').trim()
  const openTag = paragraphClass ? `<p class="${escapeRegExp(paragraphClass)}">` : '<p>'
  const placeholderPattern = new RegExp(
    `(?:${openTag}(${TRUSTED_HTML_PLACEHOLDER_PREFIX}\\d+)</p>\\n?)|(${TRUSTED_HTML_PLACEHOLDER_PREFIX}\\d+)`,
    'g'
  )

  return source.replace(placeholderPattern, (match, wrapped, bare) => {
    const placeholder = wrapped ?? bare
    const index = Number(placeholder.slice(TRUSTED_HTML_PLACEHOLDER_PREFIX.length))
    return index < blocks.length ? String(blocks[index]) : match
  })
}