Files
X-Financial/web/src/utils/conversationTrustedHtml.js
2026-06-23 11:21:18 +08:00

192 lines
5.4 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
export const DOCUMENT_DETAIL_HREF_PREFIX = '#ai-open-document-detail:'
const ALLOWED_COLON_HEADING_TITLES = new Set([
'基础信息识别结果',
'报销测算参考',
'补充信息'
])
const BUSINESS_FIELD_LABELS = new Set([
'时间',
'地点',
'事由',
'金额',
'费用类型',
'报销类型',
'商户',
'商户/开票方',
'客户',
'客户/项目对象',
'附件',
'附件/凭证',
'出行方式'
])
const TRUSTED_HTML_BLOCK_RE = /<!--\s*ai-trusted-html:start\s*-->\s*([\s\S]*?)\s*<!--\s*ai-trusted-html:end\s*-->/g
const TRUSTED_HTML_PLACEHOLDER_PREFIX = 'AI_TRUSTED_HTML_BLOCK_'
const TRUSTED_HTML_ALLOWED_TAGS = new Set([
'section',
'article',
'header',
'footer',
'div',
'span',
'strong',
'a'
])
const TRUSTED_HTML_ALLOWED_ATTRS = new Set([
'aria-label',
'class',
'data-ai-action',
'href'
])
function splitColonHeadingLine(line) {
const rawLine = String(line || '')
const trimmed = rawLine.trim()
if (!trimmed || trimmed.startsWith('|') || /^#{1,6}\s/.test(trimmed)) {
return [rawLine]
}
const chineseColonIndex = trimmed.indexOf('')
const asciiColonIndex = trimmed.indexOf(':')
const colonIndexes = [chineseColonIndex, asciiColonIndex].filter((index) => index > 0)
if (!colonIndexes.length) {
return [rawLine]
}
const colonIndex = Math.min(...colonIndexes)
const title = trimmed.slice(0, colonIndex)
const body = trimmed.slice(colonIndex + 1).trim()
if (!ALLOWED_COLON_HEADING_TITLES.has(title)) {
return [rawLine]
}
return body ? [`### ${title}`, '', body] : [`### ${title}`]
}
function normalizeBusinessFieldLine(line) {
const rawLine = String(line || '')
const trimmed = rawLine.trim()
if (
!trimmed ||
trimmed.startsWith('|') ||
/^[-*+]\s/.test(trimmed) ||
/^#{1,6}\s/.test(trimmed)
) {
return rawLine
}
const match = trimmed.match(/^([^:\n]{1,16})[:]\s*(.+)$/u)
if (!match) {
return rawLine
}
const label = match[1].trim()
const value = match[2].trim()
if (!BUSINESS_FIELD_LABELS.has(label) || !value) {
return rawLine
}
return `- **${label}**${value}`
}
function hasOnlyTrustedHtmlTags(html = '') {
const tagPattern = /<\/?([a-z][\w-]*)([^>]*)>/gi
let match = tagPattern.exec(html)
while (match) {
const tagName = String(match[1] || '').toLowerCase()
if (!TRUSTED_HTML_ALLOWED_TAGS.has(tagName)) {
return false
}
const attrText = String(match[2] || '')
const attrPattern = /\s([:@\w-]+)\s*=/g
let attrMatch = attrPattern.exec(attrText)
while (attrMatch) {
const attrName = String(attrMatch[1] || '').toLowerCase()
if (!TRUSTED_HTML_ALLOWED_ATTRS.has(attrName)) {
return false
}
attrMatch = attrPattern.exec(attrText)
}
match = tagPattern.exec(html)
}
return true
}
function sanitizeTrustedHtmlBlock(html = '') {
const value = String(html || '').trim()
if (!value || !value.includes('class="ai-document-card-list"')) {
return ''
}
if (/<(?:script|style|iframe|object|embed|link|meta|form|input|button|textarea|select)\b/i.test(value)) {
return ''
}
if (/\son[a-z]+\s*=/i.test(value) || /javascript\s*:/i.test(value)) {
return ''
}
if (!hasOnlyTrustedHtmlTags(value)) {
return ''
}
const hrefs = [...value.matchAll(/\shref="([^"]*)"/gi)].map((match) => String(match[1] || '').trim())
if (hrefs.some((href) => !href.startsWith(DOCUMENT_DETAIL_HREF_PREFIX))) {
return ''
}
return value
}
export function normalizeConversationText(text = '', options = {}) {
const shouldTrim = Boolean(options.trim)
const lines = String(text || '').replace(/\r\n?/g, '\n').split('\n')
const normalizedLines = []
let inFence = false
lines.forEach((line) => {
if (/^\s*(```|~~~)/.test(line)) {
inFence = !inFence
normalizedLines.push(line)
return
}
if (inFence) {
normalizedLines.push(line)
return
}
const nextLines = splitColonHeadingLine(line)
if (nextLines[0]?.startsWith('### ') && normalizedLines.length) {
const previousLine = normalizedLines[normalizedLines.length - 1]
if (String(previousLine || '').trim()) {
normalizedLines.push('')
}
}
normalizedLines.push(...nextLines.map((nextLine) => normalizeBusinessFieldLine(nextLine)))
})
const normalized = normalizedLines.join('\n').replace(/\n{3,}/g, '\n\n')
return shouldTrim ? normalized.trim() : normalized
}
export function extractTrustedHtmlBlocks(text = '') {
const trustedHtmlBlocks = []
const content = String(text || '').replace(TRUSTED_HTML_BLOCK_RE, (_match, html) => {
const sanitizedHtml = sanitizeTrustedHtmlBlock(html)
if (!sanitizedHtml) {
return ''
}
const placeholder = `${TRUSTED_HTML_PLACEHOLDER_PREFIX}${trustedHtmlBlocks.length}`
trustedHtmlBlocks.push(sanitizedHtml)
return `\n\n${placeholder}\n\n`
})
return { content, trustedHtmlBlocks }
}
export function restoreTrustedHtmlBlocks(html = '', trustedHtmlBlocks = [], options = {}) {
const paragraphClass = String(options.paragraphClass || '').trim()
return trustedHtmlBlocks.reduce((nextHtml, block, index) => {
const placeholder = `${TRUSTED_HTML_PLACEHOLDER_PREFIX}${index}`
const paragraphPattern = paragraphClass
? new RegExp(`<p class="${paragraphClass}">${placeholder}</p>\\n?`, 'g')
: new RegExp(`<p>${placeholder}</p>\\n?`, 'g')
return nextHtml
.replace(paragraphPattern, block)
.replaceAll(placeholder, block)
}, html)
}