Files
X-Financial/web/src/utils/conversationTrustedHtml.js

192 lines
5.4 KiB
JavaScript
Raw Normal View History

export const DOCUMENT_DETAIL_HREF_PREFIX = '#ai-open-document-detail:'
const ALLOWED_COLON_HEADING_TITLES = new Set([
'基础信息识别结果',
'报销测算参考',
'补充信息'
])
const BUSINESS_FIELD_LABELS = new Set([
'时间',
'地点',
'事由',
'金额',
'费用类型',
'报销类型',
'商户',
'商户/开票方',
'客户',
'客户/项目对象',
'附件',
'附件/凭证',
'出行方式'
])
const TRUSTED_HTML_BLOCK_RE = /<!--\s*ai-trusted-html:start\s*-->\s*([\s\S]*?)\s*<!--\s*ai-trusted-html:end\s*-->/g
const TRUSTED_HTML_PLACEHOLDER_PREFIX = 'AI_TRUSTED_HTML_BLOCK_'
const TRUSTED_HTML_ALLOWED_TAGS = new Set([
'section',
'article',
'header',
'footer',
'div',
'span',
'strong',
'a'
])
const TRUSTED_HTML_ALLOWED_ATTRS = new Set([
'aria-label',
'class',
'data-ai-action',
'href'
])
function splitColonHeadingLine(line) {
const rawLine = String(line || '')
const trimmed = rawLine.trim()
if (!trimmed || trimmed.startsWith('|') || /^#{1,6}\s/.test(trimmed)) {
return [rawLine]
}
const chineseColonIndex = trimmed.indexOf('')
const asciiColonIndex = trimmed.indexOf(':')
const colonIndexes = [chineseColonIndex, asciiColonIndex].filter((index) => index > 0)
if (!colonIndexes.length) {
return [rawLine]
}
const colonIndex = Math.min(...colonIndexes)
const title = trimmed.slice(0, colonIndex)
const body = trimmed.slice(colonIndex + 1).trim()
if (!ALLOWED_COLON_HEADING_TITLES.has(title)) {
return [rawLine]
}
return body ? [`### ${title}`, '', body] : [`### ${title}`]
}
function normalizeBusinessFieldLine(line) {
const rawLine = String(line || '')
const trimmed = rawLine.trim()
if (
!trimmed ||
trimmed.startsWith('|') ||
/^[-*+]\s/.test(trimmed) ||
/^#{1,6}\s/.test(trimmed)
) {
return rawLine
}
const match = trimmed.match(/^([^:\n]{1,16})[:]\s*(.+)$/u)
if (!match) {
return rawLine
}
const label = match[1].trim()
const value = match[2].trim()
if (!BUSINESS_FIELD_LABELS.has(label) || !value) {
return rawLine
}
return `- **${label}**${value}`
}
function hasOnlyTrustedHtmlTags(html = '') {
const tagPattern = /<\/?([a-z][\w-]*)([^>]*)>/gi
let match = tagPattern.exec(html)
while (match) {
const tagName = String(match[1] || '').toLowerCase()
if (!TRUSTED_HTML_ALLOWED_TAGS.has(tagName)) {
return false
}
const attrText = String(match[2] || '')
const attrPattern = /\s([:@\w-]+)\s*=/g
let attrMatch = attrPattern.exec(attrText)
while (attrMatch) {
const attrName = String(attrMatch[1] || '').toLowerCase()
if (!TRUSTED_HTML_ALLOWED_ATTRS.has(attrName)) {
return false
}
attrMatch = attrPattern.exec(attrText)
}
match = tagPattern.exec(html)
}
return true
}
function sanitizeTrustedHtmlBlock(html = '') {
const value = String(html || '').trim()
if (!value || !value.includes('class="ai-document-card-list"')) {
return ''
}
if (/<(?:script|style|iframe|object|embed|link|meta|form|input|button|textarea|select)\b/i.test(value)) {
return ''
}
if (/\son[a-z]+\s*=/i.test(value) || /javascript\s*:/i.test(value)) {
return ''
}
if (!hasOnlyTrustedHtmlTags(value)) {
return ''
}
const hrefs = [...value.matchAll(/\shref="([^"]*)"/gi)].map((match) => String(match[1] || '').trim())
if (hrefs.some((href) => !href.startsWith(DOCUMENT_DETAIL_HREF_PREFIX))) {
return ''
}
return value
}
export function normalizeConversationText(text = '', options = {}) {
const shouldTrim = Boolean(options.trim)
const lines = String(text || '').replace(/\r\n?/g, '\n').split('\n')
const normalizedLines = []
let inFence = false
lines.forEach((line) => {
if (/^\s*(```|~~~)/.test(line)) {
inFence = !inFence
normalizedLines.push(line)
return
}
if (inFence) {
normalizedLines.push(line)
return
}
const nextLines = splitColonHeadingLine(line)
if (nextLines[0]?.startsWith('### ') && normalizedLines.length) {
const previousLine = normalizedLines[normalizedLines.length - 1]
if (String(previousLine || '').trim()) {
normalizedLines.push('')
}
}
normalizedLines.push(...nextLines.map((nextLine) => normalizeBusinessFieldLine(nextLine)))
})
const normalized = normalizedLines.join('\n').replace(/\n{3,}/g, '\n\n')
return shouldTrim ? normalized.trim() : normalized
}
export function extractTrustedHtmlBlocks(text = '') {
const trustedHtmlBlocks = []
const content = String(text || '').replace(TRUSTED_HTML_BLOCK_RE, (_match, html) => {
const sanitizedHtml = sanitizeTrustedHtmlBlock(html)
if (!sanitizedHtml) {
return ''
}
const placeholder = `${TRUSTED_HTML_PLACEHOLDER_PREFIX}${trustedHtmlBlocks.length}`
trustedHtmlBlocks.push(sanitizedHtml)
return `\n\n${placeholder}\n\n`
})
return { content, trustedHtmlBlocks }
}
export function restoreTrustedHtmlBlocks(html = '', trustedHtmlBlocks = [], options = {}) {
const paragraphClass = String(options.paragraphClass || '').trim()
return trustedHtmlBlocks.reduce((nextHtml, block, index) => {
const placeholder = `${TRUSTED_HTML_PLACEHOLDER_PREFIX}${index}`
const paragraphPattern = paragraphClass
? new RegExp(`<p class="${paragraphClass}">${placeholder}</p>\\n?`, 'g')
: new RegExp(`<p>${placeholder}</p>\\n?`, 'g')
return nextHtml
.replace(paragraphPattern, block)
.replaceAll(placeholder, block)
}, html)
}