// Source listing metadata: 192 lines · 5.4 KiB · JavaScript
/**
 * Required prefix for every `href` inside a trusted HTML block.
 * `sanitizeTrustedHtmlBlock` discards a block when any of its links does not
 * start with this fragment.
 */
export const DOCUMENT_DETAIL_HREF_PREFIX = '#ai-open-document-detail:'

/**
 * Titles that `splitColonHeadingLine` promotes to a level-3 Markdown heading
 * when a line has the form `title:body`. Any other title leaves the line as is.
 */
const ALLOWED_COLON_HEADING_TITLES = new Set([
  '基础信息识别结果',
  '报销测算参考',
  '补充信息'
])

/**
 * Field labels that `normalizeBusinessFieldLine` rewrites from `label:value`
 * into a Markdown bullet with a bold label. Labels are matched exactly.
 */
const BUSINESS_FIELD_LABELS = new Set([
  '时间',
  '地点',
  '事由',
  '金额',
  '费用类型',
  '报销类型',
  '商户',
  '商户/开票方',
  '客户',
  '客户/项目对象',
  '附件',
  '附件/凭证',
  '出行方式'
])

/**
 * Matches one trusted HTML block delimited by the comment markers
 * `<!-- ai-trusted-html:start -->` and `<!-- ai-trusted-html:end -->`.
 * Capture group 1 is the HTML between the markers, without surrounding
 * whitespace. The match is lazy, so adjacent blocks are matched separately.
 */
const TRUSTED_HTML_BLOCK_RE = /<!--\s*ai-trusted-html:start\s*-->\s*([\s\S]*?)\s*<!--\s*ai-trusted-html:end\s*-->/g

/**
 * Prefix of the placeholder token that stands in for an extracted block;
 * `extractTrustedHtmlBlocks` appends the block's index to it.
 */
const TRUSTED_HTML_PLACEHOLDER_PREFIX = 'AI_TRUSTED_HTML_BLOCK_'

/**
 * Lower-case tag names permitted inside a trusted HTML block.
 * `hasOnlyTrustedHtmlTags` rejects a block containing any other tag.
 */
const TRUSTED_HTML_ALLOWED_TAGS = new Set([
  'section',
  'article',
  'header',
  'footer',
  'div',
  'span',
  'strong',
  'a'
])

/**
 * Lower-case attribute names permitted on tags inside a trusted HTML block.
 * `hasOnlyTrustedHtmlTags` rejects a block containing any other attribute.
 */
const TRUSTED_HTML_ALLOWED_ATTRS = new Set([
  'aria-label',
  'class',
  'data-ai-action',
  'href'
])

/**
 * Turns a `title:body` line into a level-3 Markdown heading followed by its
 * body, but only when `title` is listed in `ALLOWED_COLON_HEADING_TITLES`.
 *
 * Both the ASCII colon and the full-width colon (U+FF1A) are accepted as the
 * separator. Empty lines, table rows (starting with `|`) and lines that are
 * already ATX headings are returned untouched.
 *
 * @param {string} line - One line of conversation text.
 * @returns {string[]} `[line]` when nothing applies, `['### title']` when the
 *   body is empty, otherwise `['### title', '', body]`.
 */
function splitColonHeadingLine(line) {
  const rawLine = String(line || '')
  const trimmed = rawLine.trim()
  if (!trimmed || trimmed.startsWith('|') || /^#{1,6}\s/.test(trimmed)) {
    return [rawLine]
  }

  // The full-width colon is written as an escape so it cannot be silently
  // flattened to the ASCII colon by an editor or a normalising copy/paste.
  const colonIndexes = [':', '\uFF1A']
    .map((colon) => trimmed.indexOf(colon))
    .filter((index) => index > 0)
  if (!colonIndexes.length) {
    return [rawLine]
  }

  // The earliest separator wins, so colons inside the body stay in the body.
  const colonIndex = Math.min(...colonIndexes)
  const title = trimmed.slice(0, colonIndex)
  const body = trimmed.slice(colonIndex + 1).trim()
  if (!ALLOWED_COLON_HEADING_TITLES.has(title)) {
    return [rawLine]
  }
  return body ? [`### ${title}`, '', body] : [`### ${title}`]
}

/**
 * Rewrites a `label:value` line into a Markdown bullet with a bold label when
 * `label` is listed in `BUSINESS_FIELD_LABELS`.
 *
 * The separator may be the ASCII colon or the full-width colon (U+FF1A), and
 * the label may be at most 16 characters long. Empty lines, table rows, list
 * items and ATX headings are returned unchanged.
 *
 * @param {string} line - One line of conversation text.
 * @returns {string} The bullet line, or the original line when no rewrite applies.
 */
function normalizeBusinessFieldLine(line) {
  const rawLine = String(line || '')
  const trimmed = rawLine.trim()
  const isMarkdownStructure =
    trimmed.startsWith('|') ||
    /^[-*+]\s/.test(trimmed) ||
    /^#{1,6}\s/.test(trimmed)
  if (!trimmed || isMarkdownStructure) {
    return rawLine
  }

  // \uFF1A is the full-width colon; spelling it as an escape keeps the two
  // separators distinct even if the file is normalised to ASCII punctuation.
  const match = /^([^:\uFF1A\n]{1,16})[:\uFF1A]\s*(.+)$/u.exec(trimmed)
  if (!match) {
    return rawLine
  }

  const label = match[1].trim()
  const value = match[2].trim()
  if (!BUSINESS_FIELD_LABELS.has(label) || !value) {
    return rawLine
  }
  return `- **${label}**:${value}`
}

/**
 * Checks that every attribute in the text between a tag name and its closing
 * `>` is listed in `TRUSTED_HTML_ALLOWED_ATTRS`.
 *
 * Attributes are tokenised the way a browser separates them: by whitespace,
 * by `/`, or directly after a quoted value. Quoted values are skipped as a
 * whole, so an `=` inside a value is not mistaken for another attribute.
 * Valueless attributes are checked too. Text that cannot be tokenised is
 * rejected.
 *
 * @param {string} attrText - Raw attribute text of one tag.
 * @returns {boolean} `true` when every attribute name is allowed.
 */
function hasOnlyTrustedHtmlAttrs(attrText) {
  const attrPattern = /[\s\/]*([^\s"'<>\/=]+)(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?/y
  let offset = 0
  while (offset < attrText.length) {
    attrPattern.lastIndex = offset
    const attrMatch = attrPattern.exec(attrText)
    if (!attrMatch) {
      // Only separators (whitespace or a self-closing "/") may be left over.
      return /^[\s\/]*$/.test(attrText.slice(offset))
    }
    if (!TRUSTED_HTML_ALLOWED_ATTRS.has(attrMatch[1].toLowerCase())) {
      return false
    }
    offset = attrPattern.lastIndex
  }
  return true
}

/**
 * Checks that an HTML fragment uses only tags from `TRUSTED_HTML_ALLOWED_TAGS`
 * and only attributes from `TRUSTED_HTML_ALLOWED_ATTRS`.
 *
 * The scan fails closed: a `<` that begins something tag-like (`<name` or
 * `</name`) but does not form a complete, well-formed tag makes the whole
 * fragment untrusted. A `<` followed by anything else is treated as text.
 *
 * @param {string} [html=''] - HTML fragment to inspect.
 * @returns {boolean} `true` when every tag and attribute is allowed.
 */
function hasOnlyTrustedHtmlTags(html = '') {
  const source = String(html ?? '')
  // Quote-aware, so a ">" inside a quoted attribute value does not end the tag.
  const tagPattern = /<\/?([a-z][\w-]*)((?:"[^"]*"|'[^']*'|[^>"'])*)>/iy
  const tagStartPattern = /<\/?[a-z]/iy

  let cursor = source.indexOf('<')
  while (cursor !== -1) {
    tagPattern.lastIndex = cursor
    const tagMatch = tagPattern.exec(source)
    if (!tagMatch) {
      tagStartPattern.lastIndex = cursor
      if (tagStartPattern.test(source)) {
        // Unterminated or malformed tag: it could absorb whatever HTML follows
        // the block once it is spliced into a page, so it is not trusted.
        return false
      }
      cursor = source.indexOf('<', cursor + 1)
      continue
    }

    const [, tagName, attrText] = tagMatch
    if (!TRUSTED_HTML_ALLOWED_TAGS.has(tagName.toLowerCase())) {
      return false
    }
    if (!hasOnlyTrustedHtmlAttrs(attrText)) {
      return false
    }
    cursor = source.indexOf('<', tagPattern.lastIndex)
  }
  return true
}

/**
 * Validates the HTML of one trusted block and returns it trimmed, or returns
 * `''` when the block must not be rendered.
 *
 * A block is accepted only when all of the following hold:
 * - it contains `class="ai-document-card-list"`;
 * - it contains none of the forbidden elements (script, style, form controls, ...);
 * - it contains no inline event handler and no `javascript:` text;
 * - `hasOnlyTrustedHtmlTags` accepts its tags and attributes;
 * - every `href` starts with `DOCUMENT_DETAIL_HREF_PREFIX`. This applies to
 *   double-quoted, single-quoted and unquoted values, with or without
 *   whitespace around `=`.
 *
 * The handler and href checks deliberately err on the side of rejection: they
 * scan the whole fragment, so matching text inside attribute values or text
 * nodes also causes the block to be dropped.
 *
 * @param {string} [html=''] - Raw HTML captured between the trusted-block markers.
 * @returns {string} The trimmed HTML, or `''` when it fails any check.
 */
function sanitizeTrustedHtmlBlock(html = '') {
  const value = String(html || '').trim()
  if (!value || !value.includes('class="ai-document-card-list"')) {
    return ''
  }
  if (/<(?:script|style|iframe|object|embed|link|meta|form|input|button|textarea|select)\b/i.test(value)) {
    return ''
  }

  // Browsers start a new attribute after whitespace, after "/" and directly
  // after a closing quote, so a handler can follow any of those.
  const hasEventHandler = /(?:^|[\s\/"'])on[a-z]+\s*=/i.test(value)
  if (hasEventHandler || /javascript\s*:/i.test(value)) {
    return ''
  }
  if (!hasOnlyTrustedHtmlTags(value)) {
    return ''
  }

  const hrefPattern = /(?:^|[\s\/"'])href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]*))/gi
  const hrefs = [...value.matchAll(hrefPattern)].map((match) =>
    String(match[1] ?? match[2] ?? match[3] ?? '').trim()
  )
  if (hrefs.some((href) => !href.startsWith(DOCUMENT_DETAIL_HREF_PREFIX))) {
    return ''
  }
  return value
}

/**
 * Normalises conversation text before it is rendered as Markdown.
 *
 * Outside fenced code blocks each line is passed through
 * `splitColonHeadingLine` and `normalizeBusinessFieldLine`, a blank line is
 * inserted before a `### ` heading that directly follows non-blank text, and
 * runs of three or more newlines are collapsed to two. `\r\n` and `\r` are
 * converted to `\n` first.
 *
 * Fenced code is passed through verbatim, including its blank lines. A fence
 * opens on a line starting with three or more backticks or tildes. It closes
 * only on a line using the same character, at least as many of them, and
 * nothing but whitespace after, so a `~~~` line inside a backtick fence is
 * ordinary content.
 *
 * @param {string} [text=''] - Raw conversation text.
 * @param {{ trim?: boolean }} [options={}] - `trim: true` strips leading and
 *   trailing whitespace from the result.
 * @returns {string} The normalised text.
 */
export function normalizeConversationText(text = '', options = {}) {
  const shouldTrim = Boolean(options?.trim)
  const lines = String(text || '').replace(/\r\n?/g, '\n').split('\n')

  const pieces = [] // finished output pieces, joined with "\n" at the end
  let proseLines = [] // current run of lines outside fenced content
  let openFence = '' // marker that opened the current fence, '' when outside

  // Blank-line collapsing is applied per prose run so that it never reaches
  // into fenced content, which is stored in `pieces` line by line.
  const flushProse = () => {
    if (proseLines.length) {
      pieces.push(proseLines.join('\n').replace(/\n{3,}/g, '\n\n'))
      proseLines = []
    }
  }

  for (const line of lines) {
    const fence = /^\s*(`{3,}|~{3,})([\s\S]*)$/.exec(line)

    if (openFence) {
      const closesFence =
        fence !== null &&
        fence[1][0] === openFence[0] &&
        fence[1].length >= openFence.length &&
        !fence[2].trim()
      if (closesFence) {
        openFence = ''
        proseLines.push(line)
      } else {
        flushProse()
        pieces.push(line)
      }
      continue
    }

    if (fence) {
      openFence = fence[1]
      proseLines.push(line)
      continue
    }

    const nextLines = splitColonHeadingLine(line)
    if (nextLines[0]?.startsWith('### ') && proseLines.length) {
      const previousLine = proseLines[proseLines.length - 1]
      if (String(previousLine || '').trim()) {
        proseLines.push('')
      }
    }
    for (const nextLine of nextLines) {
      proseLines.push(normalizeBusinessFieldLine(nextLine))
    }
  }
  flushProse()

  const normalized = pieces.join('\n')
  return shouldTrim ? normalized.trim() : normalized
}

/**
 * Pulls trusted HTML blocks out of the text and leaves placeholders behind.
 *
 * Every match of `TRUSTED_HTML_BLOCK_RE` is passed to
 * `sanitizeTrustedHtmlBlock`. A block that fails sanitisation is removed from
 * the text. An accepted block is stored and replaced by its placeholder
 * (`TRUSTED_HTML_PLACEHOLDER_PREFIX` plus the block's index), padded with blank
 * lines on both sides.
 *
 * @param {string} [text=''] - Text that may contain trusted HTML blocks.
 * @returns {{ content: string, trustedHtmlBlocks: string[] }} The text with
 *   placeholders, and the sanitised blocks in placeholder-index order.
 */
export function extractTrustedHtmlBlocks(text = '') {
  const trustedHtmlBlocks = []
  const content = String(text || '').replace(TRUSTED_HTML_BLOCK_RE, (_match, html) => {
    const sanitizedHtml = sanitizeTrustedHtmlBlock(html)
    if (!sanitizedHtml) {
      return ''
    }
    // The index is taken before the push, so placeholder N refers to blocks[N].
    const placeholder = `${TRUSTED_HTML_PLACEHOLDER_PREFIX}${trustedHtmlBlocks.length}`
    trustedHtmlBlocks.push(sanitizedHtml)
    return `\n\n${placeholder}\n\n`
  })
  return { content, trustedHtmlBlocks }
}

/**
 * Puts trusted HTML blocks back in place of their placeholders in rendered HTML.
 *
 * A placeholder that fills a whole paragraph (`<p>` or, when
 * `options.paragraphClass` is set, `<p class="...">`) is replaced together
 * with the paragraph wrapper and one trailing newline. Any other occurrence is
 * replaced in place. Placeholders whose index has no block are left untouched.
 *
 * All placeholders are resolved in a single pass that reads the full index, so:
 * - the placeholder for block 1 never matches inside the one for block 10;
 * - placeholder-like text inside an inserted block is not expanded again;
 * - `$` sequences in a block (`$&`, `$'`, `$$`) are inserted literally.
 *
 * @param {string} [html=''] - Rendered HTML containing placeholders.
 * @param {string[]} [trustedHtmlBlocks=[]] - Blocks indexed by placeholder number.
 * @param {{ paragraphClass?: string }} [options={}] - Class attribute value
 *   the renderer puts on paragraphs, if any; matched literally.
 * @returns {string} The HTML with blocks restored. When there are no blocks,
 *   `html` is returned unchanged.
 */
export function restoreTrustedHtmlBlocks(html = '', trustedHtmlBlocks = [], options = {}) {
  const blocks = Array.isArray(trustedHtmlBlocks) ? trustedHtmlBlocks : []
  if (!blocks.length) {
    return html
  }

  const escapeRegExp = (value) => String(value).replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  const paragraphClass = String(options?.paragraphClass || '').trim()
  const openingTag = paragraphClass ? `<p class="${escapeRegExp(paragraphClass)}">` : '<p>'
  const token = `${escapeRegExp(TRUSTED_HTML_PLACEHOLDER_PREFIX)}(\\d+)`
  // The wrapped form is tried first so the paragraph element is removed with it.
  const placeholderPattern = new RegExp(`${openingTag}${token}</p>\\n?|${token}`, 'g')

  // A replacer function is used so the block text is never parsed for "$" patterns.
  return String(html ?? '').replace(placeholderPattern, (match, wrappedIndex, bareIndex) => {
    const index = Number(wrappedIndex ?? bareIndex)
    return index < blocks.length ? String(blocks[index]) : match
  })
}