feat: 增强知识库功能,优化索引和RAG检索
This commit is contained in:
@@ -33,6 +33,7 @@ DEFAULT_LIGHTRAG_QUERY_MODE = "naive"
|
||||
DEFAULT_LLM_TIMEOUT_SECONDS = 180
|
||||
DEFAULT_EMBEDDING_TIMEOUT_SECONDS = 120
|
||||
MAX_KNOWLEDGE_HIT_CONTENT_LENGTH = 2200
|
||||
MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH = 220
|
||||
MAX_QUERY_TERMS = 12
|
||||
QUERY_TERM_STOPWORDS = {
|
||||
"什么",
|
||||
@@ -62,6 +63,13 @@ TABLE_OR_STANDARD_QUERY_HINTS = (
|
||||
"档位",
|
||||
"额度",
|
||||
)
|
||||
# Heading markers looked up by _leading_structured_appendix_marker(): a chunk
# whose text contains one of these headings close to its start is treated as
# beginning with that structured appendix section.
# Rough English glosses of the literals, in order: "chapter navigation",
# "key chapter excerpts", "Q&A hints supplement", "structured table supplement".
STRUCTURED_APPENDIX_LEADING_MARKERS = (
|
||||
"# 章节导航",
|
||||
"# 重点章节摘录",
|
||||
"# 问答线索补充",
|
||||
"# 结构化表格补充",
|
||||
)
|
||||
# Largest character offset (measured after leading whitespace is stripped) at
# which a marker may start and still be considered "leading".
STRUCTURED_APPENDIX_LEADING_WINDOW = 220
|
||||
|
||||
_runtime_lock = threading.RLock()
|
||||
_runtime_instance: _LightRagRuntime | None = None
|
||||
@@ -830,7 +838,11 @@ class KnowledgeRagService:
|
||||
document_id, document_name = _parse_document_identity(file_path)
|
||||
normalized_chunk_id = chunk_id or f"path-{rank}"
|
||||
normalized_content = _truncate_text(content, max_length=MAX_KNOWLEDGE_HIT_CONTENT_LENGTH)
|
||||
excerpt = _build_excerpt(normalized_content, max_length=220)
|
||||
excerpt = _build_query_focused_excerpt(
|
||||
normalized_content,
|
||||
query_terms=query_terms,
|
||||
max_length=MAX_KNOWLEDGE_HIT_EXCERPT_LENGTH,
|
||||
)
|
||||
candidates.append(
|
||||
{
|
||||
"code": f"knowledge.{document_id or 'unknown'}.{normalized_chunk_id}",
|
||||
@@ -907,8 +919,12 @@ class KnowledgeRagService:
|
||||
@staticmethod
|
||||
def is_query_ready_status(status_obj: Any) -> bool:
|
||||
status_text = KnowledgeRagService._status_value(status_obj)
|
||||
if status_text in {"failed", "error", "aborted"}:
|
||||
return False
|
||||
if status_text == "processed":
|
||||
return True
|
||||
if status_text in {"pending", "processing", "preprocessed"}:
|
||||
return False
|
||||
|
||||
chunks_count = getattr(status_obj, "chunks_count", None)
|
||||
if chunks_count is None and isinstance(status_obj, dict):
|
||||
@@ -1168,6 +1184,35 @@ def _build_excerpt(text: str, *, max_length: int = 180) -> str:
|
||||
return f"{normalized[: max_length - 3].rstrip()}..."
|
||||
|
||||
|
||||
def _build_query_focused_excerpt(
|
||||
text: str,
|
||||
*,
|
||||
query_terms: list[str],
|
||||
max_length: int = 180,
|
||||
) -> str:
|
||||
normalized = " ".join(str(text or "").split()).strip()
|
||||
if not normalized:
|
||||
return ""
|
||||
|
||||
lowered = normalized.lower()
|
||||
match_positions = [
|
||||
lowered.find(term)
|
||||
for term in query_terms
|
||||
if term and lowered.find(term) >= 0
|
||||
]
|
||||
if not match_positions:
|
||||
return _build_excerpt(normalized, max_length=max_length)
|
||||
|
||||
start = max(0, min(match_positions) - max_length // 3)
|
||||
end = min(len(normalized), start + max_length)
|
||||
snippet = normalized[start:end].strip()
|
||||
if start > 0:
|
||||
snippet = f"...{snippet.lstrip()}"
|
||||
if end < len(normalized):
|
||||
snippet = f"{snippet.rstrip()}..."
|
||||
return snippet
|
||||
|
||||
|
||||
def _truncate_text(text: str, *, max_length: int) -> str:
|
||||
normalized = str(text or "").strip()
|
||||
if len(normalized) <= max_length:
|
||||
@@ -1243,19 +1288,43 @@ def _score_knowledge_hit(
|
||||
score += len(matched_terms) * 8
|
||||
score += sum(1 for term in matched_terms if term in title) * 6
|
||||
|
||||
if "结构化表格补充" in content:
|
||||
score += 18
|
||||
if "问答线索补充" in content:
|
||||
score += 16 if not prefers_tabular_evidence else 8
|
||||
if "重点章节摘录" in content:
|
||||
leading_appendix_marker = _leading_structured_appendix_marker(content)
|
||||
if leading_appendix_marker == "# 章节导航":
|
||||
score -= 24
|
||||
elif leading_appendix_marker == "# 重点章节摘录":
|
||||
score += 4 if matched_terms else -12
|
||||
elif leading_appendix_marker == "# 问答线索补充":
|
||||
score += 8 if matched_terms and not prefers_tabular_evidence else 2 if matched_terms else -20
|
||||
elif leading_appendix_marker == "# 结构化表格补充":
|
||||
if prefers_tabular_evidence and matched_terms:
|
||||
score += 16
|
||||
elif matched_terms:
|
||||
score += 6
|
||||
else:
|
||||
score -= 18
|
||||
|
||||
if prefers_tabular_evidence and matched_terms and ("|" in content or "表" in content):
|
||||
score += 10
|
||||
if "章节导航" in content:
|
||||
if matched_terms and any(marker in content for marker in (":", ":")):
|
||||
score += 10
|
||||
if matched_terms and "\n" in content:
|
||||
score += 4
|
||||
if prefers_tabular_evidence and ("|" in content or "表" in content or "结构化表格补充" in content):
|
||||
score += 12
|
||||
if not prefers_tabular_evidence and any(marker in content for marker in ("第", "条", ":", "-", "•")):
|
||||
if matched_terms and any(marker in content for marker in ("附表", "第", "条")):
|
||||
score += 4
|
||||
if not prefers_tabular_evidence and matched_terms and any(marker in content for marker in ("第", "条", ":", "-", "•")):
|
||||
score += 4
|
||||
if title and any(term in title for term in query_terms):
|
||||
score += 6
|
||||
if re.search(r"没有.{0,8}(信息|规定|说明|依据)", content):
|
||||
score -= 12
|
||||
|
||||
return score
|
||||
|
||||
|
||||
def _leading_structured_appendix_marker(content: str) -> str:
|
||||
normalized = str(content or "").lstrip()
|
||||
for marker in STRUCTURED_APPENDIX_LEADING_MARKERS:
|
||||
index = normalized.find(marker)
|
||||
if 0 <= index <= STRUCTURED_APPENDIX_LEADING_WINDOW:
|
||||
return marker
|
||||
return ""
|
||||
|
||||
Reference in New Issue
Block a user