# Phase R.3:动态权重增强

- 日期:2026-04-03
- 状态:已规划
- 依赖:R.1(Token 感知分块)
- 工作量:4.5 天

---

## 1. 本阶段目的

根据查询特性动态调整检索策略,支持核心标签加权。

---

## 2. 核心任务

### Task R.3.1:实现查询特性分析

**目标:** 分析查询类型(代码/表格/对话式)

**新增文件:** `backend/app/services/query_analyzer.py`

```python
import re
from dataclasses import dataclass


@dataclass
class QueryProfile:
    """Features extracted from a single query; consumed by the reranker."""

    logic_depth: float       # 0-1: how clearly the intent is expressed
    is_code_related: bool    # query mentions a code keyword
    is_table_related: bool   # query mentions a table/spreadsheet keyword
    keyword_density: float   # distinct tokens per character of the query
    is_conversational: bool  # query is phrased conversationally


class QueryAnalyzer:
    """Classify a query (code / table / conversational) with keyword heuristics."""

    CODE_KEYWORDS = {'code', 'function', 'class', 'api', 'python', 'js', 'bug', '函数', '代码'}
    TABLE_KEYWORDS = {'table', 'sheet', 'excel', 'csv', 'column', 'row', '数据', '统计', '表格', '列', '行'}
    QUESTION_MARKERS = {'how', 'why', 'what', 'which', '哪个', '如何', '为什么', '怎么'}
    CONVERSATIONAL_PATTERNS = ('你', '我想', '能不能', '可以帮我', 'what do you think')

    def analyze(self, query: str) -> QueryProfile:
        """Return the feature profile of *query*."""
        text = query.lower()
        tokens = set(re.findall(r'\w+', text))
        return QueryProfile(
            logic_depth=self._calc_logic_depth(query),
            is_code_related=self._mentions(text, tokens, self.CODE_KEYWORDS),
            is_table_related=self._mentions(text, tokens, self.TABLE_KEYWORDS),
            keyword_density=len(tokens) / max(len(query), 1),
            is_conversational=self._is_conversational(query),
        )

    @staticmethod
    def _mentions(text: str, tokens: set[str], keywords: set[str]) -> bool:
        """Return True if any keyword occurs in the query.

        ``\\w+`` captures an unsegmented CJK run as ONE token, so a keyword such
        as '函数' never equals a token of "这个函数怎么用". Multi-character CJK
        keywords are therefore also matched as substrings. ASCII keywords and
        single-character CJK keywords ('列', '行') stay token-only, otherwise
        'row' would match "arrow" and '行' would match "运行".
        """
        for keyword in keywords:
            if keyword in tokens:
                return True
            if len(keyword) > 1 and not keyword.isascii() and keyword in text:
                return True
        return False

    def _calc_logic_depth(self, query: str) -> float:
        """Score how focused the query is: question words and specific terms raise it.

        Returns 0.8 when both signals are present, 0.5 when exactly one is, and
        0.3 when neither is, so that the low-depth branch of
        ``DynamicReranker._calc_beta`` (< 0.4) is actually reachable.
        """
        text = query.lower()
        tokens = set(re.findall(r'\w+', text))
        has_question = self._mentions(text, tokens, self.QUESTION_MARKERS)
        has_specific_terms = len(re.findall(r'\w{5,}', query)) > 3
        if has_question and has_specific_terms:
            return 0.8
        if has_question or has_specific_terms:
            return 0.5
        return 0.3

    def _is_conversational(self, query: str) -> bool:
        """Return True if the query contains a conversational phrase."""
        # Lower-case first so that "What do you think" matches the English pattern.
        text = query.lower()
        return any(pattern in text for pattern in self.CONVERSATIONAL_PATTERNS)
```

---

### Task R.3.2:实现动态 Reranker

**目标:** 根据查询类型动态调整语义/关键词/标题权重

**新增文件:** `backend/app/services/dynamic_reranker.py`

```python
import json
import re
from dataclasses import dataclass

from app.services.query_analyzer import QueryAnalyzer, QueryProfile
# NOTE(review): SearchResult must also be imported here from the module that
# defines it in the existing retrieval code (path not shown in this plan).


class DynamicReranker:
    """Rerank search results with weights chosen from the query profile."""

    def rerank(
        self,
        query: str,
        results: list[SearchResult],
        analyzer: QueryAnalyzer
    ) -> list[SearchResult]:
        """Rescore *results* for *query* and return them best-first.

        The final score is written back to ``r.score`` so that later stages
        which sort on ``r.score`` (the core-tag boost) build on the reranked
        score instead of discarding it. Call this once per result list:
        calling it again would rescore already-rescored values.
        """
        profile = analyzer.analyze(query)
        weights = self._get_weights(profile)
        beta = self._calc_beta(profile)

        for r in results:
            # Beta scales only the semantic term. Multiplying the whole score
            # by the same constant for every result cannot change the ranking.
            score = r.score * weights["semantic"] * beta
            score += self._keyword_score(query, r.content) * weights["keyword"]
            score += self._title_score(query, r.document_title) * weights["title"]

            # Bonus for table chunks when the query is about tables.
            if profile.is_table_related:
                content_type = self._load_metadata(r.metadata_).get("content_type")
                if content_type == "table_schema":
                    score += 0.25
                elif content_type == "table_rows":
                    score += 0.15

            r.score = score

        # sorted() is stable, so ties keep their retrieval order.
        return sorted(results, key=lambda r: r.score, reverse=True)

    def _get_weights(self, profile: QueryProfile) -> dict:
        """Return semantic/keyword/title weights; code > table > conversational > default."""
        if profile.is_code_related:
            return {"semantic": 0.55, "keyword": 0.35, "title": 0.10}
        elif profile.is_table_related:
            return {"semantic": 0.50, "keyword": 0.30, "title": 0.20}
        elif profile.is_conversational:
            return {"semantic": 0.85, "keyword": 0.10, "title": 0.05}
        else:
            return {"semantic": 0.70, "keyword": 0.20, "title": 0.10}

    def _calc_beta(self, profile: QueryProfile) -> float:
        """Return the multiplier applied to the semantic term."""
        if profile.logic_depth > 0.7:
            return 1.2  # clear intent: trust semantic similarity more
        elif profile.logic_depth < 0.4:
            return 0.8  # vague intent: trust semantic similarity less
        return 1.0

    def _keyword_score(self, query: str, content: str) -> float:
        """Return the fraction of query tokens that occur in the chunk content (0-1)."""
        return self._token_overlap(query, content)

    def _title_score(self, query: str, title: str) -> float:
        """Return the fraction of query tokens that occur in the document title (0-1)."""
        return self._token_overlap(query, title)

    @staticmethod
    def _token_overlap(query: str, text: str) -> float:
        """Baseline lexical score: share of query tokens present in *text*.

        Returns 0.0 for an empty query or a missing/empty text. Tokens are
        ``\\w+`` runs, so unsegmented CJK text matches only on whole runs; swap
        in a segmenter or BM25 if that proves too coarse.
        """
        query_tokens = set(re.findall(r'\w+', query.lower()))
        if not query_tokens or not text:
            return 0.0
        text_tokens = set(re.findall(r'\w+', text.lower()))
        return len(query_tokens & text_tokens) / len(query_tokens)

    @staticmethod
    def _load_metadata(raw) -> dict:
        """Parse chunk metadata; return {} for missing, malformed, or non-object JSON."""
        if isinstance(raw, dict):
            return raw
        try:
            meta = json.loads(raw or "{}")
        except (TypeError, ValueError):
            return {}
        return meta if isinstance(meta, dict) else {}
```

---

### Task R.3.3:实现核心标签系统

**目标:** 核心标签 1.33x 加权

**新增文件:** `backend/app/services/core_tag_search.py`

```python
import json
from typing import Awaitable, Callable, Optional

# NOTE(review): SearchResult must also be imported here from the module that
# defines it in the existing retrieval code (path not shown in this plan).


class CoreTagAwareSearch:
    """Search wrapper that boosts chunks carrying one of the caller's core tags."""

    CORE_BOOST_FACTOR = 1.33  # +33% score for chunks with a core tag

    async def search(
        self,
        query: str,
        user_id: str,
        core_tags: Optional[list[str]],
        base_search_fn: Callable[[str, str], Awaitable[list[SearchResult]]]
    ) -> list[SearchResult]:
        """Run *base_search_fn* and return its results with the core-tag boost applied.

        ``core_tags`` has no default because a defaulted parameter may not
        precede the required ``base_search_fn``; pass ``None`` for "no boost".
        ``base_search_fn`` must be an async callable taking (query, user_id).
        """
        results = await base_search_fn(query, user_id)
        return self.apply_boost(results, core_tags)

    def apply_boost(
        self,
        results: list[SearchResult],
        core_tags: Optional[list[str]]
    ) -> list[SearchResult]:
        """Boost already-retrieved results in place and return them best-first.

        Use this when results are already in hand; it needs no await. Each
        matching result's ``score`` is multiplied by ``CORE_BOOST_FACTOR``
        once, however many of its tags match.
        """
        if core_tags:
            for r in results:
                meta = json.loads(r.metadata_ or "{}")
                chunk_tags = meta.get("tags") or []
                if any(tag in chunk_tags for tag in core_tags):
                    r.score *= self.CORE_BOOST_FACTOR
        return sorted(results, key=lambda x: x.score, reverse=True)
```

---

## 3. 修改现有文件

### `backend/app/models/document.py`

增加 `tags` 和 `is_core` 字段:

```python
class DocumentChunk(Base):
    # ... existing fields ...
    tags = Column(JSON, default=list)          # e.g. ["重要", "代码", "架构"]
    is_core = Column(Boolean, default=False)   # whether this is a core chunk
```

---

### `backend/app/services/knowledge_service.py`

集成动态权重:

```python
from app.services.query_analyzer import QueryAnalyzer
from app.services.dynamic_reranker import DynamicReranker
from app.services.core_tag_search import CoreTagAwareSearch


class KnowledgeService:
    def __init__(self, ...):
        # ... existing init
        self.query_analyzer = QueryAnalyzer()
        self.dynamic_reranker = DynamicReranker()
        self.core_tag_search = CoreTagAwareSearch()

    async def retrieve(self, query: str, user_id: str, ..., core_tags: list[str] = None) -> list[SearchResult]:
        # ... existing retrieval logic ...

        # Dynamic rerank (writes the reranked score back to each result).
        results = self.dynamic_reranker.rerank(
            query, results, self.query_analyzer
        )

        # Core-tag boost on the results we already have. apply_boost is
        # synchronous; search() would await its callback, and a lambda that
        # returns a plain list is not awaitable.
        if core_tags:
            results = self.core_tag_search.apply_boost(results, core_tags)

        return results
```

---

## 4. 新增测试

**新增文件:** `backend/tests/services/test_dynamic_reranker.py`

```python
from types import SimpleNamespace

import pytest

from app.services.core_tag_search import CoreTagAwareSearch
from app.services.dynamic_reranker import DynamicReranker
from app.services.query_analyzer import QueryAnalyzer, QueryProfile


def make_result(score, content="", title=None, metadata=None):
    """Build a stand-in exposing the attributes the reranker reads."""
    return SimpleNamespace(
        score=score, content=content, document_title=title, metadata_=metadata
    )


class TestQueryAnalyzer:
    def test_code_query_detection(self):
        analyzer = QueryAnalyzer()
        profile = analyzer.analyze("请解释这段 Python 代码")
        assert profile.is_code_related is True

    def test_chinese_keyword_inside_sentence(self):
        analyzer = QueryAnalyzer()
        profile = analyzer.analyze("这个函数怎么用")
        assert profile.is_code_related is True

    def test_table_query_detection(self):
        analyzer = QueryAnalyzer()
        profile = analyzer.analyze("统计这个 Excel 表格的总和")
        assert profile.is_table_related is True

    def test_conversational_detection(self):
        analyzer = QueryAnalyzer()
        profile = analyzer.analyze("我想了解一下")
        assert profile.is_conversational is True

    def test_conversational_detection_is_case_insensitive(self):
        analyzer = QueryAnalyzer()
        profile = analyzer.analyze("What do you think about this design")
        assert profile.is_conversational is True


class TestDynamicReranker:
    def test_code_query_weights(self):
        reranker = DynamicReranker()
        code_profile = QueryProfile(
            logic_depth=0.5,
            is_code_related=True,
            is_table_related=False,
            keyword_density=0.3,
            is_conversational=False
        )
        default_profile = QueryProfile(
            logic_depth=0.5,
            is_code_related=False,
            is_table_related=False,
            keyword_density=0.3,
            is_conversational=False
        )
        code_weights = reranker._get_weights(code_profile)
        default_weights = reranker._get_weights(default_profile)
        # Code queries weight keywords more heavily than the default profile.
        assert code_weights["keyword"] > default_weights["keyword"]
        assert sum(code_weights.values()) == pytest.approx(1.0)

    def test_rerank_prefers_keyword_match(self):
        reranker = DynamicReranker()
        hit = make_result(0.5, content="fix the python bug", title="python")
        miss = make_result(0.6, content="unrelated")
        ranked = reranker.rerank("python bug", [miss, hit], QueryAnalyzer())
        assert ranked[0] is hit
        # 0.5 * 0.55 * 0.8 (semantic) + 1.0 * 0.35 (keyword) + 0.5 * 0.10 (title)
        assert hit.score == pytest.approx(0.62)


class TestCoreTagAwareSearch:
    def test_core_tag_boost_reorders_results(self):
        tagged = make_result(0.5, metadata='{"tags": ["重要"]}')
        plain = make_result(0.6)
        boosted = CoreTagAwareSearch().apply_boost([plain, tagged], ["重要"])
        assert boosted[0] is tagged
        assert tagged.score == pytest.approx(0.5 * 1.33)
```

---

## 5. 
验收标准

- [ ] 查询特性分析准确(代码/表格/对话式识别)
- [ ] 动态权重根据查询类型调整
- [ ] 核心标签检索加权 1.33x
- [ ] Rerank 集成测试通过

---

## 6. 变更文件清单

| 文件 | 操作 | 说明 |
|------|------|------|
| `backend/app/services/query_analyzer.py` | 新增 | 查询特性分析 |
| `backend/app/services/dynamic_reranker.py` | 新增 | 动态 Reranker |
| `backend/app/services/core_tag_search.py` | 新增 | 核心标签检索 |
| `backend/app/services/knowledge_service.py` | 修改 | 集成动态权重 |
| `backend/app/models/document.py` | 修改 | 增加 tags/is_core 字段 |
| `backend/tests/services/test_dynamic_reranker.py` | 新增 | 动态 Reranker 测试 |

---

## 7. 工作量估算

| 任务 | 估算 |
|------|------|
| R.3.1 查询特性分析 | 1 天 |
| R.3.2 动态 Reranker | 1 天 |
| R.3.3 核心标签系统 | 1 天 |
| 测试 + 调试 | 1.5 天 |
| **R.3 总计** | **4.5 天** |