feat(agents): Phase 8.4-10.5 built-in plugins, bundled skills, coordinator
This commit is contained in:
290
development-doc/plan/rag-update/phase-r-3-dynamic-weight.md
Normal file
290
development-doc/plan/rag-update/phase-r-3-dynamic-weight.md
Normal file
@@ -0,0 +1,290 @@
|
||||
# Phase R.3:动态权重增强
|
||||
|
||||
日期:2026-04-03
|
||||
状态:已规划
|
||||
依赖:R.1(Token 感知分块)
|
||||
工作量:4.5 天
|
||||
|
||||
---
|
||||
|
||||
## 1. 本阶段目的
|
||||
|
||||
根据查询特性动态调整检索策略,支持核心标签加权。
|
||||
|
||||
---
|
||||
|
||||
## 2. 核心任务
|
||||
|
||||
### Task R.3.1:实现查询特性分析
|
||||
|
||||
**目标:** 分析查询类型(代码/表格/对话式)
|
||||
|
||||
**新增文件:** `backend/app/services/query_analyzer.py`
|
||||
|
||||
```python
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
class QueryProfile:
    """Features extracted from one search query.

    Attributes:
        logic_depth: How explicit the intent is, on a 0-1 scale.
        is_code_related: True when the query mentions code.
        is_table_related: True when the query mentions tabular data.
        keyword_density: Distinct word tokens divided by query length.
        is_conversational: True when the query is phrased conversationally.
    """

    logic_depth: float
    is_code_related: bool
    is_table_related: bool
    keyword_density: float
    is_conversational: bool
|
||||
|
||||
class QueryAnalyzer:
    """Derive a QueryProfile from the raw text of a search query.

    Matching rules:
      * ASCII keywords must equal a whole lowercase word token, so "how"
        does not fire inside "show".
      * CJK text is written without spaces, so the word regex returns a
        whole run of characters as a single token. CJK keywords of two or
        more characters are therefore matched as substrings.
      * Single-character CJK keywords keep the whole-token rule because
        they occur inside many unrelated words.
    """

    CODE_KEYWORDS = {'code', 'function', 'class', 'api', 'python', 'js', 'bug', '函数', '代码'}
    TABLE_KEYWORDS = {'table', 'sheet', 'excel', 'csv', 'column', 'row', '数据', '统计', '表格', '列', '行'}
    QUESTION_MARKERS = ('how', 'why', 'what', 'which', '哪个', '如何', '为什么', '怎么')
    CONVERSATIONAL_PATTERNS = ('你', '我想', '能不能', '可以帮我', 'what do you think')

    def analyze(self, query: str) -> "QueryProfile":
        """Build the profile for ``query``.

        Args:
            query: Raw query text as typed by the user.

        Returns:
            A QueryProfile whose flags follow the matching rules described
            on the class.
        """
        lowered = query.lower()
        words = set(re.findall(r'\w+', lowered))

        return QueryProfile(
            logic_depth=self._calc_logic_depth(query),
            is_code_related=self._contains_any(lowered, words, self.CODE_KEYWORDS),
            is_table_related=self._contains_any(lowered, words, self.TABLE_KEYWORDS),
            # max(..., 1) keeps the empty query from dividing by zero.
            keyword_density=len(words) / max(len(query), 1),
            is_conversational=self._is_conversational(query),
        )

    @staticmethod
    def _contains_any(lowered: str, words: set, keywords) -> bool:
        """Return True if any keyword occurs in the query.

        Args:
            lowered: The lowercased query text.
            words: The set of word tokens extracted from ``lowered``.
            keywords: Iterable of lowercase keywords to look for.
        """
        for keyword in keywords:
            if keyword.isascii() or len(keyword) < 2:
                # Whole-token match avoids hits inside longer words.
                if keyword in words:
                    return True
            elif keyword in lowered:
                # Multi-character CJK keyword: tokens are unusable, so
                # fall back to substring search.
                return True
        return False

    def _calc_logic_depth(self, query: str) -> float:
        """Score how focused the query is.

        Returns 0.8 when the query contains a question marker and more
        than three words of five or more characters, otherwise 0.5.
        """
        lowered = query.lower()
        words = set(re.findall(r'\w+', lowered))
        has_question = self._contains_any(lowered, words, self.QUESTION_MARKERS)
        has_specific_terms = len(re.findall(r'\w{5,}', query)) > 3
        return 0.8 if (has_question and has_specific_terms) else 0.5

    def _is_conversational(self, query: str) -> bool:
        """Return True when the query contains a conversational phrase."""
        # Lowercase first so that "What do you think" matches as well.
        lowered = query.lower()
        return any(p in lowered for p in self.CONVERSATIONAL_PATTERNS)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task R.3.2:实现动态 Reranker
|
||||
|
||||
**目标:** 根据查询类型动态调整语义/关键词/标题权重
|
||||
|
||||
**新增文件:** `backend/app/services/dynamic_reranker.py`
|
||||
|
||||
```python
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
|
||||
class DynamicReranker:
    """Rerank retrieval results with weights chosen from the query profile.

    The final score of a result is::

        semantic * w_semantic * beta + keyword * w_keyword + title * w_title

    plus a fixed bonus for table chunks when the query is table related.
    """

    def rerank(
        self,
        query: str,
        results: "list[SearchResult]",
        analyzer: "QueryAnalyzer",
    ) -> "list[SearchResult]":
        """Rescore ``results`` for ``query`` and return them best-first.

        Args:
            query: Raw query text.
            results: Candidates; each must expose ``score``, ``content``,
                ``document_title`` and ``metadata_``.
            analyzer: Object whose ``analyze(query)`` returns the profile.

        Returns:
            A new list sorted by descending score. Ties keep their input
            order. Each result's ``score`` is overwritten with the
            reranked value so that later stages which sort on ``score``
            keep this ordering.
        """
        profile = analyzer.analyze(query)
        weights = self._get_weights(profile)
        beta = self._calc_beta(profile)

        for r in results:
            # beta scales the semantic term only. A factor applied to the
            # whole score is the same for every candidate of a query and
            # could never change the ordering.
            score = r.score * weights["semantic"] * beta
            score += self._keyword_score(query, r.content) * weights["keyword"]
            score += self._title_score(query, r.document_title) * weights["title"]

            # Bonus for table content.
            if profile.is_table_related:
                content_type = self._load_metadata(r.metadata_).get("content_type")
                if content_type == "table_schema":
                    score += 0.25
                elif content_type == "table_rows":
                    score += 0.15

            r.score = score

        return sorted(results, key=lambda item: item.score, reverse=True)

    @staticmethod
    def _load_metadata(raw) -> dict:
        """Return chunk metadata as a dict; empty on missing or bad input."""
        if isinstance(raw, dict):
            return raw
        if not raw:
            return {}
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError):
            return {}
        return parsed if isinstance(parsed, dict) else {}

    @staticmethod
    def _overlap(query, text) -> float:
        """Return the fraction of distinct query tokens found in ``text``."""
        import re

        if not query or not text:
            return 0.0
        terms = set(re.findall(r"\w+", query.lower()))
        if not terms:
            return 0.0
        haystack = text.lower()
        return sum(term in haystack for term in terms) / len(terms)

    def _keyword_score(self, query: str, content) -> float:
        """Lexical overlap between query and chunk content, in 0-1.

        NOTE(review): simple stand-in; swap for the project's keyword
        scorer (e.g. BM25) if one exists.
        """
        return self._overlap(query, content)

    def _title_score(self, query: str, title) -> float:
        """Lexical overlap between query and document title, in 0-1."""
        return self._overlap(query, title)

    def _get_weights(self, profile: "QueryProfile") -> dict:
        """Pick the semantic/keyword/title weights for a profile.

        Precedence is code, then table, then conversational, then default.
        """
        if profile.is_code_related:
            return {"semantic": 0.55, "keyword": 0.35, "title": 0.10}
        elif profile.is_table_related:
            return {"semantic": 0.50, "keyword": 0.30, "title": 0.20}
        elif profile.is_conversational:
            return {"semantic": 0.85, "keyword": 0.10, "title": 0.05}
        else:
            return {"semantic": 0.70, "keyword": 0.20, "title": 0.10}

    def _calc_beta(self, profile: "QueryProfile") -> float:
        """Return the semantic multiplier derived from logic depth.

        1.2 above a depth of 0.7, 0.8 below 0.4, otherwise 1.0.
        """
        if profile.logic_depth > 0.7:
            return 1.2  # clear intent: trust semantic similarity more
        elif profile.logic_depth < 0.4:
            return 0.8  # vague intent: trust semantic similarity less
        return 1.0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task R.3.3:实现核心标签系统
|
||||
|
||||
**目标:** 核心标签 1.33x 加权
|
||||
|
||||
**新增文件:** `backend/app/services/core_tag_search.py`
|
||||
|
||||
```python
|
||||
class CoreTagAwareSearch:
    """Boost results whose chunk tags overlap the caller's core tags.

    NOTE(review): tags are read from the ``tags`` key of each result's
    ``metadata_``; confirm that the ingestion side writes them there.
    """

    CORE_BOOST_FACTOR = 1.33  # 33% boost

    async def search(
        self,
        query: str,
        user_id: str,
        core_tags: list[str] = None,
        base_search_fn: callable = None,
    ) -> "list[SearchResult]":
        """Run the base search and boost core-tagged results.

        Args:
            query: Raw query text, forwarded to ``base_search_fn``.
            user_id: Forwarded to ``base_search_fn``.
            core_tags: Tags to boost. ``None`` or empty means no boost.
            base_search_fn: Callable taking ``(query, user_id)`` and
                returning the results, or an awaitable of them. Required;
                it only carries a default because it follows a defaulted
                parameter.

        Returns:
            The results sorted by descending ``score``. The ``score`` of
            each boosted result is multiplied in place by
            ``CORE_BOOST_FACTOR``.

        Raises:
            TypeError: If ``base_search_fn`` is not supplied.
        """
        import inspect
        import json

        if base_search_fn is None:
            raise TypeError("base_search_fn is required")

        # Accept sync and async search functions alike; awaiting a plain
        # list would raise TypeError.
        results = base_search_fn(query, user_id)
        if inspect.isawaitable(results):
            results = await results

        if core_tags:
            for r in results:
                raw = r.metadata_
                if isinstance(raw, dict):
                    meta = raw
                else:
                    try:
                        meta = json.loads(raw or "{}")
                    except (TypeError, ValueError):
                        meta = {}
                    if not isinstance(meta, dict):
                        meta = {}

                # "or []" also covers an explicit null stored under "tags".
                chunk_tags = meta.get("tags") or []

                if any(tag in chunk_tags for tag in core_tags):
                    r.score *= self.CORE_BOOST_FACTOR

        return sorted(results, key=lambda x: x.score, reverse=True)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. 修改现有文件
|
||||
|
||||
### `backend/app/models/document.py`
|
||||
|
||||
增加 `tags` 和 `is_core` 字段:
|
||||
|
||||
```python
|
||||
class DocumentChunk(Base):
|
||||
# ... existing fields ...
|
||||
|
||||
tags = Column(JSON, default=list) # ["重要", "代码", "架构"]
|
||||
is_core = Column(Boolean, default=False) # 是否核心切片
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `backend/app/services/knowledge_service.py`
|
||||
|
||||
集成动态权重:
|
||||
|
||||
```python
|
||||
from app.services.query_analyzer import QueryAnalyzer
|
||||
from app.services.dynamic_reranker import DynamicReranker
|
||||
from app.services.core_tag_search import CoreTagAwareSearch
|
||||
|
||||
class KnowledgeService:
|
||||
def __init__(self, ...):
|
||||
# ... existing init
|
||||
self.query_analyzer = QueryAnalyzer()
|
||||
self.dynamic_reranker = DynamicReranker()
|
||||
self.core_tag_search = CoreTagAwareSearch()
|
||||
|
||||
async def retrieve(self, query: str, user_id: str, ..., core_tags: list[str] = None) -> list[SearchResult]:
|
||||
# ... existing retrieval logic ...
|
||||
|
||||
# 动态 Rerank
|
||||
results = self.dynamic_reranker.rerank(
|
||||
query, results, self.query_analyzer
|
||||
)
|
||||
|
||||
# 核心标签加权
|
||||
if core_tags:
|
||||
results = await self.core_tag_search.search(
|
||||
query, user_id, core_tags,
|
||||
lambda q, u: results # 使用已检索的结果
|
||||
)
|
||||
|
||||
return results
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. 新增测试
|
||||
|
||||
**新增文件:** `backend/tests/services/test_dynamic_reranker.py`
|
||||
|
||||
```python
|
||||
import pytest
|
||||
from app.services.query_analyzer import QueryAnalyzer, QueryProfile
|
||||
from app.services.dynamic_reranker import DynamicReranker
|
||||
|
||||
class TestQueryAnalyzer:
    """Smoke tests for the query-type flags produced by QueryAnalyzer."""

    @staticmethod
    def _profile_for(text):
        """Analyze ``text`` with a fresh analyzer and return its profile."""
        return QueryAnalyzer().analyze(text)

    def test_code_query_detection(self):
        """A query naming Python and code is flagged as code related."""
        result = self._profile_for("请解释这段 Python 代码")
        assert result.is_code_related is True

    def test_table_query_detection(self):
        """A query naming Excel is flagged as table related."""
        result = self._profile_for("统计这个 Excel 表格的总和")
        assert result.is_table_related is True

    def test_conversational_detection(self):
        """A first-person request is flagged as conversational."""
        result = self._profile_for("我想了解一下")
        assert result.is_conversational is True
|
||||
|
||||
class TestDynamicReranker:
    """Unit tests for the weight table chosen by DynamicReranker."""

    def test_code_query_weights(self):
        """Code queries move weight from semantic similarity to keywords."""
        reranker = DynamicReranker()

        profile = QueryProfile(
            logic_depth=0.5,
            is_code_related=True,
            is_table_related=False,
            keyword_density=0.3,
            is_conversational=False,
        )

        weights = reranker._get_weights(profile)

        # Pin the exact table so an accidental edit is caught.
        assert weights == {"semantic": 0.55, "keyword": 0.35, "title": 0.10}
        # Keyword weight is comparatively high for code queries.
        assert weights["keyword"] > weights["semantic"] * 0.5
        # The three components must stay a convex combination.
        assert sum(weights.values()) == pytest.approx(1.0)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. 验收标准
|
||||
|
||||
- [ ] 查询特性分析准确(代码/表格/对话式识别)
|
||||
- [ ] 动态权重根据查询类型调整
|
||||
- [ ] 核心标签检索加权 1.33x
|
||||
- [ ] Rerank 集成测试通过
|
||||
|
||||
---
|
||||
|
||||
## 6. 变更文件清单
|
||||
|
||||
| 文件 | 操作 | 说明 |
|
||||
|------|------|------|
|
||||
| `backend/app/services/query_analyzer.py` | 新增 | 查询特性分析 |
|
||||
| `backend/app/services/dynamic_reranker.py` | 新增 | 动态 Reranker |
|
||||
| `backend/app/services/core_tag_search.py` | 新增 | 核心标签检索 |
|
||||
| `backend/app/services/knowledge_service.py` | 修改 | 集成动态权重 |
|
||||
| `backend/app/models/document.py` | 修改 | 增加 tags/is_core 字段 |
|
||||
| `backend/tests/services/test_dynamic_reranker.py` | 新增 | 动态 Reranker 测试 |
|
||||
|
||||
---
|
||||
|
||||
## 7. 工作量估算
|
||||
|
||||
| 任务 | 估算 |
|
||||
|------|------|
|
||||
| R.3.1 查询特性分析 | 1 天 |
|
||||
| R.3.2 动态 Reranker | 1 天 |
|
||||
| R.3.3 核心标签系统 | 1 天 |
|
||||
| 测试 + 调试 | 1.5 天 |
|
||||
| **R.3 总计** | **4.5 天** |
|
||||
Reference in New Issue
Block a user