245 lines
6.4 KiB
Markdown
245 lines
6.4 KiB
Markdown
|
|
# Phase R.2:多索引架构
|
|||
|
|
|
|||
|
|
日期:2026-04-03
|
|||
|
|
状态:已规划
|
|||
|
|
依赖:R.1(Token 感知分块)
|
|||
|
|
工作量:4 天
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 1. 本阶段目的
|
|||
|
|
|
|||
|
|
按知识类型/重要性分层,支持懒加载和 LRU 淘汰。
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 2. 核心任务
|
|||
|
|
|
|||
|
|
### Task R.2.1:设计 Collection 分离策略
|
|||
|
|
|
|||
|
|
**目标:** 按知识类型/重要性分离 ChromaDB Collection(每个用户、每种索引类型对应一个独立的 Collection)
|
|||
|
|
|
|||
|
|
**新增文件:** `backend/app/services/multi_index.py`
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
class MultiIndexManager:
    """Manage one ChromaDB collection per user and per index type.

    Each entry of ``INDEX_STRATEGIES`` describes one index type; its ``name``
    is a template that is expanded with the user id to obtain the collection
    name.
    """

    # Collection-name templates keyed by index type. ``{user_id}`` is filled in
    # by ``get_collection``. The ``description`` values are runtime strings and
    # are intentionally left untouched.
    INDEX_STRATEGIES = {
        "default": {
            "name": "user_{user_id}_default",
            "description": "通用文档"
        },
        "important": {
            "name": "user_{user_id}_important",
            "description": "重要文档(1.2x加权)"
        },
        "code": {
            "name": "user_{user_id}_code",
            "description": "代码片段"
        },
        "meeting": {
            "name": "user_{user_id}_meeting",
            "description": "会议记录"
        },
    }

    def __init__(self, chroma_client=None):
        """Remember the client used to open collections.

        Args:
            chroma_client: Object exposing
                ``get_or_create_collection(name=...)``. It defaults to ``None``
                only so that argument-less construction keeps working; it must
                be set before ``get_collection`` is called.
        """
        self.chroma_client = chroma_client

    def get_collection(self, user_id: str, index_type: str = "default"):
        """Return the collection for ``user_id`` and ``index_type``.

        The collection is created on first use via the client's
        ``get_or_create_collection``.

        Args:
            user_id: Value substituted into the collection-name template.
            index_type: One of the keys of ``INDEX_STRATEGIES``.

        Returns:
            Whatever ``chroma_client.get_or_create_collection`` returns.

        Raises:
            KeyError: If ``index_type`` is not a key of ``INDEX_STRATEGIES``.
        """
        try:
            strategy = self.INDEX_STRATEGIES[index_type]
        except KeyError:
            known = ", ".join(sorted(self.INDEX_STRATEGIES))
            raise KeyError(
                f"unknown index_type {index_type!r}; expected one of: {known}"
            ) from None

        # NOTE(review): the vector store may restrict collection names
        # (length, allowed characters) -- confirm user_id always yields a
        # valid name.
        name = strategy["name"].format(user_id=user_id)
        return self.chroma_client.get_or_create_collection(name=name)
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
### Task R.2.2:实现懒加载 + LRU TTL
|
|||
|
|
|
|||
|
|
**目标:** 2小时 TTL,访问时加载,不访问不加载
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
import time
from threading import Lock
from typing import Any, Callable
|
|||
|
|
|
|||
|
|
class LazyIndexLoader:
    """Lazily load values on first access and evict them after an idle TTL.

    A value is produced by the caller-supplied loader the first time its key
    is requested and is then served from an in-memory cache. Every access
    refreshes the key's last-used timestamp; ``sweep`` drops entries whose
    last use is older than the TTL. Nothing in this class calls ``sweep``
    automatically -- the owner is expected to invoke it periodically.

    All cache access, including ``sweep``, is serialized by a single lock.
    """

    def __init__(self, ttl_seconds: int = 7200):
        """Create an empty cache.

        Args:
            ttl_seconds: Idle time in seconds after which ``sweep`` evicts an
                entry. Defaults to 7200 (two hours).
        """
        self._cache = {}
        self._last_used = {}
        self._lock = Lock()
        self._ttl = ttl_seconds

    def get_or_load(self, key: str, loader_fn: Callable[[], Any]) -> Any:
        """Return the cached value for ``key``, loading it on first access.

        Args:
            key: Cache key.
            loader_fn: Zero-argument callable invoked only on a cache miss.
                It runs while the (non-reentrant) lock is held, so it must not
                call back into this loader.

        Returns:
            The cached value, or the freshly loaded one on a miss.
        """
        with self._lock:
            if key in self._cache:
                # A hit counts as a use and postpones eviction.
                self._last_used[key] = time.monotonic()
                return self._cache[key]

            value = loader_fn()
            self._cache[key] = value
            self._last_used[key] = time.monotonic()
            return value

    def sweep(self):
        """Evict every entry that has been idle for longer than the TTL."""
        # Monotonic time keeps the idle computation immune to wall-clock jumps.
        now = time.monotonic()
        # Hold the lock so eviction cannot interleave with get_or_load.
        with self._lock:
            expired = [
                k for k, t in self._last_used.items()
                if now - t > self._ttl
            ]
            for k in expired:
                # pop() tolerates the two dicts drifting out of sync.
                self._cache.pop(k, None)
                self._last_used.pop(k, None)
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
### Task R.2.3:实现重要性感知检索
|
|||
|
|
|
|||
|
|
**目标:** important 索引加权 1.2x
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
async def retrieve_with_importance(
    self,
    query: str,
    user_id: str,
    top_k: int = 5,
    *,
    important_weight: float = 1.2,
    default_weight: float = 1.0,
) -> list[SearchResult]:
    """Retrieve from the default and important indexes and merge by weighted score.

    Hits from the important index have their score multiplied by
    ``important_weight`` and hits from the default index by
    ``default_weight``. With the defaults, important documents receive
    exactly the documented 1.2x boost relative to default documents.

    Args:
        query: Query text forwarded to ``self.retrieve``.
        user_id: Owner of the indexes; also used to build the important
            collection name.
        top_k: Maximum number of results to return.
        important_weight: Multiplier applied to scores of important hits.
        default_weight: Multiplier applied to scores of default hits.

    Returns:
        At most ``top_k`` results ordered by weighted score, highest first.
        The result objects are returned unchanged; their ``score`` attribute
        still holds the unweighted value.
    """
    if top_k <= 0:
        return []

    # 1. Query the default index, over-fetching so that default hits can still
    #    fill the page after the merge.
    default_results = await self.retrieve(query, user_id, top_k=top_k * 2)

    # 2. Query the important index.
    important_results = await self.retrieve(
        query, user_id,
        collection_name=f"user_{user_id}_important",
        top_k=top_k
    )

    # 3. Merge, weighting each source.
    # NOTE(review): assumes a higher ``score`` means a better match (a
    # similarity, not a distance) -- confirm against ``self.retrieve``.
    scored = [(r.score * default_weight, r) for r in default_results]
    scored.extend((r.score * important_weight, r) for r in important_results)

    scored.sort(key=lambda x: x[0], reverse=True)
    return [r for _, r in scored[:top_k]]
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 3. 修改现有文件
|
|||
|
|
|
|||
|
|
### `backend/app/models/document.py`
|
|||
|
|
|
|||
|
|
增加 `importance` 字段:
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
class Document(Base):
    # ... existing fields ...

    # Importance score in the range 0.0 ~ 1.0 (default 0.5). Documents with
    # importance >= 0.8 are routed to the "important" index by
    # KnowledgeService.index_document.
    importance = Column(Float, default=0.5)
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
### `backend/app/services/knowledge_service.py`
|
|||
|
|
|
|||
|
|
集成多索引支持:
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
from app.services.multi_index import MultiIndexManager, LazyIndexLoader
|
|||
|
|
|
|||
|
|
class KnowledgeService:
    """Sketch of the changes that wire multi-index support into the service."""

    def __init__(self, ...):
        # ... existing init
        # One manager per service; it opens per-user, per-type collections
        # through the service's Chroma client.
        self.multi_index = MultiIndexManager(self.chroma_client)
        # Lazy cache with a 7200-second (two-hour) idle TTL.
        # NOTE(review): this snippet creates the loader but never calls it --
        # confirm where get_or_load / sweep are meant to be invoked.
        self.lazy_loader = LazyIndexLoader(ttl_seconds=7200)

    async def index_document(self, document_id: str, user_id: str, ...):
        # Choose the target index from the document's importance.
        doc = await self._get_document(document_id)
        if doc.importance >= 0.8:
            collection = self.multi_index.get_collection(user_id, "important")
        else:
            collection = self.multi_index.get_collection(user_id, "default")
        # ... rest of indexing
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 4. 新增测试
|
|||
|
|
|
|||
|
|
**新增文件:** `backend/tests/services/test_multi_index.py`
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
import pytest
|
|||
|
|
from app.services.multi_index import MultiIndexManager, LazyIndexLoader
|
|||
|
|
|
|||
|
|
class TestMultiIndexManager:
    """Unit tests for MultiIndexManager collection lookup and naming."""

    @staticmethod
    def _make_client():
        """Return a stand-in Chroma client that records the calls it receives."""
        # Imported locally so the test class does not depend on an undefined
        # module-level ``mock_chroma_client`` name.
        from unittest.mock import MagicMock

        return MagicMock()

    def test_get_collection_creates_if_not_exists(self):
        """get_collection delegates to get_or_create_collection with the expanded name."""
        client = self._make_client()
        manager = MultiIndexManager(client)

        col = manager.get_collection("user123", "default")

        assert col is not None
        assert col is client.get_or_create_collection.return_value
        client.get_or_create_collection.assert_called_once_with(
            name="user_user123_default"
        )

    def test_collection_name_format(self):
        """The important-index template expands to user_<id>_important."""
        # INDEX_STRATEGIES is a class attribute, so no instance is needed.
        template = MultiIndexManager.INDEX_STRATEGIES["important"]["name"]
        name = template.format(user_id="user123")
        assert name == "user_user123_important"
|
|||
|
|
|
|||
|
|
class TestLazyIndexLoader:
    """Unit tests for LazyIndexLoader caching and TTL eviction."""

    def test_get_or_load_caches(self):
        """A second lookup of the same key returns the cached object without reloading."""
        loader = LazyIndexLoader()
        calls = []

        def load_fn():
            calls.append(1)
            return {"data": "test"}

        result1 = loader.get_or_load("key1", load_fn)
        result2 = loader.get_or_load("key1", load_fn)

        # load_fn builds a new dict on every call, so identity proves the
        # second lookup was served from the cache.
        assert result1 is result2
        assert len(calls) == 1

    def test_sweep_keeps_fresh_entries(self):
        """sweep leaves entries alone while they are within the TTL."""
        loader = LazyIndexLoader(ttl_seconds=3600)
        loader.get_or_load("key1", lambda: "value1")

        loader.sweep()

        assert "key1" in loader._cache

    def test_sweep_removes_expired(self):
        """sweep evicts an entry whose last use is older than the TTL."""
        loader = LazyIndexLoader(ttl_seconds=1)
        loader.get_or_load("key1", lambda: "value1")

        # Backdate the last-use timestamp instead of sleeping: the test stays
        # fast and does not depend on scheduler timing.
        loader._last_used["key1"] -= 2

        loader.sweep()

        assert "key1" not in loader._cache
        assert "key1" not in loader._last_used
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 5. 验收标准
|
|||
|
|
|
|||
|
|
- [ ] 多 Collection 创建成功
|
|||
|
|
- [ ] 懒加载索引生效(访问时加载,不访问不加载)
|
|||
|
|
- [ ] TTL 淘汰机制工作(2小时无访问自动卸载)
|
|||
|
|
- [ ] 重要性感知检索加权生效
|
|||
|
|
- [ ] 单元测试覆盖率 > 80%
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 6. 变更文件清单
|
|||
|
|
|
|||
|
|
| 文件 | 操作 | 说明 |
|
|||
|
|
|------|------|------|
|
|||
|
|
| `backend/app/services/multi_index.py` | 新增 | 多索引管理器 |
|
|||
|
|
| `backend/app/services/knowledge_service.py` | 修改 | 集成多索引支持 |
|
|||
|
|
| `backend/app/models/document.py` | 修改 | 增加 importance 字段 |
|
|||
|
|
| `backend/tests/services/test_multi_index.py` | 新增 | 多索引单元测试 |
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 7. 工作量估算
|
|||
|
|
|
|||
|
|
| 任务 | 估算 |
|
|||
|
|
|------|------|
|
|||
|
|
| R.2.1 Collection 分离策略 | 1 天 |
|
|||
|
|
| R.2.2 懒加载 + LRU | 1 天 |
|
|||
|
|
| R.2.3 重要性感知检索 | 0.5 天 |
|
|||
|
|
| 测试 + 调试 | 1.5 天 |
|
|||
|
|
| **R.2 总计** | **4 天** |
|