feat: 重构知识库系统,移除Hermes集成,增强RAG和同步功能
主要变更: - 移除Hermes智能体及相关回调服务 - 新增知识库RAG、同步、调度、规范化和索引任务服务 - 重构orchestrator服务,增强运行时聊天功能 - 更新前端聊天、政策制度、设置等页面样式和逻辑 - 更新expense_claims和document_intelligence服务 - 删除llm_wiki相关服务和测试文件 - 更新docker-compose配置和启动脚本
This commit is contained in:
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Diagnostic tool to check LightRAG initialization status.
|
||||
|
||||
This tool helps developers verify that their LightRAG instance is properly
|
||||
initialized and ready to use. It should be called AFTER initialize_storages()
|
||||
to validate that all components are correctly set up.
|
||||
|
||||
Usage:
|
||||
# Basic usage in your code:
|
||||
rag = LightRAG(...)
|
||||
await rag.initialize_storages()
|
||||
await check_lightrag_setup(rag, verbose=True)
|
||||
|
||||
# Run demo from command line:
|
||||
python -m lightrag.tools.check_initialization --demo
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from lightrag import LightRAG
|
||||
from lightrag.base import StoragesStatus
|
||||
|
||||
|
||||
async def check_lightrag_setup(rag_instance: LightRAG, verbose: bool = False) -> bool:
    """
    Check if a LightRAG instance is properly initialized.

    Inspects the instance's storage status flag, each expected storage
    attribute, and the shared "pipeline_status" namespace, printing a
    human-readable report to stdout along the way.

    Args:
        rag_instance: The LightRAG instance to check
        verbose: If True, print detailed diagnostic information

    Returns:
        True if properly initialized, False otherwise
    """
    import inspect

    issues = []
    warnings = []

    print("🔍 Checking LightRAG initialization status...\n")

    # Check storage initialization status
    if not hasattr(rag_instance, "_storages_status"):
        issues.append("LightRAG instance missing _storages_status attribute")
    elif rag_instance._storages_status != StoragesStatus.INITIALIZED:
        issues.append(
            f"Storages not initialized (status: {rag_instance._storages_status.name})"
        )
    else:
        print("✅ Storage status: INITIALIZED")

    # Check individual storage components: (attribute name, display label)
    storage_components = [
        ("full_docs", "Document storage"),
        ("text_chunks", "Text chunks storage"),
        ("entities_vdb", "Entity vector database"),
        ("relationships_vdb", "Relationship vector database"),
        ("chunks_vdb", "Chunks vector database"),
        ("doc_status", "Document status tracker"),
        ("llm_response_cache", "LLM response cache"),
        ("full_entities", "Entity storage"),
        ("full_relations", "Relation storage"),
        ("chunk_entity_relation_graph", "Graph storage"),
    ]

    if verbose:
        print("\n📦 Storage Components:")

    for component, description in storage_components:
        if not hasattr(rag_instance, component):
            issues.append(f"Missing storage component: {component} ({description})")
            continue

        storage = getattr(rag_instance, component)
        if storage is None:
            warnings.append(f"Storage {component} is None (might be optional)")
        elif hasattr(storage, "_storage_lock") and storage._storage_lock is None:
            # A storage that exposes a lock attribute but never populated it
            # is treated as not initialized.
            issues.append(f"Storage {component} not initialized (lock is None)")
        elif verbose:
            print(f" ✅ {description}: Ready")

    # Check pipeline status
    try:
        from lightrag.kg.shared_storage import get_namespace_data

        lookup = get_namespace_data("pipeline_status", workspace=rag_instance.workspace)
        # NOTE(review): if get_namespace_data is a coroutine function, the bare
        # call never executes and never raises, so the check would always
        # "pass". Awaiting only when the result is awaitable keeps this correct
        # for both a sync and an async implementation — confirm which it is.
        if inspect.isawaitable(lookup):
            await lookup
        print("✅ Pipeline status: INITIALIZED")
    except KeyError:
        issues.append(
            "Pipeline status not initialized - call rag.initialize_storages() first"
        )
    except Exception as e:
        issues.append(f"Error checking pipeline status: {str(e)}")

    # Print results
    print("\n" + "=" * 50)

    if issues:
        print("❌ Issues found:\n")
        for issue in issues:
            print(f" • {issue}")

        print("\n📝 To fix, run this initialization sequence:\n")
        print(" await rag.initialize_storages()")
        print(
            "\n📚 Documentation: https://github.com/HKUDS/LightRAG#important-initialization-requirements"
        )
    else:
        print("✅ LightRAG is properly initialized and ready to use!")

    # Warnings are informational only and never affect the return value.
    if warnings and verbose:
        print("\n⚠️ Warnings (might be normal):")
        for warning in warnings:
            print(f" • {warning}")

    return not issues
|
||||
|
||||
|
||||
async def demo():
    """Demonstrate the diagnostic tool with a throwaway test instance.

    Builds a LightRAG instance in ./test_diagnostic, initializes its storages,
    runs check_lightrag_setup() in verbose mode, and always removes the
    working directory afterwards, including when a step fails.
    """
    import shutil

    from lightrag.llm.openai import openai_embed, gpt_4o_mini_complete

    working_dir = "./test_diagnostic"

    print("=" * 50)
    print("LightRAG Initialization Diagnostic Tool")
    print("=" * 50)

    try:
        # Create test instance
        rag = LightRAG(
            working_dir=working_dir,
            embedding_func=openai_embed,
            llm_model_func=gpt_4o_mini_complete,
        )

        print("\n🔄 Initializing storages...\n")
        await rag.initialize_storages()  # Auto-initializes pipeline_status

        print("\n🔍 Checking initialization status:\n")
        await check_lightrag_setup(rag, verbose=True)
    finally:
        # Cleanup runs on failure too, so a broken demo leaves no stale
        # test directory behind.
        shutil.rmtree(working_dir, ignore_errors=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: only --demo triggers any work.
    import argparse

    parser = argparse.ArgumentParser(description="Check LightRAG initialization status")
    parser.add_argument(
        "--demo",
        action="store_true",
        help="Run a demonstration with a test instance",
    )
    # NOTE(review): --verbose is accepted but not forwarded anywhere; demo()
    # always runs the check with verbose=True.
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed diagnostic information",
    )
    args = parser.parse_args()

    if not args.demo:
        print("Run with --demo to see the diagnostic tool in action")
        print("Or import this module and use check_lightrag_setup() with your instance")
    else:
        asyncio.run(demo())
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,200 @@
|
||||
"""
|
||||
Download all necessary cache files for offline deployment.
|
||||
|
||||
This module provides a CLI command to download tiktoken model cache files
|
||||
for offline environments where internet access is not available.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Names in this set are tiktoken *encodings*, not model names, so they are
# resolved with tiktoken.get_encoding() rather than tiktoken.encoding_for_model().
TIKTOKEN_ENCODING_NAMES = {
    "cl100k_base",
    "o200k_base",
    "p50k_base",
    "r50k_base",
}
|
||||
|
||||
|
||||
def download_tiktoken_cache(cache_dir: str = None, models: list = None):
    """Download tiktoken models to local cache

    Args:
        cache_dir: Directory to store the cache files. If None, the directory
            is taken from the TIKTOKEN_CACHE_DIR or DATA_GYM_CACHE_DIR
            environment variable, falling back to <tempdir>/data-gym-cache.
        models: List of model names or encoding names to download. If None, downloads common ones.

    Returns:
        Tuple of (success_count, failed_models), where failed_models is a list
        of (model_name, error_message) pairs.

    Note:
        Calls sys.exit(1) when tiktoken is not installed.
    """
    # If the user specified a cache directory, export it BEFORE tiktoken is
    # imported so the library sees it whenever it resolves its cache location.
    user_specified_cache = cache_dir is not None

    if user_specified_cache:
        cache_dir = os.path.abspath(cache_dir)
        os.environ["TIKTOKEN_CACHE_DIR"] = cache_dir
        cache_path = Path(cache_dir)
        cache_path.mkdir(parents=True, exist_ok=True)
        print(f"Using specified cache directory: {cache_dir}")
    else:
        # Mirror tiktoken's own resolution order so the location reported
        # below is where the files actually land:
        # TIKTOKEN_CACHE_DIR, then DATA_GYM_CACHE_DIR, then the temp dir.
        env_cache_dir = os.environ.get("TIKTOKEN_CACHE_DIR")
        legacy_cache_dir = os.environ.get("DATA_GYM_CACHE_DIR")
        if env_cache_dir:
            cache_dir = env_cache_dir
            print(f"Using TIKTOKEN_CACHE_DIR from environment: {cache_dir}")
        elif legacy_cache_dir:
            cache_dir = legacy_cache_dir
            print(f"Using DATA_GYM_CACHE_DIR from environment: {cache_dir}")
        else:
            # Use tiktoken's default location (tempdir/data-gym-cache)
            import tempfile

            cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
            print(f"Using tiktoken default cache directory: {cache_dir}")

    # Now import tiktoken (it will use the cache directory we determined)
    try:
        import tiktoken
    except ImportError:
        print("Error: tiktoken is not installed.")
        print("Install with: pip install tiktoken")
        sys.exit(1)

    # Common models used by LightRAG and OpenAI
    if models is None:
        models = [
            "gpt-4o-mini",  # Default model for LightRAG
            "gpt-4o",  # GPT-4 Omni
            "gpt-4",  # GPT-4
            "gpt-3.5-turbo",  # GPT-3.5 Turbo
            "text-embedding-ada-002",  # Legacy embedding model
            "text-embedding-3-small",  # Small embedding model
            "text-embedding-3-large",  # Large embedding model
            "cl100k_base",  # Default encoding for LightRAG
        ]

    print(f"\nDownloading {len(models)} tiktoken models...")
    print("=" * 70)

    success_count = 0
    failed_models = []

    for i, model in enumerate(models, 1):
        try:
            print(f"[{i}/{len(models)}] Downloading {model}...", end=" ", flush=True)
            # Use get_encoding for encoding names, encoding_for_model for model names
            if model in TIKTOKEN_ENCODING_NAMES:
                encoding = tiktoken.get_encoding(model)
            else:
                encoding = tiktoken.encoding_for_model(model)
            # Trigger download by encoding a test string
            encoding.encode("test")
            print("✓ Done")
            success_count += 1
        except KeyError as e:
            print(f"✗ Failed: Unknown model or encoding '{model}'")
            failed_models.append((model, str(e)))
        except Exception as e:
            # One failed download must not abort the rest; the failure is
            # recorded and reported in the summary below.
            print(f"✗ Failed: {e}")
            failed_models.append((model, str(e)))

    print("=" * 70)
    print(f"\n✓ Successfully cached {success_count}/{len(models)} models")

    if failed_models:
        print(f"\n✗ Failed to download {len(failed_models)} models:")
        for model, error in failed_models:
            print(f" - {model}: {error}")

    print(f"\nCache location: {cache_dir}")
    print("\nFor offline deployment:")
    print(" 1. Copy directory to offline server:")
    print(f" tar -czf tiktoken_cache.tar.gz {cache_dir}")
    print(" scp tiktoken_cache.tar.gz user@offline-server:/path/to/")
    print("")
    print(" 2. On offline server, extract and set environment variable:")
    print(" tar -xzf tiktoken_cache.tar.gz")
    print(" export TIKTOKEN_CACHE_DIR=/path/to/tiktoken_cache")
    print("")
    # NOTE(review): ~/.tiktoken_cache is presented as the default location,
    # but the fallback above is <tempdir>/data-gym-cache — confirm which
    # directory the offline deployment really reads.
    print(" 3. Or copy to default location:")
    print(f" cp -r {cache_dir} ~/.tiktoken_cache/")

    return success_count, failed_models
|
||||
|
||||
|
||||
def main():
    """Main entry point for the CLI command.

    Parses --cache-dir / --models, runs download_tiktoken_cache(), and exits
    with status 0 when everything was cached, 2 when only some downloads
    failed, 1 when all downloads failed or an unexpected error occurred, and
    130 when interrupted from the keyboard.
    """
    import argparse

    parser = argparse.ArgumentParser(
        prog="lightrag-download-cache",
        description="Download cache files for LightRAG offline deployment",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The documented default matches what download_tiktoken_cache()
        # does when no directory is passed.
        epilog="""
Examples:
  # Download to the default location (TIKTOKEN_CACHE_DIR if set,
  # otherwise tiktoken's temp-dir cache)
  lightrag-download-cache

  # Download to specific directory
  lightrag-download-cache --cache-dir ./offline_cache/tiktoken

  # Download specific models only
  lightrag-download-cache --models gpt-4o-mini gpt-4

For more information, visit: https://github.com/HKUDS/LightRAG
""",
    )

    parser.add_argument(
        "--cache-dir",
        help=(
            "Cache directory path (default: TIKTOKEN_CACHE_DIR if set, "
            "otherwise tiktoken's default cache under the system temp directory)"
        ),
        default=None,
    )
    parser.add_argument(
        "--models",
        nargs="+",
        help="Specific models to download (default: common models)",
        default=None,
    )
    parser.add_argument(
        "--version", action="version", version="%(prog)s (LightRAG cache downloader)"
    )

    args = parser.parse_args()

    print("=" * 70)
    print("LightRAG Offline Cache Downloader")
    print("=" * 70)

    try:
        success_count, failed_models = download_tiktoken_cache(
            args.cache_dir, args.models
        )

        print("\n" + "=" * 70)
        print("Download Complete")
        print("=" * 70)

        # sys.exit raises SystemExit, which is not an Exception subclass, so
        # the handlers below do not intercept these exits.
        if success_count == 0:
            # Exit with error code if all downloads failed
            print("\n✗ All downloads failed. Please check your internet connection.")
            sys.exit(1)
        elif failed_models:
            # Exit with warning code if some downloads failed
            print(
                f"\n⚠ Some downloads failed ({len(failed_models)}/{success_count + len(failed_models)})"
            )
            sys.exit(2)
        else:
            print("\n✓ All cache files downloaded successfully!")
            sys.exit(0)

    except KeyboardInterrupt:
        print("\n\n✗ Download interrupted by user")
        sys.exit(130)
    except Exception as e:
        print(f"\n\n✗ Error: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: hand control to the CLI entry point, which
    # terminates the process itself via sys.exit().
    main()
|
||||
@@ -0,0 +1,39 @@
|
||||
import argparse
|
||||
import getpass
|
||||
|
||||
from lightrag.api.passwords import hash_password
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the password-hash CLI.

    The parser accepts an optional positional ``password`` and an optional
    ``--username`` flag.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description="Generate a bcrypt password value for AUTH_ACCOUNTS."
    )

    # nargs="?" makes the positional optional, so it parses to None when omitted.
    password_options = {
        "nargs": "?",
        "help": "Password to hash. If omitted, a secure prompt is used.",
    }
    cli.add_argument("password", **password_options)

    username_help = (
        "Optional username. When provided, output is ready to paste into AUTH_ACCOUNTS."
    )
    cli.add_argument("--username", help=username_help)

    return cli
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Hash a password and print it, optionally prefixed with a username.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success. An empty password is reported through ``parser.error``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Prompt without echo whenever no (or an empty) password was supplied.
    password = args.password
    if not password:
        password = getpass.getpass("Password: ")
    if not password:
        parser.error("password cannot be empty")

    hashed_password = hash_password(password)
    output_line = (
        f"{args.username}:{hashed_password}" if args.username else hashed_password
    )
    print(output_line)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    raise SystemExit(main())
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,720 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Qdrant Legacy Data Preparation Tool for LightRAG
|
||||
|
||||
This tool copies data from new collections to legacy collections for testing
|
||||
the data migration logic in setup_collection function.
|
||||
|
||||
New Collections (with workspace_id):
|
||||
- lightrag_vdb_chunks
|
||||
- lightrag_vdb_entities
|
||||
- lightrag_vdb_relationships
|
||||
|
||||
Legacy Collections (without workspace_id, dynamically named as {workspace}_{suffix}):
|
||||
- {workspace}_chunks (e.g., space1_chunks)
|
||||
- {workspace}_entities (e.g., space1_entities)
|
||||
- {workspace}_relationships (e.g., space1_relationships)
|
||||
|
||||
The tool:
|
||||
1. Filters source data by workspace_id
|
||||
2. Verifies workspace data exists before creating legacy collections
|
||||
3. Removes workspace_id field to simulate legacy data format
|
||||
4. Copies only the specified workspace's data to legacy collections
|
||||
|
||||
Usage:
|
||||
python -m lightrag.tools.prepare_qdrant_legacy_data
|
||||
# or
|
||||
python lightrag/tools/prepare_qdrant_legacy_data.py
|
||||
|
||||
# Specify custom workspace
|
||||
python -m lightrag.tools.prepare_qdrant_legacy_data --workspace space1
|
||||
|
||||
# Process specific collection types only
|
||||
python -m lightrag.tools.prepare_qdrant_legacy_data --types chunks,entities
|
||||
|
||||
# Dry run (preview only, no actual changes)
|
||||
python -m lightrag.tools.prepare_qdrant_legacy_data --dry-run
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import configparser
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import pipmaster as pm
from dotenv import load_dotenv

# Add project root to path for imports
sys.path.insert(
    0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)

# Load environment variables
load_dotenv(dotenv_path=".env", override=False)

# Ensure qdrant-client is installed BEFORE importing it. With the import placed
# ahead of this guard, a missing package would raise ImportError first and the
# guard could never install it.
if not pm.is_installed("qdrant-client"):
    pm.install("qdrant-client")

from qdrant_client import QdrantClient, models  # type: ignore  # noqa: E402
|
||||
|
||||
# Per collection type: the name of the new (workspace-aware) collection and
# the suffix of its legacy counterpart, which is named {workspace}_{suffix}.
COLLECTION_NAMESPACES = {
    kind: {"new": f"lightrag_vdb_{kind}", "suffix": kind}
    for kind in ("chunks", "entities", "relationships")
}

# Default number of records handled per copy batch
DEFAULT_BATCH_SIZE = 500

# Payload field stripped from records written to legacy collections
WORKSPACE_ID_FIELD = "workspace_id"

# ANSI escape sequences used to colorize terminal output
BOLD_CYAN = "\033[1;36m"
BOLD_GREEN = "\033[1;32m"
BOLD_YELLOW = "\033[1;33m"
BOLD_RED = "\033[1;31m"
RESET = "\033[0m"
|
||||
|
||||
|
||||
@dataclass
class CopyStats:
    """Counters and error log for copying one collection type."""

    # Identification of the copy operation
    collection_type: str
    source_collection: str
    target_collection: str
    # Progress counters
    total_records: int = 0
    copied_records: int = 0
    failed_records: int = 0
    # One dict per failed batch, in the order the failures occurred
    errors: List[Dict[str, Any]] = field(default_factory=list)
    # Wall-clock duration of the copy, in seconds
    elapsed_time: float = 0.0

    def add_error(self, batch_idx: int, error: Exception, batch_size: int):
        """Log a failed batch and count all of its records as failed.

        Args:
            batch_idx: Index of the batch that failed
            error: The exception raised for the batch
            batch_size: Number of records the batch contained
        """
        record = {
            "batch": batch_idx,
            "error_type": type(error).__name__,
            "error_msg": str(error),
            "records_lost": batch_size,
            "timestamp": time.time(),
        }
        self.errors.append(record)
        self.failed_records += batch_size
|
||||
|
||||
|
||||
class QdrantLegacyDataPreparationTool:
|
||||
"""Tool for preparing legacy data in Qdrant for migration testing"""
|
||||
|
||||
def __init__(
    self,
    workspace: str = "space1",
    batch_size: int = DEFAULT_BATCH_SIZE,
    dry_run: bool = False,
    clear_target: bool = False,
):
    """
    Store the run configuration; no Qdrant connection is opened here.

    Args:
        workspace: Workspace whose records are selected from the new collections
        batch_size: How many records each scroll/upsert batch handles
        dry_run: When True, operations are only previewed, never executed
        clear_target: When True, the target collection is deleted before copying
    """
    # The client is created on first use by _get_client().
    self._client: Optional[QdrantClient] = None

    self.workspace = workspace
    self.batch_size = batch_size
    self.dry_run = dry_run
    self.clear_target = clear_target
|
||||
|
||||
def _get_client(self) -> QdrantClient:
    """Return the cached QdrantClient, building it on the first call.

    The URL and API key come from the QDRANT_URL / QDRANT_API_KEY environment
    variables, with the ``[qdrant]`` section of ``config.ini`` (keys ``uri``
    and ``apikey``) as the fallback.
    """
    if self._client is not None:
        return self._client

    ini = configparser.ConfigParser()
    ini.read("config.ini", "utf-8")

    url = os.environ.get("QDRANT_URL", ini.get("qdrant", "uri", fallback=None))
    api_key = os.environ.get(
        "QDRANT_API_KEY", ini.get("qdrant", "apikey", fallback=None)
    )

    self._client = QdrantClient(url=url, api_key=api_key)
    return self._client
|
||||
|
||||
def print_header(self):
    """Print the banner: tool title, active mode flags, workspace and batch size."""
    rule = "=" * 60

    print("\n" + rule)
    print("Qdrant Legacy Data Preparation Tool - LightRAG")
    print(rule)

    # Mode notices appear only for the flags that are switched on.
    if self.dry_run:
        print(f"{BOLD_YELLOW}⚠️ DRY RUN MODE - No changes will be made{RESET}")
    if self.clear_target:
        print(
            f"{BOLD_RED}⚠️ CLEAR TARGET MODE - Target collections will be deleted first{RESET}"
        )

    print(f"Workspace: {BOLD_CYAN}{self.workspace}{RESET}")
    print(f"Batch Size: {self.batch_size}")
    print(rule)
|
||||
|
||||
def check_connection(self) -> bool:
    """Verify that Qdrant is reachable by listing its collections.

    Returns:
        True when the request succeeds; False (after printing the error)
        when anything goes wrong.
    """
    try:
        # Listing collections is used purely as a connectivity probe;
        # the result itself is discarded.
        self._get_client().get_collections()
        print(f"{BOLD_GREEN}✓{RESET} Qdrant connection successful")
        return True
    except Exception as exc:
        print(f"{BOLD_RED}✗{RESET} Qdrant connection failed: {exc}")
        return False
|
||||
|
||||
def get_collection_info(self, collection_name: str) -> Optional[Dict[str, Any]]:
    """
    Get collection information.

    Args:
        collection_name: Name of the collection

    Returns:
        Dictionary with keys ``name``, ``vector_size``, ``count`` and
        ``distance``, or None if the collection does not exist.
        ``vector_size`` and ``distance`` are None when the vectors
        configuration does not provide them.
    """
    client = self._get_client()

    if not client.collection_exists(collection_name):
        return None

    info = client.get_collection(collection_name)
    count = client.count(collection_name=collection_name, exact=True).count

    # The vectors config is either a single params object or a mapping of
    # vector name -> params (named vectors); for a mapping, the first entry
    # is used.
    vectors_config = info.config.params.vectors
    if isinstance(vectors_config, dict):
        params = next(iter(vectors_config.values()), None)
    else:
        params = vectors_config

    # Params may be a plain dict or an object. Reading defensively means a
    # missing config or a missing field yields None instead of AttributeError.
    if isinstance(params, dict):
        vector_size = params.get("size")
        distance = params.get("distance")
    else:
        vector_size = getattr(params, "size", None)
        distance = getattr(params, "distance", None)

    return {
        "name": collection_name,
        "vector_size": vector_size,
        "count": count,
        "distance": distance,
    }
|
||||
|
||||
def delete_collection(self, collection_name: str) -> bool:
    """
    Remove a collection, or only announce the removal in dry-run mode.

    Args:
        collection_name: Name of the collection to delete

    Returns:
        True when the collection is absent, was deleted, or would be deleted
        (dry run); False when the deletion raised an error.
    """
    client = self._get_client()

    if not client.collection_exists(collection_name):
        return True

    def _record_count() -> int:
        # Number of records currently stored, for the log message only.
        details = self.get_collection_info(collection_name)
        return details["count"] if details else 0

    if self.dry_run:
        count = _record_count()
        print(
            f" {BOLD_YELLOW}[DRY RUN]{RESET} Would delete collection '{collection_name}' ({count:,} records)"
        )
        return True

    try:
        count = _record_count()
        client.delete_collection(collection_name=collection_name)
        print(
            f" {BOLD_RED}✗{RESET} Deleted collection '{collection_name}' ({count:,} records)"
        )
        return True
    except Exception as exc:
        print(f" {BOLD_RED}✗{RESET} Failed to delete collection: {exc}")
        return False
|
||||
|
||||
def create_legacy_collection(
    self, collection_name: str, vector_size: int, distance: models.Distance
) -> bool:
    """
    Make sure the legacy collection exists, creating it when necessary.

    Args:
        collection_name: Name of the collection to create
        vector_size: Dimension of vectors
        distance: Distance metric

    Returns:
        True when the collection already exists, was created, or would be
        created (dry run); False when creation raised an error.
    """
    client = self._get_client()

    already_there = client.collection_exists(collection_name)
    if already_there:
        print(f" Collection '{collection_name}' already exists")
        return True

    if self.dry_run:
        print(
            f" {BOLD_YELLOW}[DRY RUN]{RESET} Would create collection '{collection_name}' with {vector_size}d vectors"
        )
        return True

    try:
        # The parameter objects are built inside the try so that a
        # construction error is reported like any other creation failure.
        vector_params = models.VectorParams(size=vector_size, distance=distance)
        hnsw_params = models.HnswConfigDiff(payload_m=16, m=0)
        client.create_collection(
            collection_name=collection_name,
            vectors_config=vector_params,
            hnsw_config=hnsw_params,
        )
        print(
            f" {BOLD_GREEN}✓{RESET} Created collection '{collection_name}' with {vector_size}d vectors"
        )
        return True
    except Exception as exc:
        print(f" {BOLD_RED}✗{RESET} Failed to create collection: {exc}")
        return False
|
||||
|
||||
def _get_workspace_filter(self) -> models.Filter:
    """Build the filter that matches records whose workspace_id equals self.workspace."""
    workspace_condition = models.FieldCondition(
        key=WORKSPACE_ID_FIELD,
        match=models.MatchValue(value=self.workspace),
    )
    return models.Filter(must=[workspace_condition])
|
||||
|
||||
def get_workspace_count(self, collection_name: str) -> int:
    """
    Count the records in a collection that belong to the current workspace.

    Args:
        collection_name: Name of the collection

    Returns:
        Exact number of records matching the workspace filter
    """
    workspace_filter = self._get_workspace_filter()
    result = self._get_client().count(
        collection_name=collection_name,
        count_filter=workspace_filter,
        exact=True,
    )
    return result.count
|
||||
|
||||
def copy_collection_data(
    self,
    source_collection: str,
    target_collection: str,
    collection_type: str,
    workspace_count: int,
) -> CopyStats:
    """
    Copy data from source to target collection.

    Records are read with a workspace filter, their workspace_id payload field
    is removed to simulate the legacy data format, and they are upserted into
    the target collection batch by batch. A failing batch is recorded in the
    returned stats and the copy continues with the next batch.

    Args:
        source_collection: Source collection name
        target_collection: Target collection name
        collection_type: Type of collection (chunks, entities, relationships)
        workspace_count: Pre-computed count of workspace records

    Returns:
        CopyStats with operation results
    """
    # Imported once per call rather than once per copied point.
    import hashlib
    import uuid

    client = self._get_client()
    stats = CopyStats(
        collection_type=collection_type,
        source_collection=source_collection,
        target_collection=target_collection,
    )

    start_time = time.time()
    stats.total_records = workspace_count

    if workspace_count == 0:
        print(f" No records for workspace '{self.workspace}', skipping")
        stats.elapsed_time = time.time() - start_time
        return stats

    print(f" Workspace records: {workspace_count:,}")

    if self.dry_run:
        print(
            f" {BOLD_YELLOW}[DRY RUN]{RESET} Would copy {workspace_count:,} records to '{target_collection}'"
        )
        # A dry run reports every record as copied so the summary shows the
        # volume a real run would move.
        stats.copied_records = workspace_count
        stats.elapsed_time = time.time() - start_time
        return stats

    # Batch copy using scroll with workspace filter
    workspace_filter = self._get_workspace_filter()
    offset = None
    batch_idx = 0
    bar_length = 30

    while True:
        # Scroll source collection with workspace filter
        points, next_offset = client.scroll(
            collection_name=source_collection,
            scroll_filter=workspace_filter,
            limit=self.batch_size,
            offset=offset,
            with_vectors=True,
            with_payload=True,
        )

        if not points:
            break

        batch_idx += 1

        # Transform points: remove workspace_id from payload
        new_points = []
        for point in points:
            new_payload = dict(point.payload or {})
            # Remove workspace_id to simulate legacy format
            new_payload.pop(WORKSPACE_ID_FIELD, None)

            original_id = new_payload.get("id")
            if original_id:
                # Derive a deterministic UUID from the payload's original id.
                # str() keeps a non-string id from crashing the whole copy.
                hashed = hashlib.sha256(str(original_id).encode("utf-8")).digest()
                point_id = uuid.UUID(bytes=hashed[:16], version=4).hex
            else:
                # Reuse the source point id unchanged: stringifying an
                # integer id would produce a value that is neither an
                # integer nor a UUID.
                point_id = point.id

            new_points.append(
                models.PointStruct(
                    id=point_id,
                    vector=point.vector,
                    payload=new_payload,
                )
            )

        try:
            # Upsert to target collection
            client.upsert(
                collection_name=target_collection, points=new_points, wait=True
            )
            stats.copied_records += len(new_points)

            # Progress bar, clamped in case the source grew after the count
            # was taken and more records arrive than expected.
            progress = (stats.copied_records / workspace_count) * 100
            filled = min(
                bar_length,
                int(bar_length * stats.copied_records // workspace_count),
            )
            bar = "█" * filled + "░" * (bar_length - filled)

            print(
                f"\r Copying: {bar} {stats.copied_records:,}/{workspace_count:,} ({progress:.1f}%) ",
                end="",
                flush=True,
            )

        except Exception as e:
            # Record the failed batch and move on to the next one.
            stats.add_error(batch_idx, e, len(new_points))
            print(
                f"\n {BOLD_RED}✗{RESET} Batch {batch_idx} failed: {type(e).__name__}: {e}"
            )

        if next_offset is None:
            break
        offset = next_offset

    print()  # New line after progress bar
    stats.elapsed_time = time.time() - start_time

    return stats
|
||||
|
||||
def process_collection_type(self, collection_type: str) -> Optional[CopyStats]:
    """
    Run the full preparation for one collection type.

    Steps: resolve source/target names, inspect the source, confirm that the
    workspace has data, optionally clear the target, ensure the target
    exists, then copy the records.

    Args:
        collection_type: Type of collection (chunks, entities, relationships)

    Returns:
        CopyStats for the copy, or None when the type is unknown, the source
        is missing, the workspace has no data, or preparing the target failed
    """
    mapping = COLLECTION_NAMESPACES.get(collection_type)
    if not mapping:
        print(f"{BOLD_RED}✗{RESET} Unknown collection type: {collection_type}")
        return None

    source = mapping["new"]
    # Legacy collections are named {workspace}_{suffix}
    target = f"{self.workspace}_{mapping['suffix']}"

    rule = "=" * 50
    print(f"\n{rule}")
    print(f"Processing: {BOLD_CYAN}{collection_type}{RESET}")
    print(f"{rule}")
    print(f" Source: {source}")
    print(f" Target: {target}")

    # The source must exist before anything else is attempted.
    source_info = self.get_collection_info(source)
    if source_info is None:
        print(
            f" {BOLD_YELLOW}⚠{RESET} Source collection '{source}' does not exist, skipping"
        )
        return None

    print(f" Source vector dimension: {source_info['vector_size']}d")
    print(f" Source distance metric: {source_info['distance']}")
    print(f" Source total records: {source_info['count']:,}")

    # Counting first avoids creating an empty legacy collection.
    workspace_count = self.get_workspace_count(source)
    print(f" Workspace '{self.workspace}' records: {workspace_count:,}")

    if workspace_count == 0:
        print(
            f" {BOLD_YELLOW}⚠{RESET} No data found for workspace '{self.workspace}' in '{source}', skipping"
        )
        return None

    # Optionally drop the target, then make sure it exists.
    if self.clear_target and not self.delete_collection(target):
        return None

    target_ready = self.create_legacy_collection(
        target, source_info["vector_size"], source_info["distance"]
    )
    if not target_ready:
        return None

    # Copy the workspace's records and report the outcome.
    stats = self.copy_collection_data(
        source, target, collection_type, workspace_count
    )

    if stats.failed_records:
        print(
            f" {BOLD_YELLOW}⚠{RESET} Copied {stats.copied_records:,} records, "
            f"{BOLD_RED}{stats.failed_records:,} failed{RESET} in {stats.elapsed_time:.2f}s"
        )
    else:
        print(
            f" {BOLD_GREEN}✓{RESET} Copied {stats.copied_records:,} records in {stats.elapsed_time:.2f}s"
        )

    return stats
|
||||
|
||||
def print_summary(self, all_stats: List[CopyStats], max_errors: int = 5) -> None:
    """Print summary of all operations.

    Prints one status line per processed collection, followed by the
    aggregated copied/failed record counts and total elapsed time, a
    dry-run notice when ``self.dry_run`` is set, and finally the first
    ``max_errors`` error entries collected from the statistics objects.

    Args:
        all_stats: Statistics objects of the collections that were processed.
        max_errors: Maximum number of error entries listed individually;
            the remainder is reported in a single "... and N more errors"
            line. Negative values are treated as 0. Defaults to 5.
    """
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)

    total_copied = sum(s.copied_records for s in all_stats)
    total_failed = sum(s.failed_records for s in all_stats)
    total_time = sum(s.elapsed_time for s in all_stats)

    for stats in all_stats:
        # Green check when every record was copied, yellow warning otherwise
        if stats.failed_records == 0:
            status = f"{BOLD_GREEN}✓{RESET}"
        else:
            status = f"{BOLD_YELLOW}⚠{RESET}"
        print(
            f" {status} {stats.collection_type}: {stats.copied_records:,}/{stats.total_records:,} "
            f"({stats.source_collection} → {stats.target_collection})"
        )

    print("-" * 60)
    print(f" Total records copied: {BOLD_CYAN}{total_copied:,}{RESET}")
    if total_failed > 0:
        print(f" Total records failed: {BOLD_RED}{total_failed:,}{RESET}")
    print(f" Total time: {total_time:.2f}s")

    if self.dry_run:
        print(f"\n{BOLD_YELLOW}⚠️ DRY RUN - No actual changes were made{RESET}")

    # Print error details if any (flattened across all collections)
    all_errors = [error for stats in all_stats for error in stats.errors]

    if all_errors:
        shown = max(max_errors, 0)
        print(f"\n{BOLD_RED}Errors ({len(all_errors)}){RESET}")
        for i, error in enumerate(all_errors[:shown], 1):
            print(
                f" {i}. Batch {error['batch']}: {error['error_type']}: {error['error_msg']}"
            )
        hidden = len(all_errors) - shown
        if hidden > 0:
            print(f" ... and {hidden} more errors")

    print("=" * 60)
|
||||
|
||||
async def run(self, collection_types: Optional[List[str]] = None) -> None:
    """
    Run the data preparation tool.

    Prints the header, checks the connection, resolves the collection
    types to handle, processes each of them in order and prints a summary
    of the collected statistics. The body performs no awaits itself; the
    coroutine form is kept so callers can continue to ``await`` it.

    Args:
        collection_types: List of collection types to process (default: all).
            Names are stripped of surrounding whitespace and repeated
            names are processed only once, in first-seen order. If any
            name is not a key of COLLECTION_NAMESPACES, an error is
            printed and nothing is processed.
    """
    self.print_header()

    # Check connection
    if not self.check_connection():
        return

    # Determine which collection types to process
    if collection_types:
        # dict.fromkeys drops duplicates while keeping the requested order,
        # so a repeated type is not deleted/copied a second time.
        types_to_process = list(dict.fromkeys(t.strip() for t in collection_types))
        invalid_types = [
            t for t in types_to_process if t not in COLLECTION_NAMESPACES
        ]
        if invalid_types:
            print(
                f"{BOLD_RED}✗{RESET} Invalid collection types: {', '.join(invalid_types)}"
            )
            print(f" Valid types: {', '.join(COLLECTION_NAMESPACES.keys())}")
            return
    else:
        types_to_process = list(COLLECTION_NAMESPACES.keys())

    print(f"\nCollection types to process: {', '.join(types_to_process)}")

    # Process each collection type
    all_stats = []
    for ctype in types_to_process:
        stats = self.process_collection_type(ctype)
        # NOTE(review): a None result is presumed to mean the type was
        # skipped; test identity instead of relying on object truthiness.
        if stats is not None:
            all_stats.append(stats)

    # Print summary
    if all_stats:
        self.print_summary(all_stats)
    else:
        print(f"\n{BOLD_YELLOW}⚠{RESET} No collections were processed")
|
||||
|
||||
|
||||
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command line arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behavior of a plain
            ``parse_args()`` call.

    Returns:
        The parsed namespace with ``workspace``, ``types``, ``batch_size``,
        ``dry_run`` and ``clear_target`` attributes.
    """

    def positive_int(value: str) -> int:
        """Convert *value* to an int and reject anything below 1."""
        try:
            number = int(value)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid integer value: {value!r}"
            ) from None
        if number <= 0:
            raise argparse.ArgumentTypeError(
                f"batch size must be a positive integer, got {number}"
            )
        return number

    # NOTE(review): the epilog text is reproduced as it appears in the
    # extracted source; any original indentation of the example lines
    # could not be recovered - confirm against the upstream file.
    parser = argparse.ArgumentParser(
        description="Prepare legacy data in Qdrant for migration testing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python -m lightrag.tools.prepare_qdrant_legacy_data
python -m lightrag.tools.prepare_qdrant_legacy_data --workspace space1
python -m lightrag.tools.prepare_qdrant_legacy_data --types chunks,entities
python -m lightrag.tools.prepare_qdrant_legacy_data --dry-run
""",
    )

    parser.add_argument(
        "--workspace",
        type=str,
        default="space1",
        help="Workspace name (default: space1)",
    )

    parser.add_argument(
        "--types",
        type=str,
        default=None,
        help="Comma-separated list of collection types (chunks, entities, relationships)",
    )

    parser.add_argument(
        "--batch-size",
        # A zero or negative batch size cannot drive a copy loop, so it is
        # rejected at parse time with a regular argparse usage error.
        type=positive_int,
        default=DEFAULT_BATCH_SIZE,
        help=f"Batch size for copy operations (default: {DEFAULT_BATCH_SIZE})",
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview operations without making changes",
    )

    parser.add_argument(
        "--clear-target",
        action="store_true",
        help="Delete target collections before copying (for clean test environment)",
    )

    return parser.parse_args(argv)
|
||||
|
||||
|
||||
async def main():
    """Parse the CLI arguments, build the preparation tool and run it."""
    cli = parse_args()

    # --types arrives as one comma-separated string; stay with None when the
    # option was not given (or is empty) so the tool handles every type.
    requested = (
        [name.strip() for name in cli.types.split(",")] if cli.types else None
    )

    preparer = QdrantLegacyDataPreparationTool(
        workspace=cli.workspace,
        batch_size=cli.batch_size,
        dry_run=cli.dry_run,
        clear_target=cli.clear_target,
    )
    await preparer.run(collection_types=requested)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: drive the async entry point to completion.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user