feat: 重构知识库系统,移除Hermes集成,增强RAG和同步功能

主要变更:
- 移除Hermes智能体及相关回调服务
- 新增知识库RAG、同步、调度、规范化和索引任务服务
- 重构orchestrator服务,增强运行时聊天功能
- 更新前端聊天、政策制度、设置等页面样式和逻辑
- 更新expense_claims和document_intelligence服务
- 删除llm_wiki相关服务和测试文件
- 更新docker-compose配置和启动脚本
This commit is contained in:
caoxiaozhu
2026-05-17 08:38:41 +00:00
parent 212c935308
commit 68f663f2f4
308 changed files with 83729 additions and 13588 deletions

View File

@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""
Diagnostic tool to check LightRAG initialization status.
This tool helps developers verify that their LightRAG instance is properly
initialized and ready to use. It should be called AFTER initialize_storages()
to validate that all components are correctly set up.
Usage:
# Basic usage in your code:
rag = LightRAG(...)
await rag.initialize_storages()
await check_lightrag_setup(rag, verbose=True)
# Run demo from command line:
python -m lightrag.tools.check_initialization --demo
"""
import asyncio
import sys
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from lightrag import LightRAG
from lightrag.base import StoragesStatus
async def check_lightrag_setup(rag_instance: LightRAG, verbose: bool = False) -> bool:
    """
    Report whether a LightRAG instance looks fully initialized.

    The check is read-only: it inspects attributes on the instance, probes
    the shared "pipeline_status" namespace, and prints its findings to
    stdout.

    Args:
        rag_instance: The LightRAG instance to inspect.
        verbose: When True, also print one line per ready storage component
            and any collected warnings.

    Returns:
        True when no issue was recorded, False otherwise. Warnings alone
        never cause a False result.
    """
    problems = []
    notices = []

    print("🔍 Checking LightRAG initialization status...\n")

    # Overall storage state flag
    if hasattr(rag_instance, "_storages_status"):
        status = rag_instance._storages_status
        if status == StoragesStatus.INITIALIZED:
            print("✅ Storage status: INITIALIZED")
        else:
            problems.append(f"Storages not initialized (status: {status.name})")
    else:
        problems.append("LightRAG instance missing _storages_status attribute")

    # Attribute name -> human readable label, in reporting order
    component_labels = {
        "full_docs": "Document storage",
        "text_chunks": "Text chunks storage",
        "entities_vdb": "Entity vector database",
        "relationships_vdb": "Relationship vector database",
        "chunks_vdb": "Chunks vector database",
        "doc_status": "Document status tracker",
        "llm_response_cache": "LLM response cache",
        "full_entities": "Entity storage",
        "full_relations": "Relation storage",
        "chunk_entity_relation_graph": "Graph storage",
    }

    if verbose:
        print("\n📦 Storage Components:")

    absent = object()  # sentinel so that a stored None is told apart from "no attribute"
    for attr_name, label in component_labels.items():
        store = getattr(rag_instance, attr_name, absent)
        if store is absent:
            problems.append(f"Missing storage component: {attr_name} ({label})")
            continue
        if store is None:
            notices.append(f"Storage {attr_name} is None (might be optional)")
            continue
        # Only storages that expose a _storage_lock can be flagged as uninitialized
        if hasattr(store, "_storage_lock") and store._storage_lock is None:
            problems.append(f"Storage {attr_name} not initialized (lock is None)")
            continue
        if verbose:
            print(f"{label}: Ready")

    # Pipeline status lives in the shared-storage namespace registry
    try:
        from lightrag.kg.shared_storage import get_namespace_data

        get_namespace_data("pipeline_status", workspace=rag_instance.workspace)
        print("✅ Pipeline status: INITIALIZED")
    except KeyError:
        problems.append(
            "Pipeline status not initialized - call rag.initialize_storages() first"
        )
    except Exception as exc:
        problems.append(f"Error checking pipeline status: {exc}")

    # Final report
    print("\n" + "=" * 50)
    healthy = not problems
    if healthy:
        print("✅ LightRAG is properly initialized and ready to use!")
    else:
        print("❌ Issues found:\n")
        for problem in problems:
            print(f"{problem}")
        print("\n📝 To fix, run this initialization sequence:\n")
        print(" await rag.initialize_storages()")
        print(
            "\n📚 Documentation: https://github.com/HKUDS/LightRAG#important-initialization-requirements"
        )

    if notices and verbose:
        print("\n⚠️ Warnings (might be normal):")
        for notice in notices:
            print(f"{notice}")

    return healthy
async def demo():
    """Demonstrate the diagnostic tool with a throwaway test instance.

    Builds a LightRAG instance in ``./test_diagnostic``, initializes its
    storages, runs check_lightrag_setup() on it in verbose mode, and removes
    the working directory afterwards.

    The directory is removed in a ``finally`` clause so that a failure while
    constructing, initializing or checking the instance does not leave the
    test directory behind.
    """
    import shutil

    from lightrag.llm.openai import openai_embed, gpt_4o_mini_complete

    working_dir = "./test_diagnostic"

    print("=" * 50)
    print("LightRAG Initialization Diagnostic Tool")
    print("=" * 50)

    try:
        # Create test instance
        rag = LightRAG(
            working_dir=working_dir,
            embedding_func=openai_embed,
            llm_model_func=gpt_4o_mini_complete,
        )

        print("\n🔄 Initializing storages...\n")
        await rag.initialize_storages()  # Auto-initializes pipeline_status

        print("\n🔍 Checking initialization status:\n")
        await check_lightrag_setup(rag, verbose=True)
    finally:
        # Cleanup (best effort: a missing directory is not an error)
        shutil.rmtree(working_dir, ignore_errors=True)
if __name__ == "__main__":
    import argparse

    # Command-line front end: only --demo triggers any work.
    cli = argparse.ArgumentParser(description="Check LightRAG initialization status")
    cli.add_argument(
        "--demo", action="store_true", help="Run a demonstration with a test instance"
    )
    # NOTE(review): --verbose is accepted but never read below; the demo
    # always runs its check in verbose mode.
    cli.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed diagnostic information",
    )
    options = cli.parse_args()

    if not options.demo:
        print("Run with --demo to see the diagnostic tool in action")
        print("Or import this module and use check_lightrag_setup() with your instance")
    else:
        asyncio.run(demo())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,200 @@
"""
Download all necessary cache files for offline deployment.
This module provides a CLI command to download tiktoken model cache files
for offline environments where internet access is not available.
"""
import os
import sys
from pathlib import Path
# Known tiktoken encoding names (not model names).
# These need to be loaded with tiktoken.get_encoding() instead of
# tiktoken.encoding_for_model().
TIKTOKEN_ENCODING_NAMES = {"cl100k_base", "p50k_base", "r50k_base", "o200k_base"}


def download_tiktoken_cache(cache_dir: str = None, models: list = None):
    """Download tiktoken encodings into a local cache directory.

    Args:
        cache_dir: Directory to store the cache files. If None, the
            TIKTOKEN_CACHE_DIR environment variable is used when set,
            otherwise ``<system temp dir>/data-gym-cache``.
        models: List of model names or encoding names to download. If None,
            a built-in list of common ones is used.

    Returns:
        Tuple of (success_count, failed_models). ``failed_models`` is a list
        of ``(name, error_message)`` pairs, one per entry that could not be
        loaded.

    Side effects:
        When ``cache_dir`` is given, it is created if missing and exported
        as TIKTOKEN_CACHE_DIR in ``os.environ``. If tiktoken is not
        installed the process exits with status 1.
    """
    import tempfile

    # Resolve the cache directory BEFORE importing tiktoken so that the
    # environment variable is already in place when tiktoken needs it.
    if cache_dir is not None:
        cache_dir = os.path.abspath(cache_dir)
        os.environ["TIKTOKEN_CACHE_DIR"] = cache_dir
        Path(cache_dir).mkdir(parents=True, exist_ok=True)
        print(f"Using specified cache directory: {cache_dir}")
    elif os.environ.get("TIKTOKEN_CACHE_DIR"):
        cache_dir = os.environ["TIKTOKEN_CACHE_DIR"]
        print(f"Using TIKTOKEN_CACHE_DIR from environment: {cache_dir}")
    else:
        # Use tiktoken's default location (tempdir/data-gym-cache)
        cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
        print(f"Using tiktoken default cache directory: {cache_dir}")

    try:
        import tiktoken
    except ImportError:
        print("Error: tiktoken is not installed.")
        print("Install with: pip install tiktoken")
        sys.exit(1)

    # Common models used by LightRAG and OpenAI
    if models is None:
        models = [
            "gpt-4o-mini",  # Default model for LightRAG
            "gpt-4o",  # GPT-4 Omni
            "gpt-4",  # GPT-4
            "gpt-3.5-turbo",  # GPT-3.5 Turbo
            "text-embedding-ada-002",  # Legacy embedding model
            "text-embedding-3-small",  # Small embedding model
            "text-embedding-3-large",  # Large embedding model
            "cl100k_base",  # Default encoding for LightRAG
        ]

    print(f"\nDownloading {len(models)} tiktoken models...")
    print("=" * 70)

    success_count = 0
    failed_models = []
    for i, model in enumerate(models, 1):
        try:
            print(f"[{i}/{len(models)}] Downloading {model}...", end=" ", flush=True)
            # Use get_encoding for encoding names, encoding_for_model for model names
            if model in TIKTOKEN_ENCODING_NAMES:
                encoding = tiktoken.get_encoding(model)
            else:
                encoding = tiktoken.encoding_for_model(model)
            # Trigger download by encoding a test string
            encoding.encode("test")
            print("✓ Done")
            success_count += 1
        except KeyError as e:
            print(f"✗ Failed: Unknown model or encoding '{model}'")
            failed_models.append((model, str(e)))
        except Exception as e:
            # Best effort: one failing entry must not stop the remaining downloads
            print(f"✗ Failed: {e}")
            failed_models.append((model, str(e)))

    print("=" * 70)
    print(f"\n✓ Successfully cached {success_count}/{len(models)} models")
    if failed_models:
        print(f"\n✗ Failed to download {len(failed_models)} models:")
        for model, error in failed_models:
            print(f" - {model}: {error}")

    # Archive relative to the parent directory so that extracting the tarball
    # yields a single directory whose name matches the export line below.
    parent_dir, dir_name = os.path.split(os.path.normpath(os.path.abspath(cache_dir)))

    print(f"\nCache location: {cache_dir}")
    print("\nFor offline deployment:")
    print(" 1. Copy directory to offline server:")
    print(f" tar -czf tiktoken_cache.tar.gz -C {parent_dir} {dir_name}")
    print(" scp tiktoken_cache.tar.gz user@offline-server:/path/to/")
    print("")
    print(" 2. On offline server, extract and set environment variable:")
    print(" tar -xzf tiktoken_cache.tar.gz -C /path/to/")
    print(f" export TIKTOKEN_CACHE_DIR=/path/to/{dir_name}")
    print("")
    print(" 3. Or copy to tiktoken's default location (no environment variable needed):")
    print(f" cp -r {cache_dir} <system temp dir>/data-gym-cache")

    return success_count, failed_models
def main():
    """Main entry point for the CLI command.

    Parses ``--cache-dir`` and ``--models``, runs download_tiktoken_cache()
    and always terminates the process via ``sys.exit``:

    * 0   - every requested entry was cached
    * 1   - nothing was cached, or an unexpected error occurred
    * 2   - some entries failed
    * 130 - interrupted with Ctrl-C
    """
    import argparse

    # The help texts describe the location download_tiktoken_cache() really
    # falls back to: TIKTOKEN_CACHE_DIR if set, else <tempdir>/data-gym-cache.
    parser = argparse.ArgumentParser(
        prog="lightrag-download-cache",
        description="Download cache files for LightRAG offline deployment",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download to default location
  # (TIKTOKEN_CACHE_DIR if set, otherwise <system temp dir>/data-gym-cache)
  lightrag-download-cache

  # Download to specific directory
  lightrag-download-cache --cache-dir ./offline_cache/tiktoken

  # Download specific models only
  lightrag-download-cache --models gpt-4o-mini gpt-4

For more information, visit: https://github.com/HKUDS/LightRAG
""",
    )
    parser.add_argument(
        "--cache-dir",
        help=(
            "Cache directory path (default: TIKTOKEN_CACHE_DIR if set, "
            "otherwise <system temp dir>/data-gym-cache)"
        ),
        default=None,
    )
    parser.add_argument(
        "--models",
        nargs="+",
        help="Specific models to download (default: common models)",
        default=None,
    )
    parser.add_argument(
        "--version", action="version", version="%(prog)s (LightRAG cache downloader)"
    )
    args = parser.parse_args()

    print("=" * 70)
    print("LightRAG Offline Cache Downloader")
    print("=" * 70)

    try:
        success_count, failed_models = download_tiktoken_cache(
            args.cache_dir, args.models
        )
        print("\n" + "=" * 70)
        print("Download Complete")
        print("=" * 70)

        # sys.exit raises SystemExit, which is not an Exception subclass and
        # therefore passes through the generic handler below untouched.
        if success_count == 0:
            # Exit with error code if all downloads failed
            print("\n✗ All downloads failed. Please check your internet connection.")
            sys.exit(1)
        elif failed_models:
            # Exit with warning code if some downloads failed
            print(
                f"\n⚠ Some downloads failed ({len(failed_models)}/{success_count + len(failed_models)})"
            )
            sys.exit(2)
        else:
            print("\n✓ All cache files downloaded successfully!")
            sys.exit(0)
    except KeyboardInterrupt:
        print("\n\n✗ Download interrupted by user")
        sys.exit(130)
    except Exception as e:
        print(f"\n\n✗ Error: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,39 @@
import argparse
import getpass
from lightrag.api.passwords import hash_password
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the AUTH_ACCOUNTS password hasher.

    Returns:
        A parser with an optional positional ``password`` and an optional
        ``--username`` flag.
    """
    cli = argparse.ArgumentParser(
        description="Generate a bcrypt password value for AUTH_ACCOUNTS."
    )
    # (positional/flag names, add_argument keyword options), in help order
    argument_specs = (
        (
            ("password",),
            {
                "nargs": "?",
                "help": "Password to hash. If omitted, a secure prompt is used.",
            },
        ),
        (
            ("--username",),
            {
                "help": "Optional username. When provided, output is ready to paste into AUTH_ACCOUNTS.",
            },
        ),
    )
    for names, options in argument_specs:
        cli.add_argument(*names, **options)
    return cli
def main(argv: list[str] | None = None) -> int:
    """Hash a password and print it, optionally prefixed with a username.

    Args:
        argv: Argument list to parse; None lets argparse read sys.argv.

    Returns:
        0 on success. An empty password ends the run through
        ``parser.error``.
    """
    parser = build_parser()
    options = parser.parse_args(argv)

    # Fall back to an interactive prompt when no password was given
    secret = options.password
    if not secret:
        secret = getpass.getpass("Password: ")
    if not secret:
        parser.error("password cannot be empty")

    digest = hash_password(secret)
    line = f"{options.username}:{digest}" if options.username else digest
    print(line)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,720 @@
#!/usr/bin/env python3
"""
Qdrant Legacy Data Preparation Tool for LightRAG
This tool copies data from new collections to legacy collections for testing
the data migration logic in setup_collection function.
New Collections (with workspace_id):
- lightrag_vdb_chunks
- lightrag_vdb_entities
- lightrag_vdb_relationships
Legacy Collections (without workspace_id, dynamically named as {workspace}_{suffix}):
- {workspace}_chunks (e.g., space1_chunks)
- {workspace}_entities (e.g., space1_entities)
- {workspace}_relationships (e.g., space1_relationships)
The tool:
1. Filters source data by workspace_id
2. Verifies workspace data exists before creating legacy collections
3. Removes workspace_id field to simulate legacy data format
4. Copies only the specified workspace's data to legacy collections
Usage:
python -m lightrag.tools.prepare_qdrant_legacy_data
# or
python lightrag/tools/prepare_qdrant_legacy_data.py
# Specify custom workspace
python -m lightrag.tools.prepare_qdrant_legacy_data --workspace space1
# Process specific collection types only
python -m lightrag.tools.prepare_qdrant_legacy_data --types chunks,entities
# Dry run (preview only, no actual changes)
python -m lightrag.tools.prepare_qdrant_legacy_data --dry-run
"""
import argparse
import asyncio
import configparser
import os
import sys
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import pipmaster as pm
from dotenv import load_dotenv

# Ensure qdrant-client is installed BEFORE importing it. Running this check
# after the import is pointless: a missing package would already have raised
# ImportError on the import line.
if not pm.is_installed("qdrant-client"):
    pm.install("qdrant-client")

from qdrant_client import QdrantClient, models  # type: ignore  # noqa: E402

# Add project root to path for imports
sys.path.insert(
    0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)

# Load environment variables from ./.env (override=False is passed, so the
# call is asked not to replace variables that are already set)
load_dotenv(dotenv_path=".env", override=False)
# Collection namespace mapping: collection type -> new collection name and
# legacy suffix. The legacy collection is named {workspace}_{suffix}.
COLLECTION_NAMESPACES = {
    kind: {"new": f"lightrag_vdb_{kind}", "suffix": kind}
    for kind in ("chunks", "entities", "relationships")
}

# Default number of records per batch in copy operations
DEFAULT_BATCH_SIZE = 500

# Payload field stripped from every record written to a legacy collection
WORKSPACE_ID_FIELD = "workspace_id"

# ANSI escape sequences for bold colored terminal output
BOLD_CYAN, BOLD_GREEN, BOLD_YELLOW, BOLD_RED = (
    f"\033[1;{color}m" for color in (36, 32, 33, 31)
)
RESET = "\033[0m"
@dataclass
class CopyStats:
    """Counters and error log for copying one source collection to one target."""

    # Identity of the copy operation
    collection_type: str
    source_collection: str
    target_collection: str
    # Record counters
    total_records: int = 0
    copied_records: int = 0
    failed_records: int = 0
    # One dict per failed batch, appended by add_error()
    errors: List[Dict[str, Any]] = field(default_factory=list)
    # Duration in seconds, filled in by the caller
    elapsed_time: float = 0.0

    def add_error(self, batch_idx: int, error: Exception, batch_size: int):
        """Log a failed batch and count all of its records as failed.

        Args:
            batch_idx: Index of the batch that failed.
            error: The exception raised for the batch.
            batch_size: Number of records in the failed batch.
        """
        entry = dict(
            batch=batch_idx,
            error_type=type(error).__name__,
            error_msg=str(error),
            records_lost=batch_size,
            timestamp=time.time(),
        )
        self.errors.append(entry)
        self.failed_records += batch_size
class QdrantLegacyDataPreparationTool:
    """Tool for preparing legacy data in Qdrant for migration testing.

    For each collection type in COLLECTION_NAMESPACES, the tool copies the
    points of one workspace from the new collection into a legacy collection
    named ``{workspace}_{suffix}``, removing the workspace id payload field
    from every copied point.
    """

    def __init__(
        self,
        workspace: str = "space1",
        batch_size: int = DEFAULT_BATCH_SIZE,
        dry_run: bool = False,
        clear_target: bool = False,
    ):
        """
        Initialize the tool.

        Args:
            workspace: Workspace to use for filtering new collection data
            batch_size: Number of records to process per batch
            dry_run: If True, only preview operations without making changes
            clear_target: If True, delete target collection before copying data
        """
        self.workspace = workspace
        self.batch_size = batch_size
        self.dry_run = dry_run
        self.clear_target = clear_target
        # Created lazily by _get_client()
        self._client: Optional[QdrantClient] = None

    def _get_client(self) -> QdrantClient:
        """Get or create the QdrantClient instance.

        URL and API key come from the QDRANT_URL / QDRANT_API_KEY environment
        variables, falling back to the ``[qdrant]`` section of ``config.ini``
        (keys ``uri`` and ``apikey``), and to None when neither is present.
        """
        if self._client is None:
            config = configparser.ConfigParser()
            config.read("config.ini", "utf-8")
            self._client = QdrantClient(
                url=os.environ.get(
                    "QDRANT_URL", config.get("qdrant", "uri", fallback=None)
                ),
                api_key=os.environ.get(
                    "QDRANT_API_KEY",
                    config.get("qdrant", "apikey", fallback=None),
                ),
            )
        return self._client

    def print_header(self):
        """Print tool header with the active mode flags and settings."""
        print("\n" + "=" * 60)
        print("Qdrant Legacy Data Preparation Tool - LightRAG")
        print("=" * 60)
        if self.dry_run:
            print(f"{BOLD_YELLOW}⚠️ DRY RUN MODE - No changes will be made{RESET}")
        if self.clear_target:
            print(
                f"{BOLD_RED}⚠️ CLEAR TARGET MODE - Target collections will be deleted first{RESET}"
            )
        print(f"Workspace: {BOLD_CYAN}{self.workspace}{RESET}")
        print(f"Batch Size: {self.batch_size}")
        print("=" * 60)

    def check_connection(self) -> bool:
        """Check Qdrant connection.

        Returns:
            True if listing the collections succeeded, False on any error.
        """
        try:
            client = self._get_client()
            # Try to list collections to verify connection
            client.get_collections()
            print(f"{BOLD_GREEN}{RESET} Qdrant connection successful")
            return True
        except Exception as e:
            print(f"{BOLD_RED}{RESET} Qdrant connection failed: {e}")
            return False

    def get_collection_info(self, collection_name: str) -> Optional[Dict[str, Any]]:
        """
        Get collection information.

        Args:
            collection_name: Name of the collection

        Returns:
            Dictionary with keys ``name``, ``vector_size``, ``count`` and
            ``distance``, or None if the collection does not exist
        """
        client = self._get_client()
        if not client.collection_exists(collection_name):
            return None

        info = client.get_collection(collection_name)
        count = client.count(collection_name=collection_name, exact=True).count

        # The vectors config is either a single params object or a dict; for
        # a dict, the first entry is the one reported.
        vectors_config = info.config.params.vectors
        if not isinstance(vectors_config, dict):
            # Standard single vector format
            vector_size = vectors_config.size
            distance = vectors_config.distance
        elif not vectors_config:
            vector_size = None
            distance = None
        else:
            first_entry = next(iter(vectors_config.values()))
            if isinstance(first_entry, dict):
                vector_size = first_entry.get("size")
                distance = first_entry.get("distance")
            else:
                vector_size = getattr(first_entry, "size", None)
                distance = getattr(first_entry, "distance", None)

        return {
            "name": collection_name,
            "vector_size": vector_size,
            "count": count,
            "distance": distance,
        }

    def delete_collection(self, collection_name: str) -> bool:
        """
        Delete a collection if it exists.

        In dry-run mode nothing is deleted; the would-be deletion is printed.

        Args:
            collection_name: Name of the collection to delete

        Returns:
            True if deleted, absent, or dry run; False if deletion failed
        """
        client = self._get_client()
        if not client.collection_exists(collection_name):
            return True

        if self.dry_run:
            target_info = self.get_collection_info(collection_name)
            count = target_info["count"] if target_info else 0
            print(
                f" {BOLD_YELLOW}[DRY RUN]{RESET} Would delete collection '{collection_name}' ({count:,} records)"
            )
            return True

        try:
            # Read the count first so the message can report what was removed
            target_info = self.get_collection_info(collection_name)
            count = target_info["count"] if target_info else 0
            client.delete_collection(collection_name=collection_name)
            print(
                f" {BOLD_RED}{RESET} Deleted collection '{collection_name}' ({count:,} records)"
            )
            return True
        except Exception as e:
            print(f" {BOLD_RED}{RESET} Failed to delete collection: {e}")
            return False

    def create_legacy_collection(
        self, collection_name: str, vector_size: int, distance: models.Distance
    ) -> bool:
        """
        Create legacy collection if it doesn't exist.

        Args:
            collection_name: Name of the collection to create
            vector_size: Dimension of vectors
            distance: Distance metric

        Returns:
            True if created, already existing, or dry run; False on failure
        """
        client = self._get_client()
        if client.collection_exists(collection_name):
            print(f" Collection '{collection_name}' already exists")
            return True

        if self.dry_run:
            print(
                f" {BOLD_YELLOW}[DRY RUN]{RESET} Would create collection '{collection_name}' with {vector_size}d vectors"
            )
            return True

        try:
            client.create_collection(
                collection_name=collection_name,
                vectors_config=models.VectorParams(
                    size=vector_size,
                    distance=distance,
                ),
                hnsw_config=models.HnswConfigDiff(
                    payload_m=16,
                    m=0,
                ),
            )
            print(
                f" {BOLD_GREEN}{RESET} Created collection '{collection_name}' with {vector_size}d vectors"
            )
            return True
        except Exception as e:
            print(f" {BOLD_RED}{RESET} Failed to create collection: {e}")
            return False

    def _get_workspace_filter(self) -> models.Filter:
        """Create the filter matching points whose workspace id equals self.workspace."""
        return models.Filter(
            must=[
                models.FieldCondition(
                    key=WORKSPACE_ID_FIELD,
                    match=models.MatchValue(value=self.workspace),
                )
            ]
        )

    def get_workspace_count(self, collection_name: str) -> int:
        """
        Get count of records for the current workspace in a collection.

        Args:
            collection_name: Name of the collection

        Returns:
            Exact count of records for the workspace
        """
        client = self._get_client()
        return client.count(
            collection_name=collection_name,
            count_filter=self._get_workspace_filter(),
            exact=True,
        ).count

    def copy_collection_data(
        self,
        source_collection: str,
        target_collection: str,
        collection_type: str,
        workspace_count: int,
    ) -> CopyStats:
        """
        Copy data from source to target collection.

        Points are read in batches filtered by workspace id; the workspace id
        field is removed from each payload to simulate the legacy data
        format. A batch whose upsert fails is recorded in the returned stats
        and the copy continues with the next batch.

        Args:
            source_collection: Source collection name
            target_collection: Target collection name
            collection_type: Type of collection (chunks, entities, relationships)
            workspace_count: Pre-computed count of workspace records

        Returns:
            CopyStats with operation results. In dry-run mode nothing is
            written and copied_records is set to workspace_count.
        """
        # Imported once per call instead of once per point inside the loop
        import hashlib
        import uuid

        client = self._get_client()
        stats = CopyStats(
            collection_type=collection_type,
            source_collection=source_collection,
            target_collection=target_collection,
        )
        start_time = time.time()
        stats.total_records = workspace_count

        if workspace_count == 0:
            print(f" No records for workspace '{self.workspace}', skipping")
            stats.elapsed_time = time.time() - start_time
            return stats

        print(f" Workspace records: {workspace_count:,}")

        if self.dry_run:
            print(
                f" {BOLD_YELLOW}[DRY RUN]{RESET} Would copy {workspace_count:,} records to '{target_collection}'"
            )
            stats.copied_records = workspace_count
            stats.elapsed_time = time.time() - start_time
            return stats

        # Batch copy using scroll with workspace filter
        workspace_filter = self._get_workspace_filter()
        bar_length = 30
        offset = None
        batch_idx = 0

        while True:
            points, next_offset = client.scroll(
                collection_name=source_collection,
                scroll_filter=workspace_filter,
                limit=self.batch_size,
                offset=offset,
                with_vectors=True,
                with_payload=True,
            )
            if not points:
                break
            batch_idx += 1

            # Transform points: remove workspace_id from payload
            new_points = []
            for point in points:
                new_payload = dict(point.payload or {})
                new_payload.pop(WORKSPACE_ID_FIELD, None)

                # Prefer the id stored in the payload; derive a deterministic
                # UUID from its SHA-256 digest. str() keeps a non-string
                # payload id from crashing on .encode().
                original_id = new_payload.get("id")
                if original_id:
                    hashed = hashlib.sha256(str(original_id).encode("utf-8")).digest()
                    point_id = uuid.UUID(bytes=hashed[:16], version=4).hex
                else:
                    point_id = str(point.id)

                new_points.append(
                    models.PointStruct(
                        id=point_id,
                        vector=point.vector,
                        payload=new_payload,
                    )
                )

            try:
                client.upsert(
                    collection_name=target_collection, points=new_points, wait=True
                )
                stats.copied_records += len(new_points)

                # Progress bar. The fill is clamped so that the bar keeps its
                # width even if more records arrive than were counted.
                progress = (stats.copied_records / workspace_count) * 100
                filled = min(
                    bar_length,
                    int(bar_length * stats.copied_records // workspace_count),
                )
                bar = "█" * filled + "░" * (bar_length - filled)
                print(
                    f"\r Copying: {bar} {stats.copied_records:,}/{workspace_count:,} ({progress:.1f}%) ",
                    end="",
                    flush=True,
                )
            except Exception as e:
                stats.add_error(batch_idx, e, len(new_points))
                print(
                    f"\n {BOLD_RED}{RESET} Batch {batch_idx} failed: {type(e).__name__}: {e}"
                )

            if next_offset is None:
                break
            offset = next_offset

        print()  # New line after progress bar
        stats.elapsed_time = time.time() - start_time
        return stats

    def process_collection_type(self, collection_type: str) -> Optional[CopyStats]:
        """
        Process a single collection type.

        Args:
            collection_type: Type of collection (chunks, entities, relationships)

        Returns:
            CopyStats, or None if the type is unknown, the source collection
            is missing, the workspace has no data, or preparing the target
            collection failed
        """
        namespace_config = COLLECTION_NAMESPACES.get(collection_type)
        if not namespace_config:
            print(f"{BOLD_RED}{RESET} Unknown collection type: {collection_type}")
            return None

        source = namespace_config["new"]
        # Generate legacy collection name dynamically: {workspace}_{suffix}
        target = f"{self.workspace}_{namespace_config['suffix']}"

        print(f"\n{'=' * 50}")
        print(f"Processing: {BOLD_CYAN}{collection_type}{RESET}")
        print(f"{'=' * 50}")
        print(f" Source: {source}")
        print(f" Target: {target}")

        # Check source collection
        source_info = self.get_collection_info(source)
        if source_info is None:
            print(
                f" {BOLD_YELLOW}{RESET} Source collection '{source}' does not exist, skipping"
            )
            return None

        print(f" Source vector dimension: {source_info['vector_size']}d")
        print(f" Source distance metric: {source_info['distance']}")
        print(f" Source total records: {source_info['count']:,}")

        # Check workspace data exists BEFORE creating legacy collection
        workspace_count = self.get_workspace_count(source)
        print(f" Workspace '{self.workspace}' records: {workspace_count:,}")
        if workspace_count == 0:
            print(
                f" {BOLD_YELLOW}{RESET} No data found for workspace '{self.workspace}' in '{source}', skipping"
            )
            return None

        # Clear target collection if requested
        if self.clear_target and not self.delete_collection(target):
            return None

        # Create target collection only after confirming workspace data exists
        if not self.create_legacy_collection(
            target, source_info["vector_size"], source_info["distance"]
        ):
            return None

        # Copy data with workspace filter
        stats = self.copy_collection_data(
            source, target, collection_type, workspace_count
        )

        # Print result
        if stats.failed_records == 0:
            print(
                f" {BOLD_GREEN}{RESET} Copied {stats.copied_records:,} records in {stats.elapsed_time:.2f}s"
            )
        else:
            print(
                f" {BOLD_YELLOW}{RESET} Copied {stats.copied_records:,} records, "
                f"{BOLD_RED}{stats.failed_records:,} failed{RESET} in {stats.elapsed_time:.2f}s"
            )
        return stats

    def print_summary(self, all_stats: List[CopyStats]):
        """Print per-collection results, totals and up to five batch errors."""
        print("\n" + "=" * 60)
        print("Summary")
        print("=" * 60)

        total_copied = sum(s.copied_records for s in all_stats)
        total_failed = sum(s.failed_records for s in all_stats)
        total_time = sum(s.elapsed_time for s in all_stats)

        for stats in all_stats:
            status = (
                f"{BOLD_GREEN}{RESET}"
                if stats.failed_records == 0
                else f"{BOLD_YELLOW}{RESET}"
            )
            # An explicit arrow keeps the source and target names apart
            print(
                f" {status} {stats.collection_type}: {stats.copied_records:,}/{stats.total_records:,} "
                f"({stats.source_collection} → {stats.target_collection})"
            )

        print("-" * 60)
        print(f" Total records copied: {BOLD_CYAN}{total_copied:,}{RESET}")
        if total_failed > 0:
            print(f" Total records failed: {BOLD_RED}{total_failed:,}{RESET}")
        print(f" Total time: {total_time:.2f}s")
        if self.dry_run:
            print(f"\n{BOLD_YELLOW}⚠️ DRY RUN - No actual changes were made{RESET}")

        # Print error details if any
        all_errors = [error for stats in all_stats for error in stats.errors]
        if all_errors:
            print(f"\n{BOLD_RED}Errors ({len(all_errors)}){RESET}")
            for i, error in enumerate(all_errors[:5], 1):
                print(
                    f" {i}. Batch {error['batch']}: {error['error_type']}: {error['error_msg']}"
                )
            if len(all_errors) > 5:
                print(f" ... and {len(all_errors) - 5} more errors")
        print("=" * 60)

    async def run(self, collection_types: Optional[List[str]] = None):
        """
        Run the data preparation tool.

        Returns early, without processing anything, when the connection check
        fails or when any requested type is not in COLLECTION_NAMESPACES.

        Args:
            collection_types: List of collection types to process (default: all)
        """
        self.print_header()

        # Check connection
        if not self.check_connection():
            return

        # Determine which collection types to process
        if collection_types:
            types_to_process = [t.strip() for t in collection_types]
            invalid_types = [
                t for t in types_to_process if t not in COLLECTION_NAMESPACES
            ]
            if invalid_types:
                print(
                    f"{BOLD_RED}{RESET} Invalid collection types: {', '.join(invalid_types)}"
                )
                print(f" Valid types: {', '.join(COLLECTION_NAMESPACES.keys())}")
                return
        else:
            types_to_process = list(COLLECTION_NAMESPACES.keys())

        print(f"\nCollection types to process: {', '.join(types_to_process)}")

        # Process each collection type; skipped types return None
        all_stats = []
        for ctype in types_to_process:
            stats = self.process_collection_type(ctype)
            if stats:
                all_stats.append(stats)

        # Print summary
        if all_stats:
            self.print_summary(all_stats)
        else:
            print(f"\n{BOLD_YELLOW}{RESET} No collections were processed")
def parse_args():
    """Build the command-line parser and parse the process arguments.

    Returns:
        The argparse namespace with ``workspace``, ``types``, ``batch_size``,
        ``dry_run`` and ``clear_target``.
    """
    usage_examples = "\n".join(
        [
            "",
            "Examples:",
            "python -m lightrag.tools.prepare_qdrant_legacy_data",
            "python -m lightrag.tools.prepare_qdrant_legacy_data --workspace space1",
            "python -m lightrag.tools.prepare_qdrant_legacy_data --types chunks,entities",
            "python -m lightrag.tools.prepare_qdrant_legacy_data --dry-run",
            "",
        ]
    )
    cli = argparse.ArgumentParser(
        description="Prepare legacy data in Qdrant for migration testing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Value options
    cli.add_argument(
        "--workspace",
        type=str,
        default="space1",
        help="Workspace name (default: space1)",
    )
    cli.add_argument(
        "--types",
        type=str,
        default=None,
        help="Comma-separated list of collection types (chunks, entities, relationships)",
    )
    cli.add_argument(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        help=f"Batch size for copy operations (default: {DEFAULT_BATCH_SIZE})",
    )

    # Boolean switches
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview operations without making changes",
    )
    cli.add_argument(
        "--clear-target",
        action="store_true",
        help="Delete target collections before copying (for clean test environment)",
    )

    return cli.parse_args()
async def main():
    """Parse the command line, build the tool and run it."""
    options = parse_args()

    # --types is a comma-separated string; None means "all collection types"
    if options.types:
        requested_types = [name.strip() for name in options.types.split(",")]
    else:
        requested_types = None

    preparation_tool = QdrantLegacyDataPreparationTool(
        workspace=options.workspace,
        batch_size=options.batch_size,
        dry_run=options.dry_run,
        clear_target=options.clear_target,
    )
    await preparation_tool.run(collection_types=requested_types)


if __name__ == "__main__":
    asyncio.run(main())