#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""LLM API configuration for model evaluation.

This module centralizes every parameter used by the LLM-based
evaluation pipeline:

- API keys and access configuration
- Model parameters
- Evaluation-mode switches
- Prompt template
- Metric weights (mock evaluation only)

Usage
-----
1. Mock evaluation is the default and needs no configuration.
2. To call a real LLM API instead:
   - install the dependency: ``pip install openai``
   - configure an API key here or via environment variables
   - set ``use_real_llm = True`` in model_evaluation.py

Environment variables
---------------------
- OPENAI_API_KEY: OpenAI API key
- API_BASE_URL: custom API base URL (optional)
"""

import os
from typing import Dict, Any
# =============================================================================
# Evaluation mode
# =============================================================================

# Whether to score with a real LLM.
# False = mock evaluation (default, recommended for testing)
# True  = call a real large-language-model API (requires an API key)
USE_REAL_LLM = True

# =============================================================================
# Concurrency
# =============================================================================

# Maximum number of worker threads for concurrent evaluation.
# Guidelines:
# - mock evaluation: all CPU cores are fine (e.g. 32)
# - real LLM API: 4-8 recommended, to avoid tripping API rate limits
# - None = autodetect the CPU count and use every core
MAX_CONCURRENT_WORKERS = 5  # may be set manually to a concrete number, e.g. 8

# Whether to show a detailed progress bar per concurrent task.
# True  = one bar per task (shows each task's progress)
# False = a single overall progress bar
# NOTE: with large datasets, False avoids flooding the terminal.
SHOW_DETAILED_PROGRESS = True
# =============================================================================
# API configuration
# =============================================================================

# OpenAI API settings.
OPENAI_CONFIG = {
    # Key resolution order (highest priority first):
    # 1. OPENAI_API_KEY environment variable
    # 2. the literal fallback below (discouraged: security risk)
    # NOTE(review): "123" looks like a placeholder; it also satisfies the
    # missing-key check in validate_openai_config() — confirm intentional.
    "api_key": os.environ.get("OPENAI_API_KEY", "123"),

    # Optional base URL; the upstream default is https://api.openai.com/v1.
    # Point this at a proxy or any OpenAI-compatible endpoint.
    "api_base": os.environ.get("API_BASE_URL", "http://10.10.10.122:1234/v1"),

    # Default model (gpt-3.5-turbo, gpt-4, gpt-4-turbo, ...).
    "model": "gpt-3.5-turbo",

    # Generation parameters.
    "temperature": 0,   # 0 = the most deterministic output
    "max_tokens": 500,  # cap on generated tokens
    "timeout": 60,      # API call timeout, in seconds
}
# =============================================================================
# Evaluation prompt / metric weights
# =============================================================================

# Metric weights used by MOCK evaluation only; real LLM evaluation relies
# on the model's own semantic judgement and ignores these.
EVALUATION_WEIGHTS = {
    "bleu_score": 0.2,            # BLEU weight
    "rouge_l_score": 0.25,        # ROUGE-L weight
    "exact_match_rate": 0.25,     # exact-match-rate weight
    "keyword_overlap_rate": 0.3,  # keyword-overlap-rate weight
}
# Prompt template sent to the judging LLM. The placeholders {question},
# {reference} and {candidate} are filled via str.format; the doubled braces
# {{ }} escape the literal JSON braces in the expected response format.
PROMPT_TEMPLATE = """你是一位专业的文本质量评估专家。你的任务是对模型的生成答案与参考答案进行对比评估,并给出1-10分的评分。

评估维度:
1. 答案准确性:生成答案是否正确回答了问题,与参考答案在内容上是否一致
2. 逻辑清晰度:答案的逻辑是否清晰,条理是否清楚
3. 完整性:答案是否完整,是否遗漏了关键信息
4. 表达质量:语言表达是否流畅、专业

评分标准:
- 10分:生成答案完全正确,逻辑清晰,表达优秀,与参考答案高度一致
- 8-9分:生成答案基本正确,逻辑较清晰,表达较好,与参考答案较一致
- 6-7分:生成答案部分正确,有一定逻辑,表达一般,与参考答案有一定差距
- 4-5分:生成答案存在问题,逻辑不够清晰,表达欠佳,与参考答案差距较大
- 2-3分:生成答案有较多错误,逻辑混乱,表达差,与参考答案差距很大
- 1分:生成答案错误严重,无法理解或完全不相关

问题:{question}

参考答案:{reference}

模型生成答案:{candidate}

请按照以下JSON格式返回评分结果:
{{
    "score": 评分(1-10的整数),
    "reason": "详细评价理由,包括优点、不足及评分依据"
}}

请确保:
1. 评分必须是1-10的整数
2. 评价理由要详细具体,指出具体的优缺点
3. 评价要客观公正,基于实际内容对比
4. JSON格式要正确,可以直接解析
"""
# =============================================================================
# Additional model-provider configurations (for future extension)
# =============================================================================

# Anthropic Claude API (example; install the client library before use).
ANTHROPIC_CONFIG = {
    "api_key": os.environ.get("ANTHROPIC_API_KEY", ""),
    "api_base": "https://api.anthropic.com",
    "model": "claude-3-sonnet-20240229",
    "temperature": 0,
    "max_tokens": 500,
}

# Alibaba Tongyi Qianwen (Qwen) API (example).
QWEN_CONFIG = {
    "api_key": os.environ.get("DASHSCOPE_API_KEY", ""),
    "api_base": "https://dashscope.aliyuncs.com/api/v1",
    "model": "qwen-turbo",
    "temperature": 0,
    "max_tokens": 500,
}
# =============================================================================
# Configuration validation
# =============================================================================

def validate_openai_config(config: "Dict[str, Any] | None" = None) -> Dict[str, Any]:
    """Validate an OpenAI-style configuration mapping and return a copy.

    Generalized (backward-compatibly) to accept an explicit mapping so the
    validator can be reused for other provider configs and unit-tested;
    calling it with no argument keeps the original behavior of validating
    the module-level OPENAI_CONFIG.

    Args:
        config: Mapping with ``api_key``, ``model``, ``temperature`` and
            ``max_tokens`` keys. Defaults to OPENAI_CONFIG when omitted.

    Returns:
        A shallow copy of the validated configuration.

    Raises:
        ValueError: if the API key is empty or ``max_tokens`` is not positive.
    """
    cfg = (OPENAI_CONFIG if config is None else config).copy()

    # An empty key is a hard error: no API call can succeed without it.
    if not cfg["api_key"]:
        raise ValueError(
            "未找到OpenAI API密钥!\n"
            "请选择以下方式之一:\n"
            "1. 设置环境变量:export OPENAI_API_KEY='your-api-key'\n"
            "2. 在llm_config.py中直接配置(不推荐)\n"
            "3. 使用模拟评估模式(USE_REAL_LLM = False)"
        )

    # Unknown model names only warn — compatible endpoints may accept more.
    valid_models = ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-turbo", "gpt-4o"]
    if cfg["model"] not in valid_models:
        print(f"警告:模型 {cfg['model']} 可能不受支持,支持的模型:{valid_models}")

    # Out-of-range sampling temperature also only warns.
    if not (0 <= cfg["temperature"] <= 2):
        print(f"警告:temperature {cfg['temperature']} 超出推荐范围 [0, 2]")

    if cfg["max_tokens"] <= 0:
        raise ValueError("max_tokens 必须大于 0")

    return cfg
def print_config_info():
    """Print the current evaluation configuration to stdout.

    Shows the evaluation mode, concurrency settings, progress-bar setting,
    and either the OpenAI API parameters (real-LLM mode) or the mock
    evaluation metric weights.
    """
    print("\n" + "=" * 60)
    print("LLM评估配置信息")
    print("=" * 60)

    print(f"评估模式: {'真实LLM API' if USE_REAL_LLM else '模拟评估(默认)'}")
    print("-" * 60)

    # Concurrency: explicit worker count, or autodetected CPU cores.
    if MAX_CONCURRENT_WORKERS is not None:
        print(f"并发设置: {MAX_CONCURRENT_WORKERS} 个线程")
        print("  来源: llm_config.py 中的 MAX_CONCURRENT_WORKERS")
    else:
        import multiprocessing as mp
        cpu_count = mp.cpu_count()
        print(f"并发设置: 自动检测CPU核心数 ({cpu_count}核心)")
        print("  来源: 默认设置(可配置 MAX_CONCURRENT_WORKERS)")

    # Detailed-progress-bar setting.
    print(f"详细进度条: {'开启' if SHOW_DETAILED_PROGRESS else '关闭'}")
    print("  来源: llm_config.py 中的 SHOW_DETAILED_PROGRESS")
    if SHOW_DETAILED_PROGRESS:
        print("  注意: 开启时会显示每个并发任务的进度条")
    else:
        print("  注意: 关闭时只显示总进度条")
    print("-" * 60)

    if USE_REAL_LLM:
        print("OpenAI API配置:")
        print(f"  API Base: {OPENAI_CONFIG['api_base']}")
        print(f"  模型: {OPENAI_CONFIG['model']}")
        print(f"  温度: {OPENAI_CONFIG['temperature']}")
        print(f"  最大Token: {OPENAI_CONFIG['max_tokens']}")

        # FIX: the original wrapped this ternary in an outer truthiness
        # check on the same key, which made the "未配置" branch of the
        # ternary dead code; one conditional line is equivalent.
        print(f"  API密钥: {'已配置' if OPENAI_CONFIG['api_key'] else '未配置!'}")
    else:
        print("模拟评估权重配置:")
        for key, value in EVALUATION_WEIGHTS.items():
            print(f"  {key}: {value:.2%}")
        print("\n模拟评估基于传统指标的综合加权计算")

    print("=" * 60 + "\n")
def switch_to_real_llm():
    """Run the pre-flight checks for enabling real-LLM evaluation mode.

    Returns:
        True when the openai package is importable and the API
        configuration validates; False otherwise (with a hint printed
        describing what is missing).
    """
    print("\n" + "=" * 60)
    print("切换到真实LLM评估模式")
    print("=" * 60)

    # Guard 1: the openai client library must be installed.
    try:
        import openai  # noqa: F401 — imported only to probe availability
    except ImportError:
        print("✗ 未安装OpenAI库")
        print("请运行:pip install openai")
        return False
    print("✓ OpenAI库已安装")

    # Guard 2: the API key / parameters must validate.
    try:
        validate_openai_config()
    except ValueError as err:
        print(f"✗ API密钥配置错误: {err}")
        return False
    print("✓ API密钥配置正确")

    # (A live API connection test could be added here.)
    print("\n配置验证通过!可以在model_evaluation.py中设置 use_real_llm = True")

    return True
def switch_to_mock_evaluation():
    """Describe the mock-evaluation mode and its metric weighting."""
    banner = "=" * 60
    print("\n" + banner)
    print("切换到模拟评估模式")
    print(banner)
    print("模拟评估使用传统指标的加权组合:")
    # Render each metric weight with its display label.
    for label, key in (
        ("BLEU分数", "bleu_score"),
        ("ROUGE-L分数", "rouge_l_score"),
        ("完全匹配率", "exact_match_rate"),
        ("关键词重叠率", "keyword_overlap_rate"),
    ):
        print(f"  {label}: {EVALUATION_WEIGHTS[key]:.1%}")
    print("\n优点:")
    for perk in ("无需配置API", "运行速度快", "无额外成本"):
        print(f"  ✓ {perk}")
    print(banner)
# =============================================================================
# Example configurations
# =============================================================================

# Reference presets describing the cost/speed/quality trade-off per model.
EXAMPLE_CONFIGURATIONS = {
    "openai_gpt35": {
        "description": "OpenAI GPT-3.5-Turbo(推荐)",
        "model": "gpt-3.5-turbo",
        "cost": "低", "speed": "快", "quality": "中等",
    },
    "openai_gpt4": {
        "description": "OpenAI GPT-4(高质量)",
        "model": "gpt-4",
        "cost": "高", "speed": "中等", "quality": "高",
    },
    "openai_gpt4_turbo": {
        "description": "OpenAI GPT-4-Turbo(推荐)",
        "model": "gpt-4-turbo",
        "cost": "中等", "speed": "快", "quality": "高",
    },
    "anthropic_claude": {
        "description": "Anthropic Claude-3-Sonnet",
        "model": "claude-3-sonnet-20240229",
        "cost": "中等", "speed": "中等", "quality": "高",
    },
    "qwen_turbo": {
        "description": "阿里云通义千问-Turbo",
        "model": "qwen-turbo",
        "cost": "低", "speed": "快", "quality": "中等",
    },
}
def print_model_options():
    """Print every supported model preset with its trade-off summary."""
    divider = "=" * 60
    print("\n" + divider)
    print("支持的模型列表")
    print(divider)

    # Display-label → preset-field mapping, printed in a fixed order.
    fields = (
        ("描述", "description"),
        ("模型", "model"),
        ("成本", "cost"),
        ("速度", "speed"),
        ("质量", "quality"),
    )
    for name, preset in EXAMPLE_CONFIGURATIONS.items():
        print(f"\n{name}:")
        for label, field in fields:
            print(f"  {label}: {preset[field]}")

    print("\n" + divider)
# =============================================================================
# Usage example / interactive entry point
# =============================================================================

if __name__ == "__main__":
    # Show the current configuration, then offer a small interactive menu.
    print_config_info()

    print("选择操作:")
    print("1. 查看支持模型列表")
    print("2. 验证真实LLM配置")
    print("3. 查看配置示例")
    print("4. 退出")

    choice = input("\n请输入选择 (1-4): ")

    if choice == "1":
        print_model_options()
    elif choice == "2":
        if switch_to_real_llm():
            print("\n可以启用真实LLM评估模式!")
        else:
            print("\n请先解决配置问题")
    elif choice == "3":
        print("\n使用示例:")
        # FIX: the printed example previously assigned to OPENUI_CONFIG
        # (typo for OPENAI_CONFIG), which would mislead anyone copying it.
        print("""
# 方式1:使用环境变量
export OPENAI_API_KEY="your-api-key"
export USE_REAL_LLM="true"

# 方式2:在代码中配置
from llm_config import OPENAI_CONFIG
OPENAI_CONFIG["api_key"] = "your-api-key"
OPENAI_CONFIG["model"] = "gpt-3.5-turbo"

# 方式3:使用模拟评估(默认)
# 无需配置,直接使用
""")
    else:
        print("退出")