feat: 新增 account 和 plan 目录
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,177 @@
|
||||
"""
|
||||
图片理解模块 - 使用 Dashscope Qwen-VL 模型
|
||||
"""
|
||||
import os
|
||||
import base64
|
||||
import json
|
||||
import requests
|
||||
|
||||
# Configuration
# The API key is read once, at import time, from the environment.
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
if not API_KEY:
    print("错误: 请设置 DASHSCOPE_API_KEY 环境变量")
    # `exit` is injected by the `site` module for interactive sessions and is
    # not available under `python -S` or in frozen builds; raising SystemExit
    # terminates with the same status everywhere.
    raise SystemExit(1)

# URL every request in this module is posted to.
BASE_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation"
|
||||
|
||||
|
||||
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
||||
|
||||
|
||||
def call_dashscope(messages, model="qwen-vl-plus", max_tokens=1000, timeout=60):
    """Post a chat request to BASE_URL and return the first choice's content.

    Args:
        messages: Message list placed under ``input.messages`` in the payload.
        model: Model name sent in the payload.
        max_tokens: Value sent as ``parameters.max_tokens``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``content`` of the first choice's message when the response has
        one; otherwise a string starting with ``错误:`` that carries the
        response body for diagnosis.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "input": {
            "messages": messages
        },
        "parameters": {
            "max_tokens": max_tokens
        }
    }

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(BASE_URL, headers=headers, json=payload, timeout=timeout)
    try:
        result = response.json()
    except ValueError:
        # The body is not JSON (for example an HTML error page); report it in
        # the same "错误:" string form used for API-level errors.
        return f"错误: HTTP {response.status_code}: {response.text}"

    # Walk the structure defensively: a null "output" or an empty "choices"
    # list is reported as an error string instead of raising.
    output = result.get("output") if isinstance(result, dict) else None
    choices = output.get("choices") if isinstance(output, dict) else None
    if choices:
        return choices[0]["message"]["content"]
    return f"错误: {json.dumps(result, ensure_ascii=False, indent=2)}"
|
||||
|
||||
|
||||
def describe_image(image_path, model="qwen-vl-plus"):
    """Ask the model for a detailed description of an image.

    Args:
        image_path: Path of the local image file to send.
        model: Model name forwarded to ``call_dashscope``.

    Returns:
        Whatever ``call_dashscope`` returns for the request.
    """
    import mimetypes

    # Label the data URL with the file's actual image type instead of always
    # claiming JPEG; fall back to JPEG when the extension is not recognised.
    guessed_type, _ = mimetypes.guess_type(image_path)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    image_b64 = encode_image(image_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "请详细描述这张图片的内容,包括场景、人物、物体、颜色等所有细节。"
                },
                {
                    "type": "image",
                    "image": f"data:{mime_type};base64,{image_b64}"
                }
            ]
        }
    ]

    return call_dashscope(messages, model, max_tokens=1000)
|
||||
|
||||
|
||||
def extract_text(image_path, model="qwen-vl-plus"):
    """Ask the model to extract all text visible in an image (OCR).

    Args:
        image_path: Path of the local image file to send.
        model: Model name forwarded to ``call_dashscope``.

    Returns:
        Whatever ``call_dashscope`` returns for the request.
    """
    import mimetypes

    # Label the data URL with the file's actual image type instead of always
    # claiming JPEG; fall back to JPEG when the extension is not recognised.
    guessed_type, _ = mimetypes.guess_type(image_path)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    image_b64 = encode_image(image_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "请提取图片中的所有文字,保持原格式,不要遗漏任何内容。"
                },
                {
                    "type": "image",
                    "image": f"data:{mime_type};base64,{image_b64}"
                }
            ]
        }
    ]

    # OCR output can be long, hence the larger token budget.
    return call_dashscope(messages, model, max_tokens=2000)
|
||||
|
||||
|
||||
def identify_objects(image_path, model="qwen-vl-plus"):
    """Ask the model to list the objects, people and elements in an image.

    Args:
        image_path: Path of the local image file to send.
        model: Model name forwarded to ``call_dashscope``.

    Returns:
        Whatever ``call_dashscope`` returns for the request.
    """
    import mimetypes

    # Label the data URL with the file's actual image type instead of always
    # claiming JPEG; fall back to JPEG when the extension is not recognised.
    guessed_type, _ = mimetypes.guess_type(image_path)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    image_b64 = encode_image(image_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "请列出图片中的所有物体、人物、元素,用清晰的列表格式。"
                },
                {
                    "type": "image",
                    "image": f"data:{mime_type};base64,{image_b64}"
                }
            ]
        }
    ]

    return call_dashscope(messages, model, max_tokens=500)
|
||||
|
||||
|
||||
def answer_question(image_path, question, model="qwen-vl-plus"):
    """Ask the model a free-form question about an image.

    Args:
        image_path: Path of the local image file to send.
        question: Text sent as the prompt alongside the image.
        model: Model name forwarded to ``call_dashscope``.

    Returns:
        Whatever ``call_dashscope`` returns for the request.
    """
    import mimetypes

    # Label the data URL with the file's actual image type instead of always
    # claiming JPEG; fall back to JPEG when the extension is not recognised.
    guessed_type, _ = mimetypes.guess_type(image_path)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    image_b64 = encode_image(image_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": question
                },
                {
                    "type": "image",
                    "image": f"data:{mime_type};base64,{image_b64}"
                }
            ]
        }
    ]

    return call_dashscope(messages, model, max_tokens=500)
|
||||
|
||||
|
||||
# Command-line entry point: pick an analysis mode and print the model's answer.
if __name__ == "__main__":
    import sys

    def _option_value(flag, default=None):
        """Return the argument that follows *flag* on the command line.

        Returns *default* when the flag is absent, and exits with status 1
        when the flag is the last argument and therefore has no value.
        """
        if flag not in sys.argv:
            return default
        value_index = sys.argv.index(flag) + 1
        if value_index >= len(sys.argv):
            print(f"错误: 选项 {flag} 缺少参数")
            sys.exit(1)
        return sys.argv[value_index]

    if len(sys.argv) < 3:
        print("用法:")
        print(" python image_understander.py -i <图片路径> -m describe # 描述图片")
        print(" python image_understander.py -i <图片路径> -m ocr # 提取文字")
        print(" python image_understander.py -i <图片路径> -m objects # 识别物体")
        print(" python image_understander.py -i <图片路径> -m qa -q '问题' # 图片问答")
        sys.exit(1)

    # Look the image path up by its -i flag so the option order does not
    # matter; without -i, fall back to the second argument as before.
    image_path = _option_value("-i", sys.argv[2])
    mode = _option_value("-m", "describe")
    question = _option_value("-q")

    if mode == "describe":
        print(describe_image(image_path))
    elif mode == "ocr":
        print(extract_text(image_path))
    elif mode == "objects":
        print(identify_objects(image_path))
    elif mode == "qa":
        if question:
            print(answer_question(image_path, question))
        else:
            print("错误: 请使用 -q 指定问题")
            # Usage errors must not look like success to calling scripts.
            sys.exit(1)
    else:
        print(f"未知模式: {mode}")
        sys.exit(1)
|
||||
@@ -0,0 +1,381 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Image Understanding using Dashscope (Qwen Vision Models)
|
||||
|
||||
This script enables AI to understand and analyze images using Dashscope's
|
||||
vision API models (qwen-vl-plus, qwen-vl-max).
|
||||
|
||||
Usage:
|
||||
python image_understanding.py --image path/to/image.jpg
|
||||
python image_understanding.py --image https://example.com/image.png --custom-prompt "图片里有什么?"
|
||||
python image_understanding.py --image ./screenshot.png --extract-text
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from typing import Optional, Dict, Any
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Dashscope configuration
|
||||
DASHSCOPE_API_BASE = "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||
DEFAULT_MODEL = "qwen-vl-plus"
|
||||
|
||||
|
||||
def validate_image_path(image_path: str) -> str:
    """
    Validate and normalize image path or URL.

    Args:
        image_path: Path to local image or URL

    Returns:
        The URL unchanged, or the absolute path of the local file

    Raises:
        ValueError: If the URL or file does not have a supported image
            extension, or the local path does not exist or is not a file
    """
    # A tuple keeps the order stable for the error message and can be passed
    # straight to str.endswith.
    valid_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp')

    if image_path.startswith(('http://', 'https://')):
        from urllib.parse import urlsplit

        # Check the extension on the URL's path component so a query string
        # or fragment (".../a.png?size=large") does not cause a rejection.
        # The whole-URL check is kept so everything accepted before still is.
        url_path = urlsplit(image_path).path
        if not (url_path.lower().endswith(valid_extensions)
                or image_path.lower().endswith(valid_extensions)):
            raise ValueError(f"Invalid image URL format: {image_path}")
        return image_path

    path = Path(image_path)
    if not path.exists():
        raise ValueError(f"Image file not found: {image_path}")
    if not path.is_file():
        raise ValueError(f"Path is not a file: {image_path}")

    if path.suffix.lower() not in valid_extensions:
        raise ValueError(f"Invalid image format: {path.suffix}. Supported: {', '.join(valid_extensions)}")

    return str(path.absolute())
|
||||
|
||||
|
||||
def encode_image(image_path: str) -> str:
    """
    Return the contents of an image file as base64 text.

    Args:
        image_path: Path to image file

    Returns:
        Base64 encoded image string
    """
    import base64

    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
||||
|
||||
|
||||
def call_dashscope_api(api_key: str, image_path: str, prompt: str, model: str) -> Dict[str, Any]:
    """
    Call Dashscope Vision API to analyze image.

    Args:
        api_key: Dashscope API key
        image_path: Path to image file or URL
        prompt: Custom prompt for analysis
        model: Model name (qwen-vl-plus or qwen-vl-max)

    Returns:
        API response as dictionary

    Raises:
        Exception: If the request fails or the API answers with an error
            status; the underlying requests exception is attached as the cause
    """
    import requests

    # Prepare image content: URLs are passed through, local files are inlined
    # as a base64 data URL.
    if image_path.startswith(('http://', 'https://')):
        image_url = image_path
    else:
        # Map the extension to its MIME type; anything else is sent as JPEG.
        mime_by_extension = {
            '.png': "image/png",
            '.gif': "image/gif",
            '.webp': "image/webp",
            '.bmp': "image/bmp",
        }
        mime_type = mime_by_extension.get(Path(image_path).suffix.lower(), "image/jpeg")
        image_url = f"data:{mime_type};base64,{encode_image(image_path)}"

    image_content = {
        "type": "image_url",
        "image_url": {"url": image_url}
    }

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                image_content
            ]
        }
    ]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": 1500,
        "temperature": 0.1
    }

    try:
        response = requests.post(
            f"{DASHSCOPE_API_BASE}/chat/completions",
            headers=headers,
            json=payload,
            timeout=90
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        error_msg = f"API request failed: {e.response.text}"
        try:
            error_data = e.response.json()
            if 'error' in error_data:
                error_msg = f"API error: {error_data['error'].get('message', str(error_data))}"
        except (ValueError, TypeError, AttributeError):
            # Body is not JSON or not shaped as {"error": {...}}: keep the
            # raw-text message built above.
            pass
        raise Exception(error_msg) from e
    except requests.exceptions.RequestException as e:
        raise Exception(f"Network error: {str(e)}") from e
|
||||
|
||||
|
||||
def analyze_image(
    api_key: str,
    image_path: str,
    custom_prompt: Optional[str] = None,
    describe: bool = True,
    extract_text: bool = False,
    identify_objects: bool = False,
    model: str = DEFAULT_MODEL
) -> Dict[str, Any]:
    """
    Analyze image with specified analysis types.

    Args:
        api_key: Dashscope API key
        image_path: Path to image file or URL
        custom_prompt: Optional custom prompt; when given it replaces the
            prompt built from the analysis flags
        describe: Whether to describe the image
        extract_text: Whether to extract text from image
        identify_objects: Whether to identify objects
        model: Model to use

    Returns:
        Analysis results as dictionary. The model returns a single answer, so
        each requested entry under "analysis" carries that same text.
    """
    # Build prompt based on analysis type
    if custom_prompt:
        prompt = custom_prompt
    else:
        tasks = []
        if describe:
            tasks.append("详细描述这张图片的内容,包括物体、人物、场景、颜色和整体构成")
        if extract_text:
            tasks.append("提取图片中所有可见的文字(OCR)")
        if identify_objects:
            tasks.append("识别并列出图片中所有可辨认的物体、人物和元素")

        if tasks:
            prompt = (
                "请分析这张图片,提供以下信息:\n"
                + ';'.join(tasks)
                + "\n\n请用清晰的分段格式回答。"
            )
        else:
            prompt = "请对这张图片进行全面详细的描述,包括所有可见的物体、人物、场景、文字和任何值得注意的细节。"

    # Call API
    response = call_dashscope_api(api_key, image_path, prompt, model)

    # Parse response. `or` fallbacks cover keys that are present but empty or
    # null (e.g. "choices": []), which would otherwise raise on indexing.
    choices = response.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""

    # Extract usage info
    usage = response.get("usage") or {}

    return {
        "success": True,
        "image_path": image_path,
        "model": model,
        "api_provider": "dashscope",
        "analysis": {
            "description": content if describe else None,
            "extracted_text": content if extract_text else None,
            "objects": content if identify_objects else None,
            "full_response": content
        },
        "usage": {
            "prompt_tokens": usage.get("prompt_tokens", 0),
            "completion_tokens": usage.get("completion_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0)
        }
    }
|
||||
|
||||
|
||||
def execute(args: argparse.Namespace) -> Dict[str, Any]:
    """Resolve the API key, validate the image and run the analysis.

    Args:
        args: Parsed command-line arguments

    Returns:
        The dictionary produced by ``analyze_image``

    Raises:
        ValueError: If no API key is available from the arguments or the
            environment
    """
    # Key lookup order: --api-key, then DASHSCOPE_API_KEY, then OPENAI_API_KEY.
    api_key = args.api_key
    if not api_key:
        api_key = os.environ.get("DASHSCOPE_API_KEY") or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError(
            "API key not provided. Use --api-key or set DASHSCOPE_API_KEY environment variable"
        )

    validated_path = validate_image_path(args.image)

    # Describing is the default and is switched off as soon as any other
    # analysis type or a custom prompt is requested.
    other_request = args.extract_text or args.identify_objects or args.custom_prompt

    return analyze_image(
        api_key=api_key,
        image_path=validated_path,
        custom_prompt=args.custom_prompt,
        describe=not other_request,
        extract_text=args.extract_text,
        identify_objects=args.identify_objects,
        model=args.model or DEFAULT_MODEL
    )
|
||||
|
||||
|
||||
def main():
    """Parse the command line, run the analysis and print the result as JSON.

    Exit status: 0 on success, 2 when ``execute`` raises ValueError
    (reported as "validation_error"), 1 for any other exception
    (reported as "execution_error").
    """
    parser = argparse.ArgumentParser(
        description="使用 Dashscope(通义千问)视觉模型分析图片",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
# 基本图片描述
python image_understanding.py --image photo.jpg

# 提取图片中的文字
python image_understanding.py --image screenshot.png --extract-text

# 识别图片中的物体
python image_understanding.py --image photo.jpg --identify-objects

# 自定义分析提示词
python image_understanding.py --image photo.jpg --prompt "这个产品多少钱?"

# 使用环境变量中的 API key
export DASHSCOPE_API_KEY=your_key
python image_understanding.py --image photo.jpg

# 使用网络图片URL
python image_understanding.py --image "https://example.com/photo.jpg" --describe

# 使用更强的模型
python image_understanding.py --image photo.jpg --model qwen-vl-max

环境变量:
DASHSCOPE_API_KEY 你的 Dashscope API key
OPENAI_API_KEY 也可以使用(兼容性支持)
"""
    )

    # Required
    parser.add_argument(
        "--image", "-i",
        required=True,
        help="本地图片路径或图片URL"
    )

    # Optional
    parser.add_argument(
        "--api-key",
        help="Dashscope API key (也可通过 DASHSCOPE_API_KEY 环境变量设置)"
    )

    parser.add_argument(
        "--model", "-m",
        default=DEFAULT_MODEL,
        choices=["qwen-vl-plus", "qwen-vl-max"],
        help=f"使用的模型 (默认: {DEFAULT_MODEL})"
    )

    # "--prompt" is accepted as an alias because the usage examples document
    # that spelling; the destination stays `custom_prompt`.
    parser.add_argument(
        "--custom-prompt", "--prompt", "-p",
        help="自定义图片分析提示词"
    )

    # Analysis type
    analysis_group = parser.add_mutually_exclusive_group()
    analysis_group.add_argument(
        "--describe",
        action="store_true",
        default=True,
        help="描述图片内容 (默认行为)"
    )
    analysis_group.add_argument(
        "--extract-text", "-e",
        action="store_true",
        help="从图片提取文字 (OCR)"
    )
    analysis_group.add_argument(
        "--identify-objects", "-o",
        action="store_true",
        help="识别图片中的物体"
    )

    # Output
    parser.add_argument(
        "--compact",
        action="store_true",
        help="输出紧凑JSON (不缩进)"
    )

    args = parser.parse_args()

    # The same indentation applies to success and error output so that
    # --compact is honoured on failures as well.
    indent = None if args.compact else 2

    try:
        result = execute(args)
        print(json.dumps(result, ensure_ascii=False, indent=indent))
    except ValueError as e:
        error_type, exit_code, message = "validation_error", 2, str(e)
    except Exception as e:
        # Top-level boundary: report any failure as JSON instead of a traceback.
        error_type, exit_code, message = "execution_error", 1, str(e)
    else:
        sys.exit(0)

    error_result = {
        "success": False,
        "error": {
            "type": error_type,
            "message": message
        }
    }
    print(json.dumps(error_result, ensure_ascii=False, indent=indent))
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
|
||||
390
account/admin/skills/image-understanding/scripts/main.py
Normal file
390
account/admin/skills/image-understanding/scripts/main.py
Normal file
@@ -0,0 +1,390 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
image-understanding - Enable AI to understand and analyze images using vision API
|
||||
|
||||
This script allows users to analyze images by calling vision API (like OpenAI GPT-4 Vision).
|
||||
It can describe image content, extract text, identify objects, and answer questions about images.
|
||||
|
||||
Usage:
|
||||
python image_understanding.py --image path/to/image.jpg
|
||||
python image_understanding.py --image https://example.com/image.png --custom-prompt "What objects are in this image?"
|
||||
python image_understanding.py --image ./screenshot.png --extract-text
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from typing import Optional, Dict, Any, List
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def validate_image_path(image_path: str) -> str:
    """
    Validate and normalize image path or URL.

    Args:
        image_path: Path to local image or URL

    Returns:
        The URL unchanged, or the absolute path of the local file

    Raises:
        ValueError: If the URL or file does not have a supported image
            extension, or the local path does not exist or is not a file
    """
    # A tuple keeps the order stable for the error message and can be passed
    # straight to str.endswith.
    valid_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.webp')

    if image_path.startswith(('http://', 'https://')):
        from urllib.parse import urlsplit

        # URL validation: check the extension on the URL's path component so
        # a query string or fragment (".../a.png?size=large") does not cause
        # a rejection. The whole-URL check is kept so everything accepted
        # before still is.
        url_path = urlsplit(image_path).path
        if not (url_path.lower().endswith(valid_extensions)
                or image_path.lower().endswith(valid_extensions)):
            raise ValueError(f"Invalid image URL format: {image_path}")
        return image_path

    # Local file path
    path = Path(image_path)
    if not path.exists():
        raise ValueError(f"Image file not found: {image_path}")
    if not path.is_file():
        raise ValueError(f"Path is not a file: {image_path}")

    # Check file extension
    if path.suffix.lower() not in valid_extensions:
        raise ValueError(f"Invalid image format: {path.suffix}. Supported formats: {', '.join(valid_extensions)}")

    return str(path.absolute())
|
||||
|
||||
|
||||
def encode_image(image_path: str) -> str:
    """
    Encode image to base64 string for API upload.

    Args:
        image_path: Path to image file

    Returns:
        Base64 encoded image string

    Raises:
        Exception: If the file cannot be read or encoded; the original
            error is attached as the cause
    """
    import base64

    try:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    except Exception as e:
        # Chain the original error so the traceback shows the real cause
        # (missing file, permission problem, ...).
        raise Exception(f"Failed to encode image: {str(e)}") from e
|
||||
|
||||
|
||||
def call_vision_api(api_key: str, image_path: str, prompt: str, model: str) -> Dict[str, Any]:
    """
    Call OpenAI Vision API to analyze image.

    Args:
        api_key: OpenAI API key
        image_path: Path to image file or URL
        prompt: Custom prompt for analysis
        model: Model name to use

    Returns:
        API response as dictionary

    Raises:
        Exception: If the request fails or the API answers with an error
            status; the underlying requests exception is attached as the cause
    """
    import requests

    # Prepare image content: URLs are passed through, local files are inlined
    # as a base64 data URL.
    if image_path.startswith(('http://', 'https://')):
        image_url = image_path
    else:
        # Label the data URL with the file's real type rather than always
        # claiming JPEG; unknown extensions are still sent as JPEG.
        mime_by_extension = {
            '.png': "image/png",
            '.gif': "image/gif",
            '.webp': "image/webp",
        }
        mime_type = mime_by_extension.get(Path(image_path).suffix.lower(), "image/jpeg")
        image_url = f"data:{mime_type};base64,{encode_image(image_path)}"

    image_content = {"type": "image_url", "image_url": {"url": image_url}}

    # Build messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                image_content
            ]
        }
    ]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": 1000
    }

    try:
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=60
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        error_msg = f"API request failed: {e.response.text}"
        try:
            error_data = e.response.json()
            if 'error' in error_data:
                error_msg = f"API error: {error_data['error'].get('message', str(error_data))}"
        except (ValueError, TypeError, AttributeError):
            # Body is not JSON or not shaped as {"error": {...}}: keep the
            # raw-text message built above.
            pass
        raise Exception(error_msg) from e
    except requests.exceptions.RequestException as e:
        raise Exception(f"Network error: {str(e)}") from e
|
||||
|
||||
|
||||
def analyze_image(
    api_key: str,
    image_path: str,
    custom_prompt: Optional[str] = None,
    describe: bool = True,
    extract_text: bool = False,
    identify_objects: bool = False,
    model: str = "gpt-4-vision-preview"
) -> Dict[str, Any]:
    """
    Analyze image with specified analysis types.

    Args:
        api_key: OpenAI API key
        image_path: Path to image file or URL
        custom_prompt: Optional custom prompt; when given it replaces the
            prompt built from the analysis flags
        describe: Whether to describe the image
        extract_text: Whether to extract text from image
        identify_objects: Whether to identify objects in image
        model: Model to use for analysis

    Returns:
        Analysis results as dictionary. The model returns a single answer, so
        each requested entry under "analysis" carries that same text.
    """
    # Collect only the tasks that were actually requested, so the prompt
    # never contains empty numbered items.
    analysis_tasks = []
    if describe:
        analysis_tasks.append("Describe the image in detail, including objects, people, scenes, colors, and overall composition")
    if extract_text:
        analysis_tasks.append("Extract all visible text from the image (OCR)")
    if identify_objects:
        analysis_tasks.append("Identify and list all recognizable objects, people, and elements in the image")

    if custom_prompt:
        prompt = custom_prompt
    elif analysis_tasks:
        numbered_tasks = "\n".join(
            f"{number}. {task}" for number, task in enumerate(analysis_tasks, start=1)
        )
        prompt = (
            "Please analyze this image and provide the following information:\n"
            + numbered_tasks
            + "\n\nPlease format your response as a structured analysis with clear sections."
        )
    else:
        # Default: full analysis
        prompt = "Provide a comprehensive description of this image, including all visible objects, people, scenes, text, and any notable details."

    # Call API
    response = call_vision_api(api_key, image_path, prompt, model)

    # Parse response. `or` fallbacks cover keys that are present but empty or
    # null (e.g. "choices": []), which would otherwise raise on indexing.
    choices = response.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""

    # Extract usage information if available
    usage = response.get("usage") or {}

    return {
        "success": True,
        "image_path": image_path,
        "model": model,
        "analysis": {
            "description": content if describe else None,
            "extracted_text": content if extract_text else None,
            "objects": content if identify_objects else None,
            "full_response": content
        },
        "usage": {
            "prompt_tokens": usage.get("prompt_tokens", 0),
            "completion_tokens": usage.get("completion_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0)
        }
    }
|
||||
|
||||
|
||||
def execute(args: argparse.Namespace) -> Dict[str, Any]:
    """
    Resolve the API key, validate the image and run the analysis.

    Args:
        args: Parsed command-line arguments

    Returns:
        The dictionary produced by ``analyze_image``

    Raises:
        ValueError: If no API key is available from the arguments or the
            environment
    """
    # Key lookup order: --api-key first, then the OPENAI_API_KEY variable.
    api_key = args.api_key
    if not api_key:
        api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("API key not provided. Use --api-key or set OPENAI_API_KEY environment variable")

    validated_path = validate_image_path(args.image)

    # Describing is the default and is switched off as soon as any other
    # analysis type or a custom prompt is requested.
    other_request = args.extract_text or args.identify_objects or args.custom_prompt

    return analyze_image(
        api_key=api_key,
        image_path=validated_path,
        custom_prompt=args.custom_prompt,
        describe=not other_request,
        extract_text=args.extract_text,
        identify_objects=args.identify_objects,
        model=args.model
    )
|
||||
|
||||
|
||||
def main():
    """Parse the command line, run the analysis and print the result as JSON.

    Exit status: 0 on success, 2 when ``execute`` raises ValueError
    (reported as "validation_error"), 1 for any other exception
    (reported as "execution_error").
    """
    parser = argparse.ArgumentParser(
        description="Analyze images using AI vision capabilities (OpenAI GPT-4 Vision)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Basic image description
python image_understanding.py --image photo.jpg

# Extract text from image
python image_understanding.py --image screenshot.png --extract-text

# Identify objects in image
python image_understanding.py --image photo.jpg --identify-objects

# Custom analysis with specific prompt
python image_understanding.py --image photo.jpg --prompt "What brand is this product?"

# Using API key from environment variable
export OPENAI_API_KEY=your_key
python image_understanding.py --image photo.jpg

# Using remote image URL
python image_understanding.py --image "https://example.com/photo.jpg" --describe

Environment Variables:
OPENAI_API_KEY Your OpenAI API key (can be used instead of --api-key)
"""
    )

    # Required arguments
    parser.add_argument(
        "--image",
        required=True,
        help="Path to local image file or URL of image"
    )

    # Optional arguments
    parser.add_argument(
        "--api-key",
        help="OpenAI API key (can also be set via OPENAI_API_KEY environment variable)"
    )

    parser.add_argument(
        "--model",
        default="gpt-4-vision-preview",
        help="Model to use for vision analysis (default: gpt-4-vision-preview)"
    )

    # "--prompt" is accepted as an alias because the usage examples document
    # that spelling; the destination stays `custom_prompt`.
    parser.add_argument(
        "--custom-prompt",
        "--prompt",
        "-p",
        help="Custom prompt for image analysis"
    )

    # Analysis type flags (mutually exclusive with each other)
    analysis_group = parser.add_mutually_exclusive_group()
    analysis_group.add_argument(
        "--describe",
        action="store_true",
        default=True,
        help="Describe the image content (default behavior)"
    )
    analysis_group.add_argument(
        "--extract-text",
        action="store_true",
        help="Extract text from the image (OCR)"
    )
    analysis_group.add_argument(
        "--identify-objects",
        action="store_true",
        help="Identify and list objects in the image"
    )

    # Output options
    parser.add_argument(
        "--compact",
        action="store_true",
        help="Output compact JSON (without indentation)"
    )

    args = parser.parse_args()

    # The same indentation applies to success and error output so that
    # --compact is honoured on failures as well.
    indent = None if args.compact else 2

    try:
        result = execute(args)
        print(json.dumps(result, ensure_ascii=False, indent=indent))
    except ValueError as e:
        error_type, exit_code, message = "validation_error", 2, str(e)
    except Exception as e:
        # Top-level boundary: report any failure as JSON instead of a traceback.
        error_type, exit_code, message = "execution_error", 1, str(e)
    else:
        sys.exit(0)

    error_result = {
        "success": False,
        "error": {
            "type": error_type,
            "message": message
        }
    }
    print(json.dumps(error_result, ensure_ascii=False, indent=indent))
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user