Files
X-Agents/account/admin/skills/image-understanding/scripts/image_understanding.py
2026-03-11 16:26:22 +08:00

382 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Image Understanding using Dashscope (Qwen Vision Models)
This script enables AI to understand and analyze images using Dashscope's
vision API models (qwen-vl-plus, qwen-vl-max).
Usage:
python image_understanding.py --image path/to/image.jpg
python image_understanding.py --image https://example.com/image.png --prompt "图片里有什么?"
python image_understanding.py --image ./screenshot.png --extract-text --describe
"""
import argparse
import json
import os
import sys
from typing import Optional, Dict, Any
from pathlib import Path
# Dashscope configuration
# Base URL of the endpoint; "/chat/completions" is appended to it when the
# request is sent.
DASHSCOPE_API_BASE = "https://dashscope.aliyuncs.com/compatible-mode/v1"
# Model used when the caller does not select one explicitly.
DEFAULT_MODEL = "qwen-vl-plus"
def validate_image_path(image_path: str) -> str:
    """
    Validate and normalize an image path or URL.

    HTTP(S) URLs are returned unchanged once the *path component* of the URL
    ends with a supported image extension, so URLs carrying a query string or
    fragment (e.g. ``.../photo.png?size=large``) are accepted. Local paths
    must exist, be regular files and carry a supported extension; they are
    returned as absolute paths.

    Args:
        image_path: Path to local image or URL

    Returns:
        The URL as given, or the absolute local path as a string

    Raises:
        ValueError: If the URL/path has an unsupported extension, or the
            local file does not exist or is not a regular file
    """
    # Local import, matching the function-scope import style used elsewhere
    # in this file.
    from urllib.parse import urlparse

    # A tuple keeps the order stable, so the "Supported: ..." error message is
    # deterministic, and it can be passed straight to str.endswith().
    valid_extensions = ('.bmp', '.gif', '.jpeg', '.jpg', '.png', '.webp')

    if image_path.startswith(('http://', 'https://')):
        # Check only the path part: the query string and fragment say nothing
        # about the resource type and must not defeat the extension check.
        url_path = urlparse(image_path).path
        if not url_path.lower().endswith(valid_extensions):
            raise ValueError(f"Invalid image URL format: {image_path}")
        return image_path

    path = Path(image_path)
    if not path.exists():
        raise ValueError(f"Image file not found: {image_path}")
    if not path.is_file():
        raise ValueError(f"Path is not a file: {image_path}")
    if path.suffix.lower() not in valid_extensions:
        raise ValueError(
            f"Invalid image format: {path.suffix}. Supported: {', '.join(valid_extensions)}"
        )
    return str(path.absolute())
def encode_image(image_path: str) -> str:
    """
    Read a file and return its contents as base64 text.

    Args:
        image_path: Path to image file

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 string
    """
    import base64

    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def call_dashscope_api(api_key: str, image_path: str, prompt: str, model: str) -> Dict[str, Any]:
    """
    Call Dashscope Vision API to analyze image.

    HTTP(S) URLs are passed to the API as-is. Local files are read, base64
    encoded and embedded as a ``data:`` URL with a MIME type derived from the
    file extension.

    Args:
        api_key: Dashscope API key, sent as a Bearer token
        image_path: Path to image file or URL
        prompt: Text prompt sent alongside the image
        model: Model name (qwen-vl-plus or qwen-vl-max)

    Returns:
        The decoded JSON body of the API response

    Raises:
        Exception: On an HTTP error status (message taken from the API's
            ``error.message`` field when the body provides one, otherwise the
            raw response text) or on any other requests-level failure. The
            original requests exception is chained as ``__cause__``.
    """
    import requests

    # Prepare image content
    if image_path.startswith(('http://', 'https://')):
        image_url = image_path
    else:
        # Map extension -> MIME type; anything unlisted (.jpg/.jpeg) falls
        # back to JPEG. '.bmp' is listed because local validation accepts it.
        mime_types = {
            '.png': "image/png",
            '.gif': "image/gif",
            '.webp': "image/webp",
            '.bmp': "image/bmp",
        }
        ext = Path(image_path).suffix.lower()
        mime_type = mime_types.get(ext, "image/jpeg")
        base64_image = encode_image(image_path)
        image_url = f"data:{mime_type};base64,{base64_image}"

    image_content = {
        "type": "image_url",
        "image_url": {"url": image_url}
    }
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                image_content
            ]
        }
    ]
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": 1500,
        "temperature": 0.1
    }

    try:
        response = requests.post(
            f"{DASHSCOPE_API_BASE}/chat/completions",
            headers=headers,
            json=payload,
            timeout=90
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        # Default to the raw body; upgrade to the structured message when the
        # body is JSON of the expected {"error": {"message": ...}} shape.
        error_msg = f"API request failed: {e.response.text}"
        try:
            error_data = e.response.json()
            if 'error' in error_data:
                error_msg = f"API error: {error_data['error'].get('message', str(error_data))}"
        except (ValueError, TypeError, AttributeError):
            # Body is not JSON (ValueError) or not shaped as expected;
            # keep the raw-text message built above.
            pass
        raise Exception(error_msg) from e
    except requests.exceptions.RequestException as e:
        raise Exception(f"Network error: {str(e)}") from e
def analyze_image(
    api_key: str,
    image_path: str,
    custom_prompt: Optional[str] = None,
    describe: bool = True,
    extract_text: bool = False,
    identify_objects: bool = False,
    model: str = DEFAULT_MODEL
) -> Dict[str, Any]:
    """
    Analyze image with specified analysis types.

    A non-empty ``custom_prompt`` is sent verbatim and the task flags then
    only affect which result fields are populated. Otherwise the prompt is
    assembled from the enabled tasks, one per line; if no task is enabled a
    generic "describe everything" prompt is used.

    Args:
        api_key: Dashscope API key
        image_path: Path to image file or URL
        custom_prompt: Optional custom prompt
        describe: Whether to describe the image
        extract_text: Whether to extract text from image
        identify_objects: Whether to identify objects
        model: Model to use

    Returns:
        Dictionary with ``success``, ``image_path``, ``model``,
        ``api_provider``, ``analysis`` and ``usage`` keys. Inside
        ``analysis``, ``full_response`` always holds the model's reply;
        ``description`` / ``extracted_text`` / ``objects`` hold that same
        reply when the corresponding flag is set and ``None`` otherwise.

    Raises:
        Exception: Propagated from ``call_dashscope_api`` on request failure
    """
    # Build prompt based on analysis type
    if custom_prompt:
        prompt = custom_prompt
    else:
        tasks = []
        if describe:
            tasks.append("详细描述这张图片的内容,包括物体、人物、场景、颜色和整体构成")
        if extract_text:
            tasks.append("提取图片中所有可见的文字(OCR)")
        if identify_objects:
            tasks.append("识别并列出图片中所有可辨认的物体、人物和元素")
        if tasks:
            # One task per line, so several requested tasks do not run
            # together into a single unbroken sentence.
            task_list = "\n".join(tasks)
            prompt = f"""请分析这张图片,提供以下信息:
{task_list}
请用清晰的分段格式回答。"""
        else:
            prompt = "请对这张图片进行全面详细的描述,包括所有可见的物体、人物、场景、文字和任何值得注意的细节。"

    # Call API
    response = call_dashscope_api(api_key, image_path, prompt, model)

    # Parse response defensively: a missing, null or empty "choices" list and
    # a null message/content all yield an empty string instead of raising.
    choices = response.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""

    # Extract usage info
    usage = response.get("usage") or {}

    return {
        "success": True,
        "image_path": image_path,
        "model": model,
        "api_provider": "dashscope",
        "analysis": {
            "description": content if describe else None,
            "extracted_text": content if extract_text else None,
            "objects": content if identify_objects else None,
            "full_response": content
        },
        "usage": {
            "prompt_tokens": usage.get("prompt_tokens", 0),
            "completion_tokens": usage.get("completion_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0)
        }
    }
def execute(args: argparse.Namespace) -> Dict[str, Any]:
    """Run one image analysis as described by the parsed CLI arguments.

    Resolves the API key (``--api-key`` first, then the DASHSCOPE_API_KEY and
    OPENAI_API_KEY environment variables), validates the image argument and
    delegates to ``analyze_image``.

    Raises:
        ValueError: If no API key is available or the image fails validation
    """
    # First non-empty candidate wins: CLI flag, then the two env variables.
    key_candidates = (
        args.api_key,
        os.environ.get("DASHSCOPE_API_KEY"),
        os.environ.get("OPENAI_API_KEY"),
    )
    api_key = next((candidate for candidate in key_candidates if candidate), None)
    if not api_key:
        raise ValueError(
            "API key not provided. Use --api-key or set DASHSCOPE_API_KEY environment variable"
        )

    validated_image = validate_image_path(args.image)

    # Plain description is the fallback mode: it is enabled only when no
    # other analysis mode and no custom prompt was requested.
    other_mode_requested = any(
        (args.extract_text, args.identify_objects, args.custom_prompt)
    )
    chosen_model = args.model if args.model else DEFAULT_MODEL

    return analyze_image(
        api_key=api_key,
        image_path=validated_image,
        custom_prompt=args.custom_prompt,
        describe=not other_mode_requested,
        extract_text=args.extract_text,
        identify_objects=args.identify_objects,
        model=chosen_model,
    )
def main():
    """Main entry point.

    Parses command-line arguments, runs the analysis and prints the result
    (or a structured error object) as JSON on stdout.

    Exit codes:
        0: success
        2: ValueError raised during execution (missing API key, bad image)
        1: any other exception (API or network failure)
    """
    parser = argparse.ArgumentParser(
        description="使用 Dashscope通义千问视觉模型分析图片",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
# 基本图片描述
python image_understanding.py --image photo.jpg
# 提取图片中的文字
python image_understanding.py --image screenshot.png --extract-text
# 识别图片中的物体
python image_understanding.py --image photo.jpg --identify-objects
# 自定义分析提示词
python image_understanding.py --image photo.jpg --prompt "这个产品多少钱?"
# 使用环境变量中的 API key
export DASHSCOPE_API_KEY=your_key
python image_understanding.py --image photo.jpg
# 使用网络图片URL
python image_understanding.py --image "https://example.com/photo.jpg" --describe
# 使用更强的模型
python image_understanding.py --image photo.jpg --model qwen-vl-max
环境变量:
DASHSCOPE_API_KEY 你的 Dashscope API key
OPENAI_API_KEY 也可以使用(兼容性支持)
"""
    )

    # Required
    parser.add_argument(
        "--image", "-i",
        required=True,
        help="本地图片路径或图片URL"
    )

    # Optional
    parser.add_argument(
        "--api-key",
        help="Dashscope API key (也可通过 DASHSCOPE_API_KEY 环境变量设置)"
    )
    parser.add_argument(
        "--model", "-m",
        default=DEFAULT_MODEL,
        choices=["qwen-vl-plus", "qwen-vl-max"],
        help=f"使用的模型 (默认: {DEFAULT_MODEL})"
    )
    # "--prompt" is registered as an alias because the usage examples in this
    # file invoke the option under that name; without the alias argparse
    # rejects those documented commands as "unrecognized arguments".
    parser.add_argument(
        "--custom-prompt", "--prompt", "-p",
        dest="custom_prompt",
        help="自定义图片分析提示词"
    )

    # Analysis type (at most one of these may be given)
    analysis_group = parser.add_mutually_exclusive_group()
    analysis_group.add_argument(
        "--describe",
        action="store_true",
        default=True,
        help="描述图片内容 (默认行为)"
    )
    analysis_group.add_argument(
        "--extract-text", "-e",
        action="store_true",
        help="从图片提取文字 (OCR)"
    )
    analysis_group.add_argument(
        "--identify-objects", "-o",
        action="store_true",
        help="识别图片中的物体"
    )

    # Output
    parser.add_argument(
        "--compact",
        action="store_true",
        help="输出紧凑JSON (不缩进)"
    )

    args = parser.parse_args()

    try:
        result = execute(args)
        indent = None if args.compact else 2
        print(json.dumps(result, ensure_ascii=False, indent=indent))
        # SystemExit is not an Exception subclass, so the handlers below do
        # not intercept this exit.
        sys.exit(0)
    except ValueError as e:
        error_result = {
            "success": False,
            "error": {
                "type": "validation_error",
                "message": str(e)
            }
        }
        print(json.dumps(error_result, ensure_ascii=False, indent=2))
        sys.exit(2)
    except Exception as e:
        # Top-level boundary: report any remaining failure as JSON rather
        # than a traceback.
        error_result = {
            "success": False,
            "error": {
                "type": "execution_error",
                "message": str(e)
            }
        }
        print(json.dumps(error_result, ensure_ascii=False, indent=2))
        sys.exit(1)
# Run the CLI only when this file is executed as a script, so importing the
# module does not trigger argument parsing.
if __name__ == "__main__":
    main()