Files
X-Agents/account/admin/skills/image-understanding/scripts/main.py
2026-03-11 16:26:22 +08:00

390 lines
12 KiB
Python

#!/usr/bin/env python3
"""
image-understanding - Enable AI to understand and analyze images using vision API
This script allows users to analyze images by calling vision API (like OpenAI GPT-4 Vision).
It can describe image content, extract text, identify objects, and answer questions about images.
Usage:
python image_understanding.py --image path/to/image.jpg
python image_understanding.py --image https://example.com/image.png --custom-prompt "What objects are in this image?"
python image_understanding.py --image ./screenshot.png --extract-text --describe
"""
import argparse
import json
import os
import sys
from typing import Optional, Dict, Any, List
from pathlib import Path
def validate_image_path(image_path: str) -> str:
    """
    Validate an image reference and return it in normalized form.

    Remote references (``http://`` / ``https://``) are returned unchanged
    after an extension check. Local references must point to an existing
    regular file with a supported extension and are returned as an
    absolute path string.

    Args:
        image_path: Path to local image or URL

    Returns:
        The original URL, or the absolute path of the local file

    Raises:
        ValueError: If image path is invalid or file doesn't exist
    """
    # Local import, matching the function-scoped import style used elsewhere
    # in this file.
    from urllib.parse import urlparse

    # Ordered tuple (not a set) so the "Supported formats" message is
    # deterministic and so it can be passed straight to str.endswith().
    valid_extensions = ('.gif', '.jpeg', '.jpg', '.png', '.webp')

    if image_path.startswith(('http://', 'https://')):
        # Check the extension on the URL *path* so that query strings and
        # fragments (e.g. ".../photo.png?w=200") do not cause a rejection.
        try:
            url_path = urlparse(image_path).path
        except ValueError:
            # Malformed URL: fall back to the whole-string check only.
            url_path = ""
        # The whole-string check is kept so every URL that was accepted
        # before (e.g. ".../get?file=a.png") is still accepted.
        has_image_suffix = (
            image_path.lower().endswith(valid_extensions)
            or url_path.lower().endswith(valid_extensions)
        )
        if not has_image_suffix:
            raise ValueError(f"Invalid image URL format: {image_path}")
        return image_path

    # Local file path
    path = Path(image_path)
    if not path.exists():
        raise ValueError(f"Image file not found: {image_path}")
    if not path.is_file():
        raise ValueError(f"Path is not a file: {image_path}")

    # Check file extension (case-insensitive)
    if path.suffix.lower() not in valid_extensions:
        raise ValueError(
            f"Invalid image format: {path.suffix}. "
            f"Supported formats: {', '.join(valid_extensions)}"
        )
    return str(path.absolute())
def encode_image(image_path: str) -> str:
    """
    Read a file in binary mode and return its contents as base64 text.

    Args:
        image_path: Path to image file

    Returns:
        Base64 encoded image string (UTF-8 decoded)

    Raises:
        Exception: If the file cannot be read or encoded; the message is
            prefixed with "Failed to encode image: "
    """
    import base64

    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        return base64.b64encode(raw_bytes).decode('utf-8')
    except Exception as exc:
        # Any failure (missing file, permissions, ...) is reported with one
        # uniform message, as callers only see a generic Exception.
        raise Exception(f"Failed to encode image: {exc!s}")
def call_vision_api(api_key: str, image_path: str, prompt: str, model: str) -> Dict[str, Any]:
    """
    Call the OpenAI chat-completions endpoint with one text prompt and one image.

    A remote image (``http://`` / ``https://``) is passed by URL. A local
    image is base64-encoded via ``encode_image`` and embedded as a data URL
    whose MIME type is derived from the file extension.

    Args:
        api_key: OpenAI API key
        image_path: Path to image file or URL
        prompt: Custom prompt for analysis
        model: Model name to use

    Returns:
        API response as dictionary

    Raises:
        Exception: If the API returns an HTTP error status (message starts
            with "API error: " or "API request failed: "), or if the request
            itself fails (message starts with "Network error: ")
    """
    import requests

    # Prepare image content
    if image_path.startswith(('http://', 'https://')):
        image_url = image_path
    else:
        # Label the payload with the real image type instead of always
        # claiming JPEG; unknown suffixes fall back to the previous default.
        mime_by_suffix = {
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.webp': 'image/webp',
        }
        mime_type = mime_by_suffix.get(Path(image_path).suffix.lower(), 'image/jpeg')
        base64_image = encode_image(image_path)
        image_url = f"data:{mime_type};base64,{base64_image}"
    image_content = {"type": "image_url", "image_url": {"url": image_url}}

    # Build messages: a single user turn carrying the prompt and the image
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                image_content
            ]
        }
    ]
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": 1000
    }

    try:
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=60
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        # Default to the raw body; upgrade to the structured message when the
        # body is JSON of the form {"error": {"message": ...}}.
        error_msg = f"API request failed: {e.response.text}"
        try:
            error_data = e.response.json()
        except ValueError:
            # Body is not JSON: keep the raw-text message.
            error_data = None
        if isinstance(error_data, dict) and isinstance(error_data.get('error'), dict):
            error_msg = f"API error: {error_data['error'].get('message', str(error_data))}"
        raise Exception(error_msg) from e
    except requests.exceptions.RequestException as e:
        raise Exception(f"Network error: {str(e)}") from e
def analyze_image(
    api_key: str,
    image_path: str,
    custom_prompt: Optional[str] = None,
    describe: bool = True,
    extract_text: bool = False,
    identify_objects: bool = False,
    model: str = "gpt-4-vision-preview"
) -> Dict[str, Any]:
    """
    Build a prompt from the requested analysis types, call the vision API,
    and package the reply.

    Prompt selection:
      * a non-empty ``custom_prompt`` is sent as-is and overrides the flags;
      * otherwise the enabled flags become a numbered task list;
      * if no flag is enabled, a generic "comprehensive description" prompt
        is used.

    Args:
        api_key: OpenAI API key
        image_path: Path to image file or URL
        custom_prompt: Optional custom prompt
        describe: Whether to describe the image
        extract_text: Whether to extract text from image
        identify_objects: Whether to identify objects in image
        model: Model to use for analysis

    Returns:
        Dictionary with ``success``, ``image_path``, ``model``, ``analysis``
        and ``usage`` keys. Inside ``analysis``, ``description``,
        ``extracted_text`` and ``objects`` each hold the full response text
        when the corresponding flag is set (else ``None``); the reply is not
        split per task. ``full_response`` always holds the text.
    """
    # Only enabled tasks are listed, so the prompt never contains empty
    # numbered items for analyses that were not requested.
    requested_tasks = [
        task_text
        for enabled, task_text in (
            (describe, "Describe the image content in detail"),
            (extract_text, "Extract all visible text from the image"),
            (identify_objects, "List all identifiable objects and elements"),
        )
        if enabled
    ]

    if custom_prompt:
        prompt = custom_prompt
    elif requested_tasks:
        numbered_tasks = "\n".join(
            f"{number}. {task_text}"
            for number, task_text in enumerate(requested_tasks, start=1)
        )
        prompt = (
            "Please analyze this image and provide the following information:\n"
            f"{numbered_tasks}\n"
            "Please format your response as a structured analysis with clear sections."
        )
    else:
        # Default: full analysis.
        # NOTE(review): the prompt contains the Chinese word "场景" ("scene");
        # kept verbatim because it is runtime text sent to the model.
        prompt = "Provide a comprehensive description of this image, including all visible objects, people,场景, text, and any notable details."

    # Call API
    response = call_vision_api(api_key, image_path, prompt, model)

    # Parse response defensively: an empty/missing "choices" list or a null
    # "message"/"content"/"usage" yields empty values instead of raising.
    choices = response.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""
    usage = response.get("usage") or {}

    return {
        "success": True,
        "image_path": image_path,
        "model": model,
        "analysis": {
            "description": content if describe else None,
            "extracted_text": content if extract_text else None,
            "objects": content if identify_objects else None,
            "full_response": content
        },
        "usage": {
            "prompt_tokens": usage.get("prompt_tokens", 0),
            "completion_tokens": usage.get("completion_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0)
        }
    }
def execute(args: argparse.Namespace) -> Dict[str, Any]:
    """
    Resolve credentials and options from parsed CLI arguments and run the analysis.

    Args:
        args: Parsed command-line arguments

    Returns:
        Analysis result as dictionary

    Raises:
        ValueError: If no API key is available from ``--api-key`` or the
            ``OPENAI_API_KEY`` environment variable
    """
    # The command-line key wins; the environment variable is the fallback.
    key = args.api_key if args.api_key else os.environ.get("OPENAI_API_KEY")
    if not key:
        raise ValueError("API key not provided. Use --api-key or set OPENAI_API_KEY environment variable")

    validated_image = validate_image_path(args.image)

    # Plain description is the default and is switched off as soon as any
    # more specific request (OCR, object listing, custom prompt) is present.
    specific_request = args.extract_text or args.identify_objects or args.custom_prompt

    return analyze_image(
        api_key=key,
        image_path=validated_image,
        custom_prompt=args.custom_prompt,
        describe=not specific_request,
        extract_text=args.extract_text,
        identify_objects=args.identify_objects,
        model=args.model,
    )
def main():
    """
    Main entry point for the image understanding script.

    Parses command-line arguments, runs the analysis via ``execute`` and
    prints a JSON document to stdout in every case.

    Exit codes:
        0: success
        2: validation error (``ValueError``, e.g. bad path or missing API key)
        1: any other failure (API or network error)
    """
    parser = argparse.ArgumentParser(
        description="Analyze images using AI vision capabilities (OpenAI GPT-4 Vision)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Basic image description
python image_understanding.py --image photo.jpg
# Extract text from image
python image_understanding.py --image screenshot.png --extract-text
# Identify objects in image
python image_understanding.py --image photo.jpg --identify-objects
# Custom analysis with specific prompt
python image_understanding.py --image photo.jpg --prompt "What brand is this product?"
# Using API key from environment variable
export OPENAI_API_KEY=your_key
python image_understanding.py --image photo.jpg
# Using remote image URL
python image_understanding.py --image "https://example.com/photo.jpg" --describe
Environment Variables:
OPENAI_API_KEY Your OpenAI API key (can be used instead of --api-key)
"""
    )

    # Required arguments
    parser.add_argument(
        "--image",
        required=True,
        help="Path to local image file or URL of image"
    )

    # Optional arguments
    parser.add_argument(
        "--api-key",
        help="OpenAI API key (can also be set via OPENAI_API_KEY environment variable)"
    )
    parser.add_argument(
        "--model",
        default="gpt-4-vision-preview",
        help="Model to use for vision analysis (default: gpt-4-vision-preview)"
    )
    # "--prompt" is registered as an alias because the usage examples
    # advertise it; without the alias those documented commands are rejected
    # as unrecognized arguments. All spellings store to args.custom_prompt.
    parser.add_argument(
        "--custom-prompt",
        "--prompt",
        "-p",
        dest="custom_prompt",
        help="Custom prompt for image analysis"
    )

    # Analysis type flags: at most one of these may be given
    analysis_group = parser.add_mutually_exclusive_group()
    analysis_group.add_argument(
        "--describe",
        action="store_true",
        default=True,
        help="Describe the image content (default behavior)"
    )
    analysis_group.add_argument(
        "--extract-text",
        action="store_true",
        help="Extract text from the image (OCR)"
    )
    analysis_group.add_argument(
        "--identify-objects",
        action="store_true",
        help="Identify and list objects in the image"
    )

    # Output options
    parser.add_argument(
        "--compact",
        action="store_true",
        help="Output compact JSON (without indentation)"
    )

    args = parser.parse_args()

    # One indent setting for success and error output, so --compact is
    # honoured on failures too.
    indent = None if args.compact else 2

    def emit_error(error_type: str, message: str) -> None:
        """Print a failure document in the same JSON envelope as a success."""
        error_result = {
            "success": False,
            "error": {
                "type": error_type,
                "message": message
            }
        }
        print(json.dumps(error_result, ensure_ascii=False, indent=indent))

    try:
        result = execute(args)
    except ValueError as e:
        emit_error("validation_error", str(e))
        sys.exit(2)
    except Exception as e:
        # Top-level boundary: report any remaining failure as JSON rather
        # than a traceback.
        emit_error("execution_error", str(e))
        sys.exit(1)

    # Print result
    print(json.dumps(result, ensure_ascii=False, indent=indent))
    sys.exit(0)


if __name__ == "__main__":
    main()