# Source listing: Python, 234 lines, 6.6 KiB
"""
|
|
网页获取工具
|
|
提供安全的网页内容抓取功能
|
|
"""
|
|
import httpx
|
|
from typing import Dict, Any, Optional
|
|
|
|
|
|
class WebToolConfig:
    """Configuration constants for the web tools."""

    # Default request timeout, in seconds.
    REQUEST_TIMEOUT = 30

    # Largest response body accepted, in bytes (2 MB).
    MAX_RESPONSE_SIZE = 2 * 1024 ** 2

    # Maximum number of redirects to follow.
    MAX_REDIRECTS = 5

    # URL schemes that are permitted.
    ALLOWED_PROTOCOLS = ["http", "https"]
|
|
|
|
|
|
async def web_fetch(
    url: str,
    method: str = "GET",
    params: Optional[Dict[str, Any]] = None,
    headers: Optional[Dict[str, str]] = None,
    body: Optional[str] = None,
    timeout: Optional[int] = None
) -> Dict[str, Any]:
    """
    Fetch the content of a web URL.

    Errors are reported through the returned dict rather than raised.

    Args:
        url: Target URL; must start with "http://" or "https://"
            (the scheme is matched case-insensitively).
        method: HTTP method.
        params: Query parameters.
        headers: Request headers.
        body: Request body, sent as the raw request content.
        timeout: Timeout in seconds; a falsy value (None or 0) selects
            WebToolConfig.REQUEST_TIMEOUT.

    Returns:
        On success, a dict with "success" (True), "url", "status_code",
        "content_type", "headers", and either "data" (the parsed JSON body,
        when the content type contains "application/json" and the body
        parses) or "content" (the response text).
        On failure, a dict with "success" (False) and "error".
    """
    timeout = timeout or WebToolConfig.REQUEST_TIMEOUT

    # Safety check: only plain HTTP(S) URLs are accepted. A non-string URL is
    # rejected here as well instead of raising AttributeError.
    if not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
        return {
            "success": False,
            "error": "Only HTTP and HTTPS protocols are allowed"
        }

    try:
        async with httpx.AsyncClient(
            timeout=timeout,
            max_redirects=WebToolConfig.MAX_REDIRECTS,
            follow_redirects=True,
        ) as client:
            # Send the request.
            response = await client.request(
                method=method,
                url=url,
                params=params,
                headers=headers,
                content=body,
            )

            # Check the response size.
            # NOTE: the body has already been downloaded at this point, so
            # this bounds what is returned to the caller, not what was read.
            body_size = len(response.content)
            if body_size > WebToolConfig.MAX_RESPONSE_SIZE:
                return {
                    "success": False,
                    "error": f"Response too large: {body_size} bytes (max {WebToolConfig.MAX_RESPONSE_SIZE})"
                }

            content_type = response.headers.get("content-type", "")
            result = {
                "success": True,
                "url": str(response.url),
                "status_code": response.status_code,
                "content_type": content_type,
            }

            # Try to parse JSON; fall back to text when the body is not
            # valid JSON despite the declared content type.
            parsed_json = False
            if "application/json" in content_type:
                try:
                    result["data"] = response.json()
                    parsed_json = True
                except ValueError:
                    # JSONDecodeError and UnicodeDecodeError are both
                    # ValueError subclasses; anything else (for example
                    # task cancellation) must not be swallowed here.
                    pass

            if not parsed_json:
                result["content"] = response.text[:WebToolConfig.MAX_RESPONSE_SIZE]

            result["headers"] = dict(response.headers)
            return result

    except httpx.TimeoutException:
        return {
            "success": False,
            "error": f"Request timeout ({timeout}s)"
        }
    except httpx.TooManyRedirects:
        # httpx raises TooManyRedirects once max_redirects is exceeded.
        return {
            "success": False,
            "error": "Too many redirects"
        }
    except httpx.InvalidURL:
        return {
            "success": False,
            "error": "Invalid URL"
        }
    except Exception as e:
        return {
            "success": False,
            "error": str(e)
        }
|
|
|
|
|
|
def _iter_text_topics(topics):
    """Yield the dict entries of *topics* that have a "Text" key, descending into nested "Topics" lists."""
    for entry in topics:
        if not isinstance(entry, dict):
            continue
        if "Text" in entry:
            yield entry
        else:
            nested = entry.get("Topics")
            if isinstance(nested, list):
                yield from _iter_text_topics(nested)


async def web_search(
    query: str,
    max_results: int = 5
) -> Dict[str, Any]:
    """
    Search the web through the DuckDuckGo JSON API.

    Errors are reported through the returned dict rather than raised.

    Args:
        query: Search keywords.
        max_results: Maximum number of results to return; negative values
            are treated as 0 and None means no limit.

    Returns:
        On success, a dict with "success" (True), "query", "results" (a list
        of dicts with "title", "content" and "url") and "count".
        On failure, a dict with "success" (False) and "error".
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(
                "https://api.duckduckgo.com/",
                params={
                    "q": query,
                    "format": "json",
                    "no_html": 1,
                    "skip_disambig": 1
                }
            )

        if response.status_code != 200:
            return {
                "success": False,
                "error": f"Search API returned status {response.status_code}"
            }

        data = response.json()
        topics = data.get("RelatedTopics") if isinstance(data, dict) else None
        limit = None if max_results is None else max(max_results, 0)

        results = []
        # The limit is applied after filtering, so entries without "Text"
        # (such as topic groups) do not consume result slots.
        for entry in _iter_text_topics(topics or []):
            if limit is not None and len(results) >= limit:
                break
            text = entry.get("Text") or ""
            # The title is the part before the first " - ", when present.
            head, separator, _ = text.partition(" - ")
            results.append({
                "title": head if separator else "",
                "content": text,
                # Related topics carry their link under "FirstURL"; "URL" is
                # kept as a fallback for compatibility.
                "url": entry.get("FirstURL") or entry.get("URL") or ""
            })

        return {
            "success": True,
            "query": query,
            "results": results,
            "count": len(results)
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e)
        }
|
|
|
|
|
|
# Tool definitions
WEB_FETCH_TOOL = {
    "name": "web_fetch",
    "description": "Fetch content from a web URL. Supports GET, POST methods and can return JSON or text content.",
    "parameters": {
        "type": "object",
        "properties": {
            "url": {"type": "string", "description": "The URL to fetch"},
            "method": {"type": "string", "description": "HTTP method (GET, POST)", "default": "GET"},
            "params": {"type": "object", "description": "Query parameters"},
            "headers": {"type": "object", "description": "Request headers"},
            "body": {"type": "string", "description": "Request body (for POST)"},
            "timeout": {"type": "integer", "description": "Request timeout in seconds", "default": 30},
        },
        "required": ["url"],
    },
}
|
|
|
|
# Schema describing the web_search tool and its parameters.
WEB_SEARCH_TOOL = {
    "name": "web_search",
    "description": "Search the web for information. Use this when you need to find current information or facts.",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "The search query"},
            "max_results": {"type": "integer", "description": "Maximum number of results to return", "default": 5},
        },
        "required": ["query"],
    },
}
|