""" 网页获取工具 提供安全的网页内容抓取功能 """ import httpx from typing import Dict, Any, Optional class WebToolConfig: """网页工具配置""" REQUEST_TIMEOUT = 30 # 请求超时(秒) MAX_RESPONSE_SIZE = 2 * 1024 * 1024 # 最大响应大小(2MB) MAX_REDIRECTS = 5 # 最大重定向次数 ALLOWED_PROTOCOLS = ["http", "https"] # 允许的协议 async def web_fetch( url: str, method: str = "GET", params: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, body: Optional[str] = None, timeout: Optional[int] = None ) -> Dict[str, Any]: """ 获取网页内容 Args: url: 目标URL method: HTTP方法 params: 查询参数 headers: 请求头 body: 请求体 timeout: 超时时间 Returns: 网页内容 """ timeout = timeout or WebToolConfig.REQUEST_TIMEOUT # 安全检查:协议 if not url.startswith(("http://", "https://")): return { "success": False, "error": "Only HTTP and HTTPS protocols are allowed" } try: async with httpx.AsyncClient( timeout=timeout, max_redirects=WebToolConfig.MAX_REDIRECTS, follow_redirects=True, ) as client: # 发送请求 response = await client.request( method=method, url=url, params=params, headers=headers, content=body, ) # 检查响应大小 if len(response.content) > WebToolConfig.MAX_RESPONSE_SIZE: return { "success": False, "error": f"Response too large: {len(response.content)} bytes (max {WebToolConfig.MAX_RESPONSE_SIZE})" } # 尝试解析JSON content_type = response.headers.get("content-type", "") if "application/json" in content_type: try: data = response.json() return { "success": True, "url": str(response.url), "status_code": response.status_code, "content_type": content_type, "data": data, "headers": dict(response.headers) } except: pass # 返回文本 return { "success": True, "url": str(response.url), "status_code": response.status_code, "content_type": content_type, "content": response.text[:WebToolConfig.MAX_RESPONSE_SIZE], "headers": dict(response.headers) } except httpx.TimeoutException: return { "success": False, "error": f"Request timeout ({timeout}s)" } except httpx.RedirectLoop: return { "success": False, "error": "Too many redirects" } except httpx.InvalidURL: return { "success": False, "error": "Invalid URL" } except Exception as e: return { "success": False, "error": str(e) } async def web_search( query: str, max_results: int = 5 ) -> Dict[str, Any]: """ 搜索网页 Args: query: 搜索关键词 max_results: 最大结果数 Returns: 搜索结果 """ try: async with httpx.AsyncClient(timeout=10.0) as client: response = await client.get( "https://api.duckduckgo.com/", params={ "q": query, "format": "json", "no_html": 1, "skip_disambig": 1 } ) if response.status_code == 200: data = response.json() results = [] if "RelatedTopics" in data: for item in data["RelatedTopics"][:max_results]: if "Text" in item: text = item.get("Text", "") results.append({ "title": text.split(" - ")[0] if " - " in text else "", "content": text, "url": item.get("URL", "") }) return { "success": True, "query": query, "results": results, "count": len(results) } else: return { "success": False, "error": f"Search API returned status {response.status_code}" } except Exception as e: return { "success": False, "error": str(e) } # 工具定义 WEB_FETCH_TOOL = { "name": "web_fetch", "description": "Fetch content from a web URL. Supports GET, POST methods and can return JSON or text content.", "parameters": { "type": "object", "properties": { "url": { "type": "string", "description": "The URL to fetch" }, "method": { "type": "string", "description": "HTTP method (GET, POST)", "default": "GET" }, "params": { "type": "object", "description": "Query parameters" }, "headers": { "type": "object", "description": "Request headers" }, "body": { "type": "string", "description": "Request body (for POST)" }, "timeout": { "type": "integer", "description": "Request timeout in seconds", "default": 30 } }, "required": ["url"] } } WEB_SEARCH_TOOL = { "name": "web_search", "description": "Search the web for information. Use this when you need to find current information or facts.", "parameters": { "type": "object", "properties": { "query": { "type": "string", "description": "The search query" }, "max_results": { "type": "integer", "description": "Maximum number of results to return", "default": 5 } }, "required": ["query"] } }