""" 新闻搜索工具 - 使用博查 Web Search API """ import requests import json from datetime import datetime from typing import List, Dict, Optional class NewsSearcher: """博查新闻搜索器""" def __init__(self, api_key: str, base_url: str = 'https://api.bocha.cn'): """ 初始化新闻搜索器 Args: api_key: 博查API密钥 base_url: API基础URL """ self.api_key = api_key self.base_url = base_url self.endpoint = f"{base_url}/v1/web-search" def search_news( self, query: str, count: int = 10, freshness: str = 'oneMonth', summary: bool = True, include: Optional[str] = None, exclude: Optional[str] = None ) -> Dict: """ 搜索新闻 Args: query: 搜索关键词 count: 返回结果数量(1-50) freshness: 时间范围(noLimit/oneDay/oneWeek/oneMonth/oneYear) summary: 是否显示摘要 include: 指定搜索的网站范围(多个域名用|或,分隔) exclude: 排除搜索的网站范围(多个域名用|或,分隔) Returns: 搜索结果字典 """ headers = { 'Authorization': f'Bearer {self.api_key}', 'Content-Type': 'application/json' } payload = { 'query': query, 'count': count, 'freshness': freshness, 'summary': summary } # 添加可选参数 if include: payload['include'] = include if exclude: payload['exclude'] = exclude try: response = requests.post( self.endpoint, headers=headers, data=json.dumps(payload), timeout=30 ) response.raise_for_status() return response.json() except requests.exceptions.RequestException as e: return { 'success': False, 'error': str(e), 'code': getattr(response, 'status_code', None) if 'response' in locals() else None } def parse_news_items(self, search_result: Dict) -> List[Dict]: """ 解析搜索结果为新闻列表 Args: search_result: 博查API返回的搜索结果 Returns: 新闻列表,每个新闻包含:title, url, snippet, summary, site_name, published_at等 """ news_items = [] # 检查返回数据格式 if 'data' not in search_result: return news_items data = search_result['data'] if 'webPages' not in data or 'value' not in data['webPages']: return news_items # 解析每条新闻 for item in data['webPages']['value']: news_item = { 'title': item.get('name', ''), 'url': item.get('url', ''), 'snippet': item.get('snippet', ''), 'summary': item.get('summary', ''), 'site_name': item.get('siteName', ''), 'site_icon': item.get('siteIcon', ''), 'published_at': self._parse_date(item.get('datePublished')), 'display_url': item.get('displayUrl', ''), 'language': item.get('language', ''), } news_items.append(news_item) return news_items def search_site_news( self, site_name: str, site_url: Optional[str] = None, count: int = 10, freshness: str = 'oneMonth' ) -> List[Dict]: """ 搜索特定网站的相关新闻 Args: site_name: 网站名称(用于搜索关键词) site_url: 网站URL(可选,用于排除网站自身) count: 返回结果数量 freshness: 时间范围 Returns: 新闻列表 """ # 构建搜索关键词:网站名称 + "最新" + "新闻" query = f"{site_name} 最新 新闻" # 如果提供了网站URL,排除网站自身的结果 exclude = None if site_url: # 提取域名 try: from urllib.parse import urlparse parsed = urlparse(site_url) domain = parsed.netloc or parsed.path # 移除 www. 前缀 domain = domain.replace('www.', '') exclude = domain except Exception: pass # 执行搜索 search_result = self.search_news( query=query, count=count, freshness=freshness, summary=True, exclude=exclude ) # 解析结果 return self.parse_news_items(search_result) def _parse_date(self, date_str: Optional[str]) -> Optional[datetime]: """ 解析日期字符串 Args: date_str: 日期字符串(例如:2025-02-23T08:18:30+08:00) Returns: datetime对象,如果解析失败返回None """ if not date_str: return None try: # 尝试解析 ISO 8601 格式 # 博查API返回格式:2025-02-23T08:18:30+08:00 if '+' in date_str or 'Z' in date_str: # 使用 fromisoformat(Python 3.7+) return datetime.fromisoformat(date_str.replace('Z', '+00:00')) else: # 简单格式 return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') except Exception: # 如果解析失败,返回None return None def format_news_for_display(self, news_items: List[Dict]) -> List[Dict]: """ 格式化新闻用于前端展示 Args: news_items: 新闻列表 Returns: 格式化后的新闻列表 """ formatted_news = [] for item in news_items: formatted_item = { 'title': item['title'], 'url': item['url'], 'description': item.get('summary') or item.get('snippet', ''), 'source': item.get('site_name', '未知来源'), 'published_date': self._format_date(item.get('published_at')), 'icon': item.get('site_icon', '') } formatted_news.append(formatted_item) return formatted_news def _format_date(self, dt: Optional[datetime]) -> str: """ 格式化日期用于显示 Args: dt: datetime对象 Returns: 格式化的日期字符串 """ if not dt: return '未知日期' try: # 返回格式:2025-01-30 return dt.strftime('%Y-%m-%d') except Exception: return '未知日期' # 测试代码 if __name__ == '__main__': import os from dotenv import load_dotenv load_dotenv() # 从环境变量获取API密钥 api_key = os.environ.get('BOCHA_API_KEY') if not api_key: print("错误:未设置BOCHA_API_KEY环境变量") exit(1) # 创建搜索器 searcher = NewsSearcher(api_key) # 测试搜索 print("正在搜索:ChatGPT 最新新闻...") news_items = searcher.search_site_news( site_name='ChatGPT', count=5, freshness='oneWeek' ) # 显示结果 print(f"\n找到 {len(news_items)} 条新闻:\n") for i, news in enumerate(news_items, 1): print(f"{i}. {news['title']}") print(f" 来源:{news['site_name']}") print(f" 日期:{searcher._format_date(news['published_at'])}") print(f" URL:{news['url']}") print(f" 摘要:{news.get('summary', news.get('snippet', ''))[:100]}...") print()