新增功能: - 集成博查Web Search API,自动获取网站相关新闻 - News模型添加source_name和source_icon字段 - 新闻管理后台界面优化 - 网站详情页新闻展示(标题、摘要、来源、链接) - 定期任务脚本支持批量获取新闻 - 完整的API路由和测试脚本 技术实现: - NewsSearcher工具类封装博查API - 智能新闻搜索和去重机制 - 数据库迁移脚本migrate_news_fields.py - API路由:/api/fetch-site-news 和 /api/fetch-all-news - Cron任务脚本:fetch_news_cron.py 修改文件: - config.py: 添加博查API配置 - models.py: News模型扩展 - app.py: 新闻获取路由和NewsAdmin优化 - templates/detail_new.html: 新闻展示UI 新增文件: - utils/news_searcher.py (271行) - migrate_news_fields.py (99行) - fetch_news_cron.py (167行) - test_news_feature.py (142行) - NEWS_FEATURE_v2.2.md (408行) 统计:9个文件,1348行新增 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
272 lines
7.8 KiB
Python
272 lines
7.8 KiB
Python
"""
|
||
新闻搜索工具 - 使用博查 Web Search API
|
||
"""
|
||
import requests
|
||
import json
|
||
from datetime import datetime
|
||
from typing import List, Dict, Optional
|
||
|
||
|
||
class NewsSearcher:
    """Bocha Web Search API client for fetching site-related news.

    Wraps the Bocha ``/v1/web-search`` endpoint and provides helpers to
    parse the raw response, search news about a specific site, and format
    articles for front-end display.
    """

    def __init__(self, api_key: str, base_url: str = 'https://api.bocha.cn'):
        """Initialize the news searcher.

        Args:
            api_key: Bocha API key, sent as a Bearer token.
            base_url: API base URL (no trailing slash).
        """
        self.api_key = api_key
        self.base_url = base_url
        self.endpoint = f"{base_url}/v1/web-search"

    def search_news(
        self,
        query: str,
        count: int = 10,
        freshness: str = 'oneMonth',
        summary: bool = True,
        include: Optional[str] = None,
        exclude: Optional[str] = None
    ) -> Dict:
        """Run a web search against the Bocha API.

        Args:
            query: Search keywords.
            count: Number of results to return (1-50).
            freshness: Time window (noLimit/oneDay/oneWeek/oneMonth/oneYear).
            summary: Whether the API should include article summaries.
            include: Restrict search to these domains ('|' or ',' separated).
            exclude: Exclude these domains ('|' or ',' separated).

        Returns:
            The raw API response dict on success, or
            ``{'success': False, 'error': str, 'code': int | None}`` when
            the request fails.
        """
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }

        payload = {
            'query': query,
            'count': count,
            'freshness': freshness,
            'summary': summary,
        }

        # Optional domain filters are only sent when provided.
        if include:
            payload['include'] = include
        if exclude:
            payload['exclude'] = exclude

        try:
            response = requests.post(
                self.endpoint,
                headers=headers,
                json=payload,  # requests serializes and sets charset itself
                timeout=30,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            # RequestException carries .response (populated for HTTP errors,
            # None for connection/timeout failures) — no locals() probing.
            resp = getattr(e, 'response', None)
            return {
                'success': False,
                'error': str(e),
                'code': resp.status_code if resp is not None else None,
            }

    def parse_news_items(self, search_result: Dict) -> List[Dict]:
        """Flatten a Bocha search response into a list of news dicts.

        Args:
            search_result: Raw dict returned by the Bocha API.

        Returns:
            List of dicts with keys: title, url, snippet, summary,
            site_name, site_icon, published_at, display_url, language.
            Empty list when the response lacks the expected structure.
        """
        news_items = []

        # Defensive: error responses have no 'data' envelope.
        if 'data' not in search_result:
            return news_items

        data = search_result['data']
        if 'webPages' not in data or 'value' not in data['webPages']:
            return news_items

        for item in data['webPages']['value']:
            news_items.append({
                'title': item.get('name', ''),
                'url': item.get('url', ''),
                'snippet': item.get('snippet', ''),
                'summary': item.get('summary', ''),
                'site_name': item.get('siteName', ''),
                'site_icon': item.get('siteIcon', ''),
                'published_at': self._parse_date(item.get('datePublished')),
                'display_url': item.get('displayUrl', ''),
                'language': item.get('language', ''),
            })

        return news_items

    def search_site_news(
        self,
        site_name: str,
        site_url: Optional[str] = None,
        count: int = 10,
        freshness: str = 'oneMonth'
    ) -> List[Dict]:
        """Search for news about a specific website.

        Args:
            site_name: Site name used to build the search keywords.
            site_url: Optional site URL; its domain is excluded from
                results so the site's own pages don't dominate.
            count: Number of results to return.
            freshness: Time window passed through to the API.

        Returns:
            Parsed list of news dicts (see parse_news_items).
        """
        # Keywords: site name + "latest" + "news" (in Chinese).
        query = f"{site_name} 最新 新闻"

        # If a URL is given, exclude the site's own domain from results.
        exclude = None
        if site_url:
            try:
                from urllib.parse import urlparse
                parsed = urlparse(site_url)
                # URLs without a scheme parse with an empty netloc; fall
                # back to the path in that case.
                domain = parsed.netloc or parsed.path
                # Strip only a leading "www." — str.replace('www.', '')
                # would also corrupt domains containing "www." elsewhere.
                if domain.startswith('www.'):
                    domain = domain[len('www.'):]
                exclude = domain
            except Exception:
                # Best effort: an unparsable URL just means no exclusion.
                pass

        search_result = self.search_news(
            query=query,
            count=count,
            freshness=freshness,
            summary=True,
            exclude=exclude
        )

        return self.parse_news_items(search_result)

    def _parse_date(self, date_str: Optional[str]) -> Optional[datetime]:
        """Parse a date string from the API into a datetime.

        Args:
            date_str: ISO-8601-style string, e.g. 2025-02-23T08:18:30+08:00.

        Returns:
            A datetime (timezone-aware when an offset is present), or
            None when the string is missing or unparsable.
        """
        if not date_str:
            return None

        try:
            # fromisoformat handles both naive timestamps and any UTC
            # offset (+08:00, -05:00); the Z replacement covers "Zulu"
            # timestamps that fromisoformat rejects before Python 3.11.
            return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        except ValueError:
            pass
        try:
            # Last-resort fallback for bare 'YYYY-MM-DDTHH:MM:SS' variants.
            return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S')
        except Exception:
            return None

    def format_news_for_display(self, news_items: List[Dict]) -> List[Dict]:
        """Convert parsed news dicts into the shape the front end renders.

        Args:
            news_items: Output of parse_news_items.

        Returns:
            List of dicts with keys: title, url, description, source,
            published_date, icon.
        """
        formatted_news = []

        for item in news_items:
            formatted_news.append({
                'title': item['title'],
                'url': item['url'],
                # Prefer the richer summary; fall back to the snippet.
                'description': item.get('summary') or item.get('snippet', ''),
                'source': item.get('site_name', '未知来源'),
                'published_date': self._format_date(item.get('published_at')),
                'icon': item.get('site_icon', ''),
            })

        return formatted_news

    def _format_date(self, dt: Optional[datetime]) -> str:
        """Format a datetime for display.

        Args:
            dt: datetime to format, or None.

        Returns:
            'YYYY-MM-DD', or the Chinese "unknown date" placeholder when
            dt is missing or cannot be formatted.
        """
        if not dt:
            return '未知日期'

        try:
            return dt.strftime('%Y-%m-%d')
        except Exception:
            return '未知日期'
|
||
|
||
|
||
# Manual smoke test: run this module directly to query the Bocha API.
if __name__ == '__main__':
    import os
    from dotenv import load_dotenv

    load_dotenv()

    # The API key must come from the environment / .env file.
    key = os.environ.get('BOCHA_API_KEY')
    if not key:
        print("错误:未设置BOCHA_API_KEY环境变量")
        exit(1)

    searcher = NewsSearcher(key)

    # Sample query: recent news about ChatGPT.
    print("正在搜索:ChatGPT 最新新闻...")
    results = searcher.search_site_news(
        site_name='ChatGPT', count=5, freshness='oneWeek'
    )

    print(f"\n找到 {len(results)} 条新闻:\n")
    for idx, article in enumerate(results, start=1):
        print(f"{idx}. {article['title']}")
        print(f" 来源:{article['site_name']}")
        print(f" 日期:{searcher._format_date(article['published_at'])}")
        print(f" URL:{article['url']}")
        print(f" 摘要:{article.get('summary', article.get('snippet', ''))[:100]}...")
        print()