Files
zjpb.net/utils/news_searcher.py
Jowe d7d21e19c9 release: v2.2.0 - 博查新闻搜索功能
新增功能:
- 集成博查Web Search API,自动获取网站相关新闻
- News模型添加source_name和source_icon字段
- 新闻管理后台界面优化
- 网站详情页新闻展示(标题、摘要、来源、链接)
- 定期任务脚本支持批量获取新闻
- 完整的API路由和测试脚本

技术实现:
- NewsSearcher工具类封装博查API
- 智能新闻搜索和去重机制
- 数据库迁移脚本migrate_news_fields.py
- API路由:/api/fetch-site-news 和 /api/fetch-all-news
- Cron任务脚本:fetch_news_cron.py

修改文件:
- config.py: 添加博查API配置
- models.py: News模型扩展
- app.py: 新闻获取路由和NewsAdmin优化
- templates/detail_new.html: 新闻展示UI

新增文件:
- utils/news_searcher.py (271行)
- migrate_news_fields.py (99行)
- fetch_news_cron.py (167行)
- test_news_feature.py (142行)
- NEWS_FEATURE_v2.2.md (408行)

统计:9个文件,1348行新增

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 22:04:35 +08:00

272 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
新闻搜索工具 - 使用博查 Web Search API
"""
import requests
import json
from datetime import datetime
from typing import List, Dict, Optional
class NewsSearcher:
    """Client for news lookups through the Bocha Web Search API.

    Sends queries to the ``/v1/web-search`` endpoint and converts the
    returned web pages into plain dictionaries.
    """

    def __init__(self, api_key: str, base_url: str = 'https://api.bocha.cn'):
        """Store the credentials and derive the search endpoint.

        Args:
            api_key: Bocha API key, sent as a Bearer token.
            base_url: Base URL of the API.
        """
        self.api_key = api_key
        self.base_url = base_url
        # Tolerate a trailing slash so the endpoint never contains "//v1".
        self.endpoint = f"{base_url.rstrip('/')}/v1/web-search"

    def search_news(
        self,
        query: str,
        count: int = 10,
        freshness: str = 'oneMonth',
        summary: bool = True,
        include: Optional[str] = None,
        exclude: Optional[str] = None
    ) -> Dict:
        """Run a search request and return the decoded JSON response.

        Args:
            query: Search keywords.
            count: Number of results to request (1-50).
            freshness: Time range; one of noLimit/oneDay/oneWeek/oneMonth/oneYear.
            summary: Whether the API should include a summary per result.
            include: Restrict the search to these sites (several domains
                separated by ``|`` or ``,``).
            exclude: Leave these sites out of the search (same format).

        Returns:
            The decoded JSON body on success. On a request failure, an HTTP
            error status or an undecodable body, a dictionary with
            ``success`` (False), ``error`` (message) and ``code`` (HTTP
            status if a response was received, otherwise None).
        """
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json'
        }
        payload = {
            'query': query,
            'count': count,
            'freshness': freshness,
            'summary': summary
        }
        # Optional filters are only sent when they carry a value.
        if include:
            payload['include'] = include
        if exclude:
            payload['exclude'] = exclude

        # Bound up front so the error path can tell "no response at all"
        # (connection failure, timeout) from "response with a bad status".
        response = None
        try:
            response = requests.post(
                self.endpoint,
                headers=headers,
                data=json.dumps(payload),
                timeout=30
            )
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON on requests
            # versions whose decode error is not a RequestException.
            return {
                'success': False,
                'error': str(e),
                'code': getattr(response, 'status_code', None)
            }

    def parse_news_items(self, search_result: Dict) -> List[Dict]:
        """Convert a search response into a list of news dictionaries.

        Args:
            search_result: Dictionary returned by ``search_news``.

        Returns:
            One dictionary per result with the keys title, url, snippet,
            summary, site_name, site_icon, published_at, display_url and
            language. An empty list is returned when the expected
            ``data -> webPages -> value`` structure is missing or null.
        """
        # Walk the nested structure defensively: any level may be absent or
        # null (for example in the error dictionary built by search_news).
        if not isinstance(search_result, dict):
            return []
        data = search_result.get('data')
        if not isinstance(data, dict):
            return []
        web_pages = data.get('webPages')
        if not isinstance(web_pages, dict):
            return []
        entries = web_pages.get('value')
        if not isinstance(entries, (list, tuple)):
            return []

        return [
            {
                'title': entry.get('name', ''),
                'url': entry.get('url', ''),
                'snippet': entry.get('snippet', ''),
                'summary': entry.get('summary', ''),
                'site_name': entry.get('siteName', ''),
                'site_icon': entry.get('siteIcon', ''),
                'published_at': self._parse_date(entry.get('datePublished')),
                'display_url': entry.get('displayUrl', ''),
                'language': entry.get('language', ''),
            }
            for entry in entries
            if isinstance(entry, dict)
        ]

    def search_site_news(
        self,
        site_name: str,
        site_url: Optional[str] = None,
        count: int = 10,
        freshness: str = 'oneMonth'
    ) -> List[Dict]:
        """Search for recent news about a given site.

        Args:
            site_name: Site name, used as the search keyword.
            site_url: Optional site URL; its host is excluded from the
                search so the site's own pages are not returned.
            count: Number of results to request.
            freshness: Time range passed through to ``search_news``.

        Returns:
            News dictionaries as produced by ``parse_news_items``.
        """
        # Query is "<site name> 最新 新闻" (i.e. "latest news").
        query = f"{site_name} 最新 新闻"
        exclude = self._extract_domain(site_url) if site_url else None

        search_result = self.search_news(
            query=query,
            count=count,
            freshness=freshness,
            summary=True,
            exclude=exclude
        )
        return self.parse_news_items(search_result)

    @staticmethod
    def _extract_domain(site_url: Optional[str]) -> Optional[str]:
        """Return the host of ``site_url`` without a leading ``www.``.

        Port, credentials and path are dropped and the host is lower-cased
        (``urlparse(...).hostname`` behaviour). URLs written without a
        scheme, such as ``www.example.com/path``, are accepted as well.
        Returns None when no host can be determined.
        """
        from urllib.parse import urlparse

        if not isinstance(site_url, str):
            return None
        candidate = site_url.strip()
        if not candidate:
            return None
        try:
            parsed = urlparse(candidate)
            if not parsed.netloc:
                # Without "//" urlparse puts the host into the path (or the
                # scheme); re-parse as a network-location reference.
                parsed = urlparse('//' + candidate)
            host = parsed.hostname
        except ValueError:
            # Raised for malformed authorities such as a broken IPv6 literal.
            return None
        if not host:
            return None
        # Strip only a leading "www." - never an occurrence inside the host.
        if host.startswith('www.'):
            host = host[len('www.'):]
        return host or None

    def _parse_date(self, date_str: Optional[str]) -> Optional[datetime]:
        """Parse an ISO 8601 timestamp such as ``2025-02-23T08:18:30+08:00``.

        Positive and negative UTC offsets, a trailing ``Z`` and timestamps
        without an offset are accepted.

        Args:
            date_str: Timestamp string, or None.

        Returns:
            The parsed ``datetime`` (timezone-aware when the input carries an
            offset), or None if the value is empty or cannot be parsed.
        """
        if not date_str or not isinstance(date_str, str):
            return None
        text = date_str.strip()
        # fromisoformat only understands the "Z" suffix from Python 3.11 on.
        if text.endswith(('Z', 'z')):
            text = text[:-1] + '+00:00'
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            pass
        # Fallbacks for variants fromisoformat may reject on older Pythons
        # (offset without a colon, non-zero-padded fields).
        for fmt in ('%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        return None

    def format_news_for_display(self, news_items: List[Dict]) -> List[Dict]:
        """Reshape parsed news items for the front end.

        Args:
            news_items: Items as produced by ``parse_news_items``; each must
                contain ``title`` and ``url``.

        Returns:
            Dictionaries with the keys title, url, description, source,
            published_date and icon.

        Raises:
            KeyError: If an item lacks ``title`` or ``url``.
        """
        return [
            {
                'title': item['title'],
                'url': item['url'],
                'description': item.get('summary') or item.get('snippet') or '',
                # parse_news_items stores '' for a missing site name, so the
                # placeholder has to replace empty values, not only absent keys.
                'source': item.get('site_name') or '未知来源',
                'published_date': self._format_date(item.get('published_at')),
                'icon': item.get('site_icon') or ''
            }
            for item in news_items
        ]

    def _format_date(self, dt: Optional[datetime]) -> str:
        """Format a datetime as ``YYYY-MM-DD`` for display.

        Args:
            dt: Datetime to format, or None.

        Returns:
            The formatted date, or the placeholder ``未知日期`` when ``dt`` is
            empty or cannot be formatted.
        """
        if not dt:
            return '未知日期'
        try:
            return dt.strftime('%Y-%m-%d')
        except Exception:
            # Deliberately best-effort: a display helper must never raise.
            return '未知日期'
# Manual smoke test: run this module directly to query the live API.
if __name__ == '__main__':
    import os
    import sys

    from dotenv import load_dotenv

    load_dotenv()

    # Read the API key from the environment (load_dotenv may have populated
    # it from a .env file).
    api_key = os.environ.get('BOCHA_API_KEY')
    if not api_key:
        print("错误未设置BOCHA_API_KEY环境变量")
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is missing when Python runs with -S.
        sys.exit(1)

    searcher = NewsSearcher(api_key)

    print("正在搜索ChatGPT 最新新闻...")
    news_items = searcher.search_site_news(
        site_name='ChatGPT',
        count=5,
        freshness='oneWeek'
    )

    print(f"\n找到 {len(news_items)} 条新闻:\n")
    for i, news in enumerate(news_items, 1):
        # The 'summary' key is always present (possibly empty or None), so
        # fall back to the snippet on falsy values rather than on a missing key.
        excerpt = news.get('summary') or news.get('snippet') or ''
        print(f"{i}. {news['title']}")
        print(f" 来源:{news['site_name']}")
        print(f" 日期:{searcher._format_date(news['published_at'])}")
        print(f" URL{news['url']}")
        print(f" 摘要:{excerpt[:100]}...")
        print()