Files
zjpb.net/utils/news_searcher.py
ZJPB Admin b00e52e1e0 release: v2.2.0 - 博查新闻搜索功能 (生产环境部署版)
核心功能:
  - 集成博查Web Search API自动获取网站相关新闻
  - 智能新闻更新机制(每日首次访问触发)
  - 精确新闻搜索(使用引号强制匹配网站名称)
  - News模型扩展(source_name, source_icon字段)
  - 网站详情页新闻展示模块
  - 新闻来源网站信息展示
  - 自动去重防止重复新闻

  技术实现:
  - NewsSearcher工具类封装博查API
  - 数据库迁移脚本migrate_news_fields.py
  - 测试脚本test_news_feature.py
  - 定期任务脚本fetch_news_cron.py
  - API路由:/api/fetch-site-news, /api/fetch-all-news

  配置优化:
  - 修复manage.sh路径和启动命令
  - 博查API配置(BOCHA_API_KEY, BOCHA_BASE_URL)
  - 新闻搜索参数配置

  界面优化:
  - 详情页新闻模块(左侧主栏)
  - 相似推荐模块(右侧边栏)
  - 首页标签图标修复
  - 后台添加修改密码功能
  - 登录页面优化

  部署信息:
  - 部署日期: 2025-12-30
  - 部署方式: 手动上传文件
  - 数据库: 已迁移(添加source_name和source_icon字段)
2025-12-30 23:44:27 +08:00

272 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
新闻搜索工具 - 使用博查 Web Search API
"""
import requests
import json
from datetime import datetime
from typing import List, Dict, Optional
class NewsSearcher:
    """Client for news lookups through the Bocha Web Search API.

    The class wraps the HTTP request (``search_news``), flattens the raw
    response into plain dictionaries (``parse_news_items``) and reshapes
    those dictionaries for front-end display (``format_news_for_display``).
    """

    def __init__(self, api_key: str, base_url: str = 'https://api.bocha.cn'):
        """Store the credentials and derive the search endpoint.

        Args:
            api_key: Bocha API key, sent as a Bearer token on every request.
            base_url: Base URL of the API; a trailing slash is tolerated.
        """
        self.api_key = api_key
        self.base_url = base_url
        # Strip trailing slashes so a configured "https://host/" does not
        # turn into "https://host//v1/web-search".
        self.endpoint = f"{base_url.rstrip('/')}/v1/web-search"

    def search_news(
        self,
        query: str,
        count: int = 10,
        freshness: str = 'oneMonth',
        summary: bool = True,
        include: Optional[str] = None,
        exclude: Optional[str] = None
    ) -> Dict:
        """Send one search request to the API.

        Args:
            query: Search keywords.
            count: Number of results to request (1-50).
            freshness: Time range, one of
                noLimit/oneDay/oneWeek/oneMonth/oneYear.
            summary: Whether to request a summary for each result.
            include: Domains to restrict the search to (separate several
                domains with ``|`` or ``,``).
            exclude: Domains to leave out of the search (separate several
                domains with ``|`` or ``,``).

        Returns:
            The decoded JSON body on success. When the request fails or the
            body cannot be decoded, a dict with the keys ``success``
            (False), ``error`` (the exception text) and ``code`` (the HTTP
            status code, or None when no response was received).
        """
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json'
        }
        payload = {
            'query': query,
            'count': count,
            'freshness': freshness,
            'summary': summary
        }
        # Optional filters are only sent when the caller supplied them.
        if include:
            payload['include'] = include
        if exclude:
            payload['exclude'] = exclude

        # Bound before the try block so the handler can read the status code
        # without inspecting locals().
        response = None
        try:
            response = requests.post(
                self.endpoint,
                headers=headers,
                data=json.dumps(payload),
                timeout=30
            )
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException subclass.
            return {
                'success': False,
                'error': str(e),
                'code': getattr(response, 'status_code', None)
            }

    def parse_news_items(self, search_result: Dict) -> List[Dict]:
        """Flatten a raw search response into a list of news dictionaries.

        Args:
            search_result: Value returned by ``search_news``.

        Returns:
            One dict per entry under ``data.webPages.value`` with the keys
            title, url, snippet, summary, site_name, site_icon,
            published_at (datetime or None), display_url and language.
            An empty list when that path is missing, null or malformed,
            which includes the error dict produced by ``search_news``.
        """
        if not isinstance(search_result, dict):
            return []
        data = search_result.get('data')
        if not isinstance(data, dict):
            return []
        web_pages = data.get('webPages')
        if not isinstance(web_pages, dict):
            return []
        entries = web_pages.get('value')
        if not isinstance(entries, (list, tuple)):
            return []

        return [
            {
                'title': entry.get('name', ''),
                'url': entry.get('url', ''),
                'snippet': entry.get('snippet', ''),
                'summary': entry.get('summary', ''),
                'site_name': entry.get('siteName', ''),
                'site_icon': entry.get('siteIcon', ''),
                'published_at': self._parse_date(entry.get('datePublished')),
                'display_url': entry.get('displayUrl', ''),
                'language': entry.get('language', ''),
            }
            for entry in entries
            if isinstance(entry, dict)  # skip malformed entries
        ]

    @staticmethod
    def _extract_domain(site_url: Optional[str]) -> Optional[str]:
        """Return the bare host name of *site_url*, or None if there is none.

        The port, credentials and path are dropped, and a single leading
        ``www.`` label is removed. URLs given without a scheme
        (``example.com/path``) are accepted.
        """
        from urllib.parse import urlparse

        if not isinstance(site_url, str):
            return None
        candidate = site_url.strip()
        if not candidate:
            return None
        # Without a scheme urlparse puts everything into .path; prefixing
        # "//" makes it treat the leading part as the network location.
        if '://' not in candidate and not candidate.startswith('//'):
            candidate = '//' + candidate
        try:
            host = urlparse(candidate).hostname
        except ValueError:
            # e.g. an unbalanced IPv6 bracket
            return None
        if not host:
            return None
        # Only strip "www." as a prefix; a plain replace() would also
        # mangle hosts that merely contain that text.
        if host.startswith('www.'):
            host = host[len('www.'):]
        return host or None

    def search_site_news(
        self,
        site_name: str,
        site_url: Optional[str] = None,
        count: int = 10,
        freshness: str = 'oneMonth'
    ) -> List[Dict]:
        """Search for news that mentions a specific website.

        Args:
            site_name: Name of the website; used as the search keyword.
            site_url: Optional URL of the website. When given, its domain
                is passed as ``exclude`` so the site's own pages are left
                out of the results.
            count: Number of results to request.
            freshness: Time range, passed through to ``search_news``.

        Returns:
            Parsed news items as produced by ``parse_news_items``; an empty
            list when the search failed or returned nothing.
        """
        # The name is wrapped in double quotes and followed by the word for
        # "news". Embedded quotes in site_name are passed through unchanged.
        query = f'"{site_name}" 新闻'

        exclude = self._extract_domain(site_url) if site_url else None

        search_result = self.search_news(
            query=query,
            count=count,
            freshness=freshness,
            summary=True,
            exclude=exclude
        )
        return self.parse_news_items(search_result)

    def _parse_date(self, date_str: Optional[str]) -> Optional[datetime]:
        """Parse an ISO 8601 timestamp such as 2025-02-23T08:18:30+08:00.

        Args:
            date_str: Timestamp text; a trailing ``Z`` is read as UTC.

        Returns:
            A datetime (timezone-aware when the text carries an offset,
            naive otherwise), or None when the value is empty, not a
            string, or cannot be parsed.
        """
        if not date_str or not isinstance(date_str, str):
            return None
        text = date_str.strip()
        # fromisoformat() before Python 3.11 rejects the "Z" suffix.
        if text.endswith(('Z', 'z')):
            text = text[:-1] + '+00:00'
        try:
            # Handles positive and negative offsets as well as the plain
            # YYYY-MM-DDTHH:MM:SS form.
            return datetime.fromisoformat(text)
        except ValueError:
            return None

    def format_news_for_display(self, news_items: List[Dict]) -> List[Dict]:
        """Reshape parsed news items for front-end display.

        Args:
            news_items: Items as returned by ``parse_news_items``.

        Returns:
            A list of dicts with the keys title, url, description (the
            summary, or the snippet when the summary is empty), source,
            published_date (YYYY-MM-DD text) and icon.

        Raises:
            KeyError: If an item has no ``title`` or ``url`` key.
        """
        return [
            {
                'title': item['title'],
                'url': item['url'],
                'description': item.get('summary') or item.get('snippet') or '',
                # parse_news_items stores '' for a missing site name, so the
                # placeholder has to replace empty values, not just absent keys.
                'source': item.get('site_name') or '未知来源',
                'published_date': self._format_date(item.get('published_at')),
                'icon': item.get('site_icon', '')
            }
            for item in news_items
        ]

    def _format_date(self, dt: Optional[datetime]) -> str:
        """Format a datetime as YYYY-MM-DD for display.

        Args:
            dt: The datetime to format, or None.

        Returns:
            The formatted date, or the placeholder text '未知日期' when *dt*
            is empty or cannot be formatted.
        """
        if not dt:
            return '未知日期'
        try:
            return dt.strftime('%Y-%m-%d')
        except (AttributeError, TypeError, ValueError):
            # Display helper: fall back to the placeholder instead of raising.
            return '未知日期'
# Manual smoke test: run this module directly to query the live API.
if __name__ == '__main__':
    import os
    import sys

    from dotenv import load_dotenv

    load_dotenv()

    # Read the API key from the environment (load_dotenv() was called above).
    api_key = os.environ.get('BOCHA_API_KEY')
    if not api_key:
        print("错误未设置BOCHA_API_KEY环境变量")
        # sys.exit instead of the bare exit() builtin, which is only
        # injected by the site module and is meant for interactive use.
        sys.exit(1)

    searcher = NewsSearcher(api_key)

    print("正在搜索ChatGPT 最新新闻...")
    news_items = searcher.search_site_news(
        site_name='ChatGPT',
        count=5,
        freshness='oneWeek'
    )

    print(f"\n找到 {len(news_items)} 条新闻:\n")
    for i, news in enumerate(news_items, 1):
        # 'summary' is always present in parsed items (possibly empty), so
        # a dict.get default would never fall back to the snippet.
        blurb = news.get('summary') or news.get('snippet') or ''
        print(f"{i}. {news['title']}")
        print(f" 来源:{news['site_name']}")
        print(f" 日期:{searcher._format_date(news['published_at'])}")
        print(f" URL{news['url']}")
        print(f" 摘要:{blurb[:100]}...")
        print()