Files
zjpb.net/utils/news_searcher.py
Jowe fdde6990fb feat: v2.3.0 - 新闻获取准确性优化
核心改进:
1. 新增专用新闻关键词字段(sites.news_keywords)
2. 严格匹配搜索策略(双引号包裹关键词)
3. 前台手动刷新新闻功能

数据库变更:
- Sites表添加news_keywords字段(VARCHAR(200))
- 提供迁移脚本migrate_news_keywords.py

代码变更:
- models.py: Site模型添加news_keywords字段
- app.py: 后台表单配置、API路由、search_site_news调用优化
- utils/news_searcher.py: 支持news_keywords参数优先匹配
- templates/detail_new.html: 添加刷新按钮和JavaScript

新增功能:
- 后台可为每个网站设置专属新闻关键词
- 详情页"获取最新资讯"按钮(前台可用,无需登录)
- 新API端点:POST /api/refresh-site-news/<site_code>

文档:
- DEPLOY_v2.3.0.md: 完整部署指南
- DEPLOY_v2.3_QUICK.md: 快速部署指南

向后兼容:
- 现有网站自动使用网站名称作为默认关键词
- 未设置关键词时降级使用网站名称搜索

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-31 11:01:51 +08:00

280 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
新闻搜索工具 - 使用博查 Web Search API
"""
import requests
import json
from datetime import datetime
from typing import List, Dict, Optional
class NewsSearcher:
    """Client for news lookups through the Bocha Web Search API.

    The instance stores ``api_key`` and ``base_url`` and derives ``endpoint``,
    the URL every search request is POSTed to.
    """

    def __init__(self, api_key: str, base_url: str = 'https://api.bocha.cn'):
        """Store the credentials and derive the search endpoint.

        Args:
            api_key: Bocha API key, sent as a Bearer token.
            base_url: Base URL of the API; a trailing slash is tolerated.
        """
        self.api_key = api_key
        self.base_url = base_url
        # Strip trailing slashes so "https://host/" does not turn into
        # "https://host//v1/web-search".
        self.endpoint = f"{base_url.rstrip('/')}/v1/web-search"

    def search_news(
        self,
        query: str,
        count: int = 10,
        freshness: str = 'oneMonth',
        summary: bool = True,
        include: Optional[str] = None,
        exclude: Optional[str] = None
    ) -> Dict:
        """POST a search request and return the decoded JSON response.

        Args:
            query: Search keywords.
            count: Number of results to request (documented range: 1-50).
            freshness: Time window; one of noLimit / oneDay / oneWeek /
                oneMonth / oneYear.
            summary: Whether the API should include a summary per result.
            include: Restrict the search to these sites (several domains
                separated by ``|`` or ``,``). Omitted from the request when
                empty.
            exclude: Exclude these sites (same separator rules). Omitted from
                the request when empty.

        Returns:
            The decoded JSON body on success. On a transport error, an HTTP
            error status or an undecodable body, a dict of the form
            ``{'success': False, 'error': <message>, 'code': <status or None>}``.
        """
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json'
        }
        payload = {
            'query': query,
            'count': count,
            'freshness': freshness,
            'summary': summary
        }
        # Optional filters are only sent when they carry a value.
        if include:
            payload['include'] = include
        if exclude:
            payload['exclude'] = exclude

        # Bound before the try so the handler can tell "no response at all"
        # (connection error, timeout) from "response with an error status".
        response = None
        try:
            response = requests.post(
                self.endpoint,
                headers=headers,
                data=json.dumps(payload),
                timeout=30
            )
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            return {
                'success': False,
                'error': str(e),
                'code': getattr(response, 'status_code', None)
            }

    def parse_news_items(self, search_result: Dict) -> List[Dict]:
        """Convert a raw search response into a list of news dicts.

        Args:
            search_result: Value returned by ``search_news``. The news entries
                are read from ``search_result['data']['webPages']['value']``.

        Returns:
            One dict per entry with the keys ``title``, ``url``, ``snippet``,
            ``summary``, ``site_name``, ``site_icon``, ``published_at``
            (``datetime`` or ``None``), ``display_url`` and ``language``.
            An empty list when the expected structure is absent, which
            includes the error dict produced by ``search_news``.
        """
        # Walk the nesting defensively: any level may be missing or null.
        if not isinstance(search_result, dict):
            return []
        data = search_result.get('data')
        if not isinstance(data, dict):
            return []
        web_pages = data.get('webPages')
        if not isinstance(web_pages, dict):
            return []
        entries = web_pages.get('value')
        if not isinstance(entries, (list, tuple)):
            return []

        news_items = []
        for item in entries:
            if not isinstance(item, dict):
                # Skip malformed entries instead of failing the whole batch.
                continue
            news_items.append({
                'title': item.get('name', ''),
                'url': item.get('url', ''),
                'snippet': item.get('snippet', ''),
                'summary': item.get('summary', ''),
                'site_name': item.get('siteName', ''),
                'site_icon': item.get('siteIcon', ''),
                'published_at': self._parse_date(item.get('datePublished')),
                'display_url': item.get('displayUrl', ''),
                'language': item.get('language', ''),
            })
        return news_items

    def search_site_news(
        self,
        site_name: str,
        site_url: Optional[str] = None,
        news_keywords: Optional[str] = None,
        count: int = 10,
        freshness: str = 'oneMonth'
    ) -> List[Dict]:
        """Search for news about one site and return the parsed items.

        Args:
            site_name: Site name; used as the search term when
                ``news_keywords`` is empty.
            site_url: Optional site URL. Its host name is passed as the
                ``exclude`` filter so the site's own pages are left out.
            news_keywords: Dedicated news keywords; take priority over
                ``site_name`` when non-blank.
            count: Number of results to request.
            freshness: Time window, see ``search_news``.

        Returns:
            The list produced by ``parse_news_items`` (empty on failure).
        """
        # The term is wrapped in double quotes to request a strict match;
        # dedicated keywords win over the plain site name.
        keywords = news_keywords.strip() if news_keywords else ''
        if keywords:
            query = f'"{keywords}" 新闻'
        else:
            query = f'"{site_name}" 新闻'

        exclude = self._extract_domain(site_url) if site_url else None

        search_result = self.search_news(
            query=query,
            count=count,
            freshness=freshness,
            summary=True,
            exclude=exclude
        )
        return self.parse_news_items(search_result)

    @staticmethod
    def _extract_domain(site_url: Optional[str]) -> Optional[str]:
        """Return the bare host name of *site_url*, or None if there is none.

        Port, credentials and path are dropped, and only a leading ``www.``
        is removed. URLs without a scheme (``example.com/about``) are accepted.
        """
        from urllib.parse import urlparse

        if not isinstance(site_url, str):
            return None
        candidate = site_url.strip()
        if not candidate:
            return None
        if '://' not in candidate:
            # Without a scheme urlparse puts the host into ``path``; a leading
            # "//" makes it parse the host as the network location instead.
            candidate = '//' + candidate.lstrip('/')
        try:
            host = urlparse(candidate).hostname
        except ValueError:
            # Best effort: an unparsable URL simply means "exclude nothing".
            return None
        if not host:
            return None
        if host.startswith('www.'):
            # Remove the prefix only; str.replace would also eat "www."
            # occurring in the middle of a host name.
            host = host[len('www.'):]
        return host or None

    def _parse_date(self, date_str: Optional[str]) -> Optional[datetime]:
        """Parse an ISO 8601 timestamp such as ``2025-02-23T08:18:30+08:00``.

        Args:
            date_str: Timestamp text; positive or negative UTC offsets, a
                trailing ``Z`` and offset-less values are all accepted.

        Returns:
            The parsed ``datetime`` (timezone-aware when the text carries an
            offset), or ``None`` when the value is empty, not a string or not
            parseable.
        """
        if not date_str or not isinstance(date_str, str):
            return None
        text = date_str.strip()
        if text.endswith(('Z', 'z')):
            # fromisoformat rejects the "Z" suffix before Python 3.11.
            text = text[:-1] + '+00:00'
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            return None

    def format_news_for_display(self, news_items: List[Dict]) -> List[Dict]:
        """Reshape parsed news items into the structure used for display.

        Args:
            news_items: Items as produced by ``parse_news_items``. Each item
                must contain ``title`` and ``url``.

        Returns:
            One dict per item with the keys ``title``, ``url``,
            ``description`` (summary, else snippet), ``source``,
            ``published_date`` and ``icon``.

        Raises:
            KeyError: If an item lacks ``title`` or ``url``.
        """
        return [
            {
                'title': item['title'],
                'url': item['url'],
                'description': item.get('summary') or item.get('snippet') or '',
                # ``or`` rather than a .get default: parsed items always carry
                # the key, possibly with an empty value.
                'source': item.get('site_name') or '未知来源',
                'published_date': self._format_date(item.get('published_at')),
                'icon': item.get('site_icon', '')
            }
            for item in news_items
        ]

    def _format_date(self, dt: Optional[datetime]) -> str:
        """Format *dt* as ``YYYY-MM-DD`` for display.

        Args:
            dt: The datetime to format, or ``None``.

        Returns:
            The formatted date, or the placeholder ``'未知日期'`` when *dt* is
            empty or cannot be formatted.
        """
        if not dt:
            return '未知日期'
        try:
            return dt.strftime('%Y-%m-%d')
        except (AttributeError, TypeError, ValueError):
            return '未知日期'
# Manual smoke test: run this module directly to query the live API.
if __name__ == '__main__':
    import os
    import sys

    from dotenv import load_dotenv

    load_dotenv()

    # The API key is read from the environment after load_dotenv() has run.
    api_key = os.environ.get('BOCHA_API_KEY')
    if not api_key:
        print("错误未设置BOCHA_API_KEY环境变量")
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is absent e.g. under `python -S`.
        sys.exit(1)

    searcher = NewsSearcher(api_key)

    # Sample query: last week's news for "ChatGPT".
    print("正在搜索ChatGPT 最新新闻...")
    news_items = searcher.search_site_news(
        site_name='ChatGPT',
        count=5,
        freshness='oneWeek'
    )

    print(f"\n找到 {len(news_items)} 条新闻:\n")
    for i, news in enumerate(news_items, 1):
        # parse_news_items always sets 'summary' (possibly empty), so fall
        # back to the snippet with `or` rather than a .get default.
        preview = news.get('summary') or news.get('snippet') or ''
        print(f"{i}. {news['title']}")
        print(f" 来源:{news['site_name']}")
        print(f" 日期:{searcher._format_date(news['published_at'])}")
        print(f" URL:{news['url']}")
        print(f" 摘要:{preview[:100]}...")
        print()