Files
zjpb.net/fetch_news_cron.py
Jowe d7d21e19c9 release: v2.2.0 - 博查新闻搜索功能
新增功能:
- 集成博查Web Search API,自动获取网站相关新闻
- News模型添加source_name和source_icon字段
- 新闻管理后台界面优化
- 网站详情页新闻展示(标题、摘要、来源、链接)
- 定期任务脚本支持批量获取新闻
- 完整的API路由和测试脚本

技术实现:
- NewsSearcher工具类封装博查API
- 智能新闻搜索和去重机制
- 数据库迁移脚本migrate_news_fields.py
- API路由:/api/fetch-site-news 和 /api/fetch-all-news
- Cron任务脚本:fetch_news_cron.py

修改文件:
- config.py: 添加博查API配置
- models.py: News模型扩展
- app.py: 新闻获取路由和NewsAdmin优化
- templates/detail_new.html: 新闻展示UI

新增文件:
- utils/news_searcher.py (271行)
- migrate_news_fields.py (99行)
- fetch_news_cron.py (167行)
- test_news_feature.py (142行)
- NEWS_FEATURE_v2.2.md (408行)

统计:9个文件,1348行新增

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 22:04:35 +08:00

168 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
定期新闻获取任务脚本
用途:定期为网站批量获取最新新闻
使用python fetch_news_cron.py [options]
可以通过crontab定时执行
# 每天早上8点执行获取10个网站的新闻
0 8 * * * cd /path/to/zjpb && /path/to/venv/bin/python fetch_news_cron.py --limit 10 >> logs/news_fetch.log 2>&1
"""
import os
import sys
import argparse
from datetime import datetime
from dotenv import load_dotenv
# 加载环境变量
load_dotenv()
# 添加项目根目录到Python路径
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from app import create_app
from models import db, Site, News
from utils.news_searcher import NewsSearcher
def fetch_news_for_sites(limit=10, count_per_site=5, freshness='oneMonth'):
    """
    Fetch news for active sites in bulk via the Bocha search API.

    Args:
        limit: maximum number of sites to process in this run
        count_per_site: number of news items to request per site
        freshness: time range of the news (e.g. 'oneDay', 'oneWeek', 'oneMonth')

    Returns:
        bool: True when the batch run completed; False when it could not start
        (missing BOCHA_API_KEY or no active sites).
    """
    # Create the Flask application context so config and the DB session work
    app = create_app(os.getenv('FLASK_ENV', 'production'))
    with app.app_context():
        # Bail out early if the Bocha API is not configured
        api_key = app.config.get('BOCHA_API_KEY')
        if not api_key:
            print(f"[{datetime.now()}] 错误未配置BOCHA_API_KEY")
            return False

        # Active sites, oldest-updated first, so stale sites are refreshed first
        sites = Site.query.filter_by(is_active=True).order_by(Site.updated_at).limit(limit).all()
        if not sites:
            print(f"[{datetime.now()}] 没有可处理的网站")
            return False

        print(f"[{datetime.now()}] 开始批量获取新闻,共 {len(sites)} 个网站")
        print(f"配置:每个网站 {count_per_site} 条新闻,时间范围:{freshness}")
        print("-" * 60)

        # One searcher instance is reused for every site
        searcher = NewsSearcher(api_key)

        # Run statistics
        total_saved = 0
        total_found = 0
        success_count = 0
        error_count = 0

        # Fetch and persist news for each site independently
        for i, site in enumerate(sites, 1):
            print(f"[{i}/{len(sites)}] 处理网站: {site.name}")
            try:
                # Query the search API for this site
                news_items = searcher.search_site_news(
                    site_name=site.name,
                    site_url=site.url,
                    count=count_per_site,
                    freshness=freshness
                )
                if not news_items:
                    print(f" └─ 未找到新闻")
                    continue

                site_saved = 0
                for item in news_items:
                    # Deduplicate on (site_id, url) before inserting
                    existing_news = News.query.filter_by(
                        site_id=site.id,
                        url=item['url']
                    ).first()
                    if not existing_news:
                        news = News(
                            site_id=site.id,
                            title=item['title'],
                            # prefer 'summary'; fall back to 'snippet'
                            content=item.get('summary') or item.get('snippet', ''),
                            url=item['url'],
                            source_name=item.get('site_name', ''),
                            source_icon=item.get('site_icon', ''),
                            published_at=item.get('published_at'),
                            news_type='Search Result',
                            is_active=True
                        )
                        db.session.add(news)
                        site_saved += 1

                # Commit per site so one failure doesn't lose earlier sites' data
                db.session.commit()
                total_found += len(news_items)
                total_saved += site_saved
                success_count += 1
                print(f" └─ 找到 {len(news_items)} 条,保存 {site_saved} 条新闻")
            except Exception as e:
                # Roll back this site's pending rows and continue with the next
                error_count += 1
                print(f" └─ 错误: {str(e)}")
                db.session.rollback()
                continue

        print("-" * 60)
        print(f"[{datetime.now()}] 批量获取完成")
        print(f"成功: {success_count} 个网站, 失败: {error_count} 个网站")
        print(f"共找到 {total_found} 条新闻,保存 {total_saved} 条新新闻")
        print("=" * 60)
        return True
def main():
    """CLI entry point: parse arguments, run the batch fetch, exit 0/1."""
    parser = argparse.ArgumentParser(description='定期新闻获取任务')
    parser.add_argument('--limit', type=int, default=10, help='处理的网站数量限制默认10')
    parser.add_argument('--count', type=int, default=5, help='每个网站获取的新闻数量默认5')
    parser.add_argument('--freshness', type=str, default='oneMonth',
                        choices=['noLimit', 'oneDay', 'oneWeek', 'oneMonth', 'oneYear'],
                        help='新闻时间范围默认oneMonth')
    args = parser.parse_args()

    print("=" * 60)
    print(f"定期新闻获取任务 - 开始时间: {datetime.now()}")
    print("=" * 60)

    try:
        success = fetch_news_for_sites(
            limit=args.limit,
            count_per_site=args.count,
            freshness=args.freshness
        )
        # Exit code mirrors the batch result so cron/monitoring can detect failures
        if success:
            print("\n任务执行成功!")
            sys.exit(0)
        else:
            print("\n任务执行失败!")
            sys.exit(1)
    except Exception as e:
        # Last-resort handler: log the traceback and signal failure to cron
        print(f"\n[{datetime.now()}] 严重错误: {str(e)}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()