Files
zjpb.net/utils/website_fetcher.py
Jowe 2fbca6ebc7 feat: 完成全站UI优化 - 科技感/未来风设计
- 前台页面全面升级为Tailwind CSS框架
- 引入Google Fonts (Space Grotesk, Noto Sans)
- 主色调更新为#25c0f4 (cyan blue)
- 实现玻璃态效果和渐变背景
- 优化首页网格卡片布局和悬停动画
- 优化详情页双栏布局和渐变Logo光晕
- 优化管理员登录页,添加科技网格背景
- Flask-Admin后台完整深色主题
- 统一Material Symbols图标系统
- 网站自动抓取功能界面优化

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-27 22:45:09 +08:00

169 lines
5.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
网站信息抓取工具
"""
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
from PIL import Image
from io import BytesIO
class WebsiteFetcher:
    """Best-effort fetcher for basic website metadata (title, description, logo).

    All public methods swallow errors and return ``None`` on failure rather
    than raising, so callers can treat a missing result as "could not fetch".
    """

    # Maps common image content-types to file extensions. Used by
    # download_logo() when the logo URL itself carries no usable extension.
    _CONTENT_TYPE_EXT = {
        'image/png': '.png',
        'image/jpeg': '.jpg',
        'image/gif': '.gif',
        'image/webp': '.webp',
        'image/svg+xml': '.svg',
        'image/x-icon': '.ico',
        'image/vnd.microsoft.icon': '.ico',
    }

    def __init__(self, timeout=10):
        """
        Args:
            timeout: per-request timeout in seconds for all HTTP calls.
        """
        self.timeout = timeout
        # A desktop UA: some sites serve stripped/blocked pages to unknown agents.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def fetch_website_info(self, url):
        """Fetch a site's title, description and logo URL.

        Args:
            url: website URL; the scheme is optional (https:// is assumed).

        Returns:
            dict with keys ``name``, ``description``, ``logo_url``
            (each possibly empty), or ``None`` on any failure.
        """
        try:
            # Ensure the URL carries a scheme; default to HTTPS.
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                allow_redirects=True,
            )
            response.raise_for_status()
            # Let requests sniff the real encoding — many sites (especially
            # CJK ones) mislabel or omit the charset in their headers.
            response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')
            return {
                'name': self._extract_title(soup),
                'description': self._extract_description(soup),
                'logo_url': self._extract_logo(soup, url),
            }
        except Exception as e:
            # Best-effort contract: report and signal failure with None.
            print(f"抓取网站信息失败: {str(e)}")
            return None

    def _extract_title(self, soup):
        """Return the page title: og:title first, then <title>, else ''."""
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()
        return ''

    def _extract_description(self, soup):
        """Return the page description.

        Preference order: og:description, meta description, then meta
        keywords as a last-resort fallback; '' if none are present.
        """
        og_desc = soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            return meta_keywords['content'].strip()
        return ''

    def _extract_logo(self, soup, base_url):
        """Return an absolute logo URL for the page.

        Preference order: og:image, any <link rel="...icon...">,
        apple-touch-icon, then the conventional /favicon.ico.
        """
        logo_url = None

        # 1. og:image
        og_image = soup.find('meta', property='og:image')
        if og_image and og_image.get('content'):
            logo_url = og_image['content']

        # 2. <link rel="icon"> / "shortcut icon" — rel may be parsed as a
        #    string or a list of tokens depending on the parser, so match both.
        if not logo_url:
            icon_link = soup.find('link', rel=lambda x: x and ('icon' in x.lower() if isinstance(x, str) else 'icon' in ' '.join(x).lower()))
            if icon_link and icon_link.get('href'):
                logo_url = icon_link['href']

        # 3. apple-touch-icon
        if not logo_url:
            apple_icon = soup.find('link', rel='apple-touch-icon')
            if apple_icon and apple_icon.get('href'):
                logo_url = apple_icon['href']

        # 4. Fall back to the conventional favicon location.
        if not logo_url:
            logo_url = '/favicon.ico'

        # Resolve relative references against the page URL.
        if logo_url:
            logo_url = urljoin(base_url, logo_url)
        return logo_url

    def download_logo(self, logo_url, save_dir='static/uploads'):
        """Download a logo image and store it under *save_dir*.

        Args:
            logo_url: absolute URL of the logo image.
            save_dir: directory to save into (created if missing).

        Returns:
            str: web-style relative path to the saved file (leading '/',
            forward slashes) for database storage, or ``None`` on failure
            or when the response is not an image.
        """
        if not logo_url:
            return None
        try:
            os.makedirs(save_dir, exist_ok=True)

            response = requests.get(logo_url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            # Reject non-image responses. Normalize case and strip any
            # parameters ("image/png; charset=utf-8") before checking —
            # the raw prefix test would wrongly reject e.g. "IMAGE/PNG".
            content_type = response.headers.get('content-type', '').split(';')[0].strip().lower()
            if not content_type.startswith('image/'):
                return None

            parsed_url = urlparse(logo_url)
            ext = os.path.splitext(parsed_url.path)[1].lower()
            if not ext or len(ext) > 5:
                # No usable extension in the URL: derive one from the
                # Content-Type instead of blindly assuming PNG.
                ext = self._CONTENT_TYPE_EXT.get(content_type, '.png')

            # Use the (sanitized) domain as a stable filename; a re-fetch
            # for the same site deliberately overwrites the old logo.
            domain = parsed_url.netloc.replace(':', '_').replace('.', '_')
            filename = f"logo_{domain}{ext}"
            filepath = os.path.join(save_dir, filename)

            with open(filepath, 'wb') as f:
                f.write(response.content)

            # Web-style relative path (used for database storage).
            return f'/{filepath.replace(os.sep, "/")}'
        except Exception as e:
            print(f"下载Logo失败: {str(e)}")
            return None