- 前台页面全面升级为Tailwind CSS框架 - 引入Google Fonts (Space Grotesk, Noto Sans) - 主色调更新为#25c0f4 (cyan blue) - 实现玻璃态效果和渐变背景 - 优化首页网格卡片布局和悬停动画 - 优化详情页双栏布局和渐变Logo光晕 - 优化管理员登录页,添加科技网格背景 - Flask-Admin后台完整深色主题 - 统一Material Symbols图标系统 - 网站自动抓取功能界面优化 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
169 lines
5.2 KiB
Python
169 lines
5.2 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
网站信息抓取工具
|
||
"""
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
from urllib.parse import urljoin, urlparse
|
||
import os
|
||
from PIL import Image
|
||
from io import BytesIO
|
||
|
||
class WebsiteFetcher:
    """Scrapes basic metadata (title, description, logo) from a website.

    All network access goes through ``requests`` with a configurable
    timeout and a fixed desktop User-Agent header.
    """

    def __init__(self, timeout=10):
        # Seconds before each HTTP request is aborted.
        self.timeout = timeout
        # A desktop UA avoids bot blocking on sites that reject the
        # default python-requests User-Agent.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def fetch_website_info(self, url):
        """Fetch a page and extract its display metadata.

        Args:
            url: Website URL; ``https://`` is prepended when no scheme is given.

        Returns:
            dict: keys ``name``, ``description``, ``logo_url`` (values may be
            empty strings), or ``None`` on any network/parse failure.
        """
        try:
            # Ensure the URL carries a scheme so requests accepts it.
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                allow_redirects=True,
            )
            response.raise_for_status()
            # Sites frequently mis-declare their charset; let requests sniff
            # the real one so non-ASCII pages decode correctly.
            response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')

            return {
                'name': self._extract_title(soup),
                'description': self._extract_description(soup),
                'logo_url': self._extract_logo(soup, url),
            }

        except Exception as e:
            # Best-effort scraper: swallow errors and signal failure with
            # None instead of propagating to the caller.
            print(f"抓取网站信息失败: {str(e)}")
            return None

    def _extract_title(self, soup):
        """Return the page title, preferring Open Graph metadata over <title>."""
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        return ''

    def _extract_description(self, soup):
        """Return the page description (og:description > meta description > keywords)."""
        og_desc = soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        # Last resort: keywords are better than nothing on older sites.
        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            return meta_keywords['content'].strip()

        return ''

    def _extract_logo(self, soup, base_url):
        """Return an absolute logo URL for the page.

        Tries, in order: og:image, any ``<link rel="*icon*">``,
        apple-touch-icon, and finally the conventional ``/favicon.ico``.
        """
        logo_url = None

        # 1. Open Graph image (usually the highest quality candidate).
        og_image = soup.find('meta', property='og:image')
        if og_image and og_image.get('content'):
            logo_url = og_image['content']

        # 2. <link rel="icon"> / "shortcut icon"; rel may be a string or a
        #    list of tokens depending on the parser.
        if not logo_url:
            icon_link = soup.find(
                'link',
                rel=lambda x: x and (
                    'icon' in x.lower() if isinstance(x, str)
                    else 'icon' in ' '.join(x).lower()
                ),
            )
            if icon_link and icon_link.get('href'):
                logo_url = icon_link['href']

        # 3. apple-touch-icon fallback.
        if not logo_url:
            apple_icon = soup.find('link', rel='apple-touch-icon')
            if apple_icon and apple_icon.get('href'):
                logo_url = apple_icon['href']

        # 4. Conventional favicon location.
        if not logo_url:
            logo_url = '/favicon.ico'

        # Resolve relative paths against the page URL.
        if logo_url:
            logo_url = urljoin(base_url, logo_url)

        return logo_url

    def download_logo(self, logo_url, save_dir='static/uploads'):
        """Download a logo image and store it under *save_dir*.

        Args:
            logo_url: Absolute URL of the image; falsy values return None.
            save_dir: Directory (created if missing) to save into.

        Returns:
            str: web-style relative path ("/<save_dir>/logo_<domain><ext>")
            suitable for database storage, or None on failure or when the
            response is not an image.
        """
        if not logo_url:
            return None

        try:
            os.makedirs(save_dir, exist_ok=True)

            response = requests.get(logo_url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            # Reject non-image payloads (e.g. an HTML error page served with
            # status 200).  Strip any ";charset=..." parameter and normalize
            # case before checking — a bare startswith() on the raw header
            # would wrongly reject e.g. "Image/png; charset=UTF-8".
            content_type = response.headers.get('content-type', '')
            media_type = content_type.split(';', 1)[0].strip().lower()
            if not media_type.startswith('image/'):
                return None

            # Derive an extension from the URL path; lowercase it so ".PNG"
            # and ".png" map to the same file, and fall back to .png for
            # missing or implausibly long extensions.
            parsed_url = urlparse(logo_url)
            ext = os.path.splitext(parsed_url.path)[1].lower()
            if not ext or len(ext) > 5:
                ext = '.png'

            # Name the file after the host so repeated fetches overwrite
            # rather than accumulate.
            domain = parsed_url.netloc.replace(':', '_').replace('.', '_')
            filename = f"logo_{domain}{ext}"
            filepath = os.path.join(save_dir, filename)

            with open(filepath, 'wb') as f:
                f.write(response.content)

            # Store with forward slashes so the path works as a URL on
            # Windows as well.
            return f'/{filepath.replace(os.sep, "/")}'

        except Exception as e:
            print(f"下载Logo失败: {str(e)}")
            return None
|