# -*- coding: utf-8 -*-
"""Website metadata fetching tool.

Scrapes a site's title, description and logo URL, and can download the
logo image into local storage for later serving.
"""
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
from PIL import Image
from io import BytesIO


# Maps an image Content-Type to a file extension.  Used as a fallback in
# download_logo() when the logo URL's path carries no usable extension,
# so e.g. an ICO or SVG is not silently saved with a ".png" suffix.
_CONTENT_TYPE_EXT = {
    'image/png': '.png',
    'image/jpeg': '.jpg',
    'image/gif': '.gif',
    'image/webp': '.webp',
    'image/svg+xml': '.svg',
    'image/x-icon': '.ico',
    'image/vnd.microsoft.icon': '.ico',
}


class WebsiteFetcher:
    """Fetches basic metadata (name, description, logo) from a website."""

    def __init__(self, timeout=10):
        # Per-request timeout (seconds) for both the page and logo downloads.
        self.timeout = timeout
        # Desktop-browser User-Agent so sites don't serve a bot-blocked page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def fetch_website_info(self, url):
        """Fetch a website's metadata.

        Args:
            url: Website URL; an ``https://`` scheme is prepended if the
                URL carries no scheme.

        Returns:
            dict: ``{'name', 'description', 'logo_url'}`` on success,
            or ``None`` if the request or parsing fails for any reason.
        """
        try:
            # Ensure the URL carries a scheme before requesting it.
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                allow_redirects=True,
            )
            response.raise_for_status()
            # Auto-detect encoding: many pages mis-declare their charset.
            response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')

            return {
                'name': self._extract_title(soup),
                'description': self._extract_description(soup),
                'logo_url': self._extract_logo(soup, url),
            }

        except Exception as e:
            # Best-effort scraper: swallow any error and signal with None.
            print(f"抓取网站信息失败: {str(e)}")
            return None

    def _extract_title(self, soup):
        """Return the site title: og:title first, then <title>, else ''."""
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        return ''

    def _extract_description(self, soup):
        """Return the site description.

        Tries, in order: og:description, <meta name="description">, and
        <meta name="keywords"> as a last-resort fallback; '' if none.
        """
        og_desc = soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            return meta_keywords['content'].strip()

        return ''

    def _extract_logo(self, soup, base_url):
        """Return an absolute URL for the site's logo.

        Tries, in order: og:image, any <link rel> containing "icon"
        (covers "icon" / "shortcut icon"), <link rel="apple-touch-icon">,
        and finally the conventional ``/favicon.ico``.  The chosen value
        is resolved against *base_url* into an absolute URL.
        """
        logo_url = None

        # 1. og:image
        og_image = soup.find('meta', property='og:image')
        if og_image and og_image.get('content'):
            logo_url = og_image['content']

        # 2. <link rel="icon"> / "shortcut icon".  BeautifulSoup may expose
        #    rel as a plain string or as a list of tokens; handle both.
        if not logo_url:
            icon_link = soup.find(
                'link',
                rel=lambda x: x and (
                    'icon' in x.lower() if isinstance(x, str)
                    else 'icon' in ' '.join(x).lower()
                ),
            )
            if icon_link and icon_link.get('href'):
                logo_url = icon_link['href']

        # 3. apple-touch-icon
        if not logo_url:
            apple_icon = soup.find('link', rel='apple-touch-icon')
            if apple_icon and apple_icon.get('href'):
                logo_url = apple_icon['href']

        # 4. Conventional default location.
        if not logo_url:
            logo_url = '/favicon.ico'

        # Always truthy at this point; resolve relative paths against the page.
        return urljoin(base_url, logo_url)

    def download_logo(self, logo_url, save_dir='static/uploads'):
        """Download a logo image and save it under *save_dir*.

        Args:
            logo_url: Absolute URL of the logo image.
            save_dir: Directory to save into; created if missing.

        Returns:
            str: Web-style relative path (``/<save_dir>/<filename>``) for
            database storage, or ``None`` on any failure or if the
            response is not an image.
        """
        if not logo_url:
            return None

        try:
            os.makedirs(save_dir, exist_ok=True)

            response = requests.get(
                logo_url, headers=self.headers, timeout=self.timeout
            )
            response.raise_for_status()

            # Reject non-image responses (e.g. an HTML error page).
            content_type = response.headers.get('content-type', '')
            if not content_type.startswith('image/'):
                return None

            parsed_url = urlparse(logo_url)
            ext = os.path.splitext(parsed_url.path)[1]
            if not ext or len(ext) > 5:
                # No usable extension in the URL path: derive one from the
                # Content-Type (strip any ";charset=..." parameters) so an
                # ICO/SVG/WebP logo is not mislabeled; default to .png.
                mime = content_type.split(';')[0].strip().lower()
                ext = _CONTENT_TYPE_EXT.get(mime, '.png')

            # Filename derived from the host, so re-fetching the same site
            # overwrites its previous logo instead of accumulating copies.
            domain = parsed_url.netloc.replace(':', '_').replace('.', '_')
            filename = f"logo_{domain}{ext}"
            filepath = os.path.join(save_dir, filename)

            with open(filepath, 'wb') as f:
                f.write(response.content)

            # Forward-slash relative path (for database storage).
            return f'/{filepath.replace(os.sep, "/")}'

        except Exception as e:
            print(f"下载Logo失败: {str(e)}")
            return None