Files
zjpb.net/utils/website_fetcher.py
Jowe 2fbca6ebc7 feat: 完成全站UI优化 - 科技感/未来风设计
- 前台页面全面升级为Tailwind CSS框架
- 引入Google Fonts (Space Grotesk, Noto Sans)
- 主色调更新为#25c0f4 (cyan blue)
- 实现玻璃态效果和渐变背景
- 优化首页网格卡片布局和悬停动画
- 优化详情页双栏布局和渐变Logo光晕
- 优化管理员登录页,添加科技网格背景
- Flask-Admin后台完整深色主题
- 统一Material Symbols图标系统
- 网站自动抓取功能界面优化

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-27 22:45:09 +08:00

169 lines
5.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
网站信息抓取工具
"""
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
from PIL import Image
from io import BytesIO
class WebsiteFetcher:
    """Best-effort fetcher for basic website metadata (title, description, logo).

    All public methods swallow errors and return ``None`` on failure rather
    than raising, so callers can treat a missing result as "could not fetch".
    """

    # Maps common image content-types to file extensions. Used by
    # download_logo() when the logo URL itself carries no usable extension.
    _CONTENT_TYPE_EXT = {
        'image/png': '.png',
        'image/jpeg': '.jpg',
        'image/gif': '.gif',
        'image/webp': '.webp',
        'image/svg+xml': '.svg',
        'image/x-icon': '.ico',
        'image/vnd.microsoft.icon': '.ico',
    }

    def __init__(self, timeout=10):
        """
        Args:
            timeout: per-request timeout in seconds for all HTTP calls.
        """
        self.timeout = timeout
        # A desktop UA: some sites serve stripped/blocked pages to unknown agents.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def fetch_website_info(self, url):
        """Fetch a site's title, description and logo URL.

        Args:
            url: website URL; the scheme is optional (https:// is assumed).

        Returns:
            dict with keys ``name``, ``description``, ``logo_url``
            (each possibly empty), or ``None`` on any failure.
        """
        try:
            # Ensure the URL carries a scheme; default to HTTPS.
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                allow_redirects=True,
            )
            response.raise_for_status()
            # Let requests sniff the real encoding — many sites (especially
            # CJK ones) mislabel or omit the charset in their headers.
            response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')
            return {
                'name': self._extract_title(soup),
                'description': self._extract_description(soup),
                'logo_url': self._extract_logo(soup, url),
            }
        except Exception as e:
            # Best-effort contract: report and signal failure with None.
            print(f"抓取网站信息失败: {str(e)}")
            return None

    def _extract_title(self, soup):
        """Return the page title: og:title first, then <title>, else ''."""
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()
        return ''

    def _extract_description(self, soup):
        """Return the page description.

        Preference order: og:description, meta description, then meta
        keywords as a last-resort fallback; '' if none are present.
        """
        og_desc = soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            return meta_keywords['content'].strip()
        return ''

    def _extract_logo(self, soup, base_url):
        """Return an absolute logo URL for the page.

        Preference order: og:image, any <link rel="...icon...">,
        apple-touch-icon, then the conventional /favicon.ico.
        """
        logo_url = None

        # 1. og:image
        og_image = soup.find('meta', property='og:image')
        if og_image and og_image.get('content'):
            logo_url = og_image['content']

        # 2. <link rel="icon"> / "shortcut icon" — rel may be parsed as a
        #    string or a list of tokens depending on the parser, so match both.
        if not logo_url:
            icon_link = soup.find('link', rel=lambda x: x and ('icon' in x.lower() if isinstance(x, str) else 'icon' in ' '.join(x).lower()))
            if icon_link and icon_link.get('href'):
                logo_url = icon_link['href']

        # 3. apple-touch-icon
        if not logo_url:
            apple_icon = soup.find('link', rel='apple-touch-icon')
            if apple_icon and apple_icon.get('href'):
                logo_url = apple_icon['href']

        # 4. Fall back to the conventional favicon location.
        if not logo_url:
            logo_url = '/favicon.ico'

        # Resolve relative references against the page URL.
        if logo_url:
            logo_url = urljoin(base_url, logo_url)
        return logo_url

    def download_logo(self, logo_url, save_dir='static/uploads'):
        """Download a logo image and store it under *save_dir*.

        Args:
            logo_url: absolute URL of the logo image.
            save_dir: directory to save into (created if missing).

        Returns:
            str: web-style relative path to the saved file (leading '/',
            forward slashes) for database storage, or ``None`` on failure
            or when the response is not an image.
        """
        if not logo_url:
            return None
        try:
            os.makedirs(save_dir, exist_ok=True)

            response = requests.get(logo_url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            # Reject non-image responses. Normalize case and strip any
            # parameters ("image/png; charset=utf-8") before checking —
            # the raw prefix test would wrongly reject e.g. "IMAGE/PNG".
            content_type = response.headers.get('content-type', '').split(';')[0].strip().lower()
            if not content_type.startswith('image/'):
                return None

            parsed_url = urlparse(logo_url)
            ext = os.path.splitext(parsed_url.path)[1].lower()
            if not ext or len(ext) > 5:
                # No usable extension in the URL: derive one from the
                # Content-Type instead of blindly assuming PNG.
                ext = self._CONTENT_TYPE_EXT.get(content_type, '.png')

            # Use the (sanitized) domain as a stable filename; a re-fetch
            # for the same site deliberately overwrites the old logo.
            domain = parsed_url.netloc.replace(':', '_').replace('.', '_')
            filename = f"logo_{domain}{ext}"
            filepath = os.path.join(save_dir, filename)

            with open(filepath, 'wb') as f:
                f.write(response.content)

            # Web-style relative path (used for database storage).
            return f'/{filepath.replace(os.sep, "/")}'
        except Exception as e:
            print(f"下载Logo失败: {str(e)}")
            return None