Files
zjpb.net/utils/bookmark_parser.py
Jowe 9e47ebe749 release: v2.0 - 完整功能管理系统
主要功能:
- 完整的Flask-Admin后台管理系统
- 网站/标签/新闻管理功能
- 用户登录认证系统
- 科技感/未来风UI设计
- 标签分类系统(取代传统分类)
- 详情页面展示
- 数据库迁移脚本
- 书签导入解析工具

技术栈:
- Flask + SQLAlchemy
- Flask-Admin管理界面
- Bootstrap 4响应式设计
- 用户认证与权限管理

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-28 19:21:17 +08:00

205 lines
6.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""OneNav/Chrome书签HTML文件解析工具"""
from bs4 import BeautifulSoup
from typing import List, Dict
import re
class BookmarkParser:
"""解析OneNav/Chrome导出的书签HTML文件"""
def parse_html_file(self, html_content: str, debug=False) -> Dict[str, any]:
"""
解析OneNav/Chrome书签HTML文件
Args:
html_content: HTML文件内容
debug: 是否打印调试信息
Returns:
Dict: 包含 categories(标签列表) 和 sites(网站列表)
"""
soup = BeautifulSoup(html_content, 'html.parser')
categories = set() # 使用set去重
sites = []
# 找到第一个DL标签作为起点
first_dl = soup.find('dl')
if first_dl:
# 递归解析书签,收集分类和网站
self._parse_dl_tag(first_dl, categories, sites, current_category=None, debug=debug)
return {
'categories': sorted(list(categories)), # 转为排序的列表
'sites': sites
}
def parse_html_file_legacy(self, html_content: str) -> List[Dict[str, str]]:
"""
解析Chrome书签HTML文件旧版格式
Args:
html_content: HTML文件内容
Returns:
List[Dict]: 书签列表,每个书签包含 name, url, folder
"""
soup = BeautifulSoup(html_content, 'html.parser')
bookmarks = []
# 递归解析书签
self._parse_dl_tag_legacy(soup, bookmarks, folder_path="")
return bookmarks
def _parse_dl_tag(self, element, categories: set, sites: List[Dict], current_category: str, debug=False):
"""递归解析DL标签OneNav格式"""
# 查找所有DT标签不限制为直接子元素因为可能在p标签内
dt_list = element.find_all('dt')
if debug and dt_list:
print(f"Found {len(dt_list)} DT tags total")
for dt in dt_list:
# 检查是否是文件夹/分类
h3 = dt.find('h3', recursive=False)
if h3:
category_name = h3.get_text(strip=True)
# 跳过根节点和默认分类
if category_name not in ['OneNav', 'OneNav默认分类']:
categories.add(category_name)
if debug:
print(f" Category: {category_name}")
# 检查是否是书签链接并且不在子分类的DL中
a = dt.find('a', recursive=False)
if a and a.get('href'):
# 找到这个DT所属的最近的H3分类
parent_category = None
# 向上查找同级或父级的H3
prev = dt.find_previous('h3')
if prev:
parent_category = prev.get_text(strip=True)
# 跳过根节点和默认分类
if parent_category in ['OneNav', 'OneNav默认分类']:
parent_category = None
if parent_category:
url = a['href']
title = a.get_text(strip=True)
sites.append({
'title': title,
'url': url,
'category': parent_category,
'add_date': a.get('add_date', '')
})
if debug:
print(f" Site: {title} -> {parent_category}")
def _parse_dl_tag_legacy(self, element, bookmarks: List[Dict], folder_path: str):
"""递归解析DL标签Chrome旧格式"""
# 查找所有DT标签书签项
for dt in element.find_all('dt', recursive=False):
# 检查是否是文件夹
h3 = dt.find('h3', recursive=False)
if h3:
folder_name = h3.get_text(strip=True)
new_folder_path = f"{folder_path}/{folder_name}" if folder_path else folder_name
# 递归解析子文件夹
dl = dt.find('dl', recursive=False)
if dl:
self._parse_dl_tag_legacy(dl, bookmarks, new_folder_path)
# 检查是否是书签链接
a = dt.find('a', recursive=False)
if a and a.get('href'):
url = a['href']
name = a.get_text(strip=True)
bookmarks.append({
'name': name,
'url': url,
'folder': folder_path
})
def parse_url_list(self, text: str) -> List[Dict[str, str]]:
"""
解析纯文本URL列表
Args:
text: 文本内容每行一个URL
Returns:
List[Dict]: URL列表
"""
urls = []
lines = text.strip().split('\n')
for line in lines:
line = line.strip()
if not line or line.startswith('#'):
continue
# 简单的URL验证
if re.match(r'^https?://', line):
urls.append({
'url': line,
'name': '', # 名称留空,后续自动获取
'folder': ''
})
return urls
@staticmethod
def clean_title(title: str) -> str:
"""清理网站标题,提取网站名称"""
if not title:
return ''
# 去除HTML实体
title = re.sub(r'&amp;', '&', title)
title = re.sub(r'&lt;', '<', title)
title = re.sub(r'&gt;', '>', title)
title = re.sub(r'&quot;', '"', title)
# 常见的分隔符
separators = [' - ', ' | ', ' · ', '·', '', '']
for sep in separators:
if sep in title:
parts = title.split(sep)
# 过滤掉常见的无用部分
filtered_parts = []
skip_keywords = ['官网', '首页', 'official', 'home', 'page', 'website']
for part in parts:
part = part.strip()
if part and not any(kw in part.lower() for kw in skip_keywords):
filtered_parts.append(part)
if filtered_parts:
# 返回最短的部分(通常是网站名)
return min(filtered_parts, key=len)
# 去除一些常见的后缀词
suffixes = [
r'\s*官网\s*$', r'\s*首页\s*$',
r'\s*Official Site\s*$', r'\s*Home Page\s*$',
r'\s*Homepage\s*$', r'\s*Website\s*$'
]
for suffix in suffixes:
title = re.sub(suffix, '', title, flags=re.IGNORECASE)
return title.strip()
@staticmethod
def extract_domain(url: str) -> str:
"""从URL提取域名"""
match = re.search(r'https?://([^/]+)', url)
if match:
domain = match.group(1)
# 去除www前缀
domain = re.sub(r'^www\.', '', domain)
return domain
return url