"""OneNav/Chrome书签HTML文件解析工具"""
import html
import re
from typing import Any, Dict, List

from bs4 import BeautifulSoup
class BookmarkParser:
"""解析OneNav/Chrome导出的书签HTML文件"""
def parse_html_file(self, html_content: str, debug=False) -> Dict[str, any]:
"""
解析OneNav/Chrome书签HTML文件
Args:
html_content: HTML文件内容
debug: 是否打印调试信息
Returns:
Dict: 包含 categories(标签列表) 和 sites(网站列表)
"""
soup = BeautifulSoup(html_content, 'html.parser')
        categories = set()  # a set deduplicates category names
sites = []
        # Use the first DL tag as the parsing root
first_dl = soup.find('dl')
if first_dl:
            # Walk the bookmark tree, collecting categories and sites
            self._parse_dl_tag(first_dl, categories, sites, debug=debug)
return {
            'categories': sorted(categories),  # deduplicated, as a sorted list
'sites': sites
}
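
    # For reference, a minimal sketch of the Netscape-style bookmark HTML this
    # parser expects (the folder name, title, and URL below are made up):
    #
    #   <DL><p>
    #       <DT><H3>Dev Tools</H3>
    #       <DL><p>
    #           <DT><A HREF="https://example.com" ADD_DATE="0">Example</A>
    #       </DL><p>
    #   </DL><p>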
def parse_html_file_legacy(self, html_content: str) -> List[Dict[str, str]]:
"""
解析Chrome书签HTML文件(旧版格式)
Args:
html_content: HTML文件内容
Returns:
List[Dict]: 书签列表,每个书签包含 name, url, folder
"""
soup = BeautifulSoup(html_content, 'html.parser')
bookmarks = []
        # Recurse from the first DL tag; with html.parser the DT tags are
        # nested inside it, so starting from the soup object finds nothing
        first_dl = soup.find('dl')
        if first_dl:
            self._parse_dl_tag_legacy(first_dl, bookmarks, folder_path="")
return bookmarks
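
    # Example legacy result (illustrative values only):
    #   [{'name': 'Example', 'url': 'https://example.com', 'folder': 'Dev/Frontend'}]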
    def _parse_dl_tag(self, element, categories: set, sites: List[Dict], debug: bool = False):
        """Parse a DL tag and everything below it (OneNav format)."""
        # Collect every DT tag, not just direct children: DTs may sit inside
        # p tags, and html.parser nests unclosed DTs inside one another
dt_list = element.find_all('dt')
if debug and dt_list:
print(f"Found {len(dt_list)} DT tags total")
for dt in dt_list:
            # Is this DT a folder/category heading?
h3 = dt.find('h3', recursive=False)
if h3:
category_name = h3.get_text(strip=True)
                # Skip the root node and the default category
if category_name not in ['OneNav', 'OneNav默认分类']:
categories.add(category_name)
if debug:
print(f" Category: {category_name}")
            # Is this DT a bookmark link (a direct child, not one inside a sub-category's DL)?
a = dt.find('a', recursive=False)
if a and a.get('href'):
                # Attribute this bookmark to the nearest preceding H3 category
parent_category = None
                # find_previous searches backwards in document order
prev = dt.find_previous('h3')
if prev:
parent_category = prev.get_text(strip=True)
                    # Skip the root node and the default category
if parent_category in ['OneNav', 'OneNav默认分类']:
parent_category = None
if parent_category:
url = a['href']
title = a.get_text(strip=True)
sites.append({
'title': title,
'url': url,
'category': parent_category,
'add_date': a.get('add_date', '')
})
if debug:
print(f" Site: {title} -> {parent_category}")
def _parse_dl_tag_legacy(self, element, bookmarks: List[Dict], folder_path: str):
"""递归解析DL标签(Chrome旧格式)"""
        # Walk every DT whose nearest enclosing DL is this element; html.parser
        # nests unclosed <DT> tags inside each other, so recursive=False misses them
        for dt in element.find_all('dt'):
            if dt.find_parent('dl') is not element:
                continue
            # Is this DT a folder?
h3 = dt.find('h3', recursive=False)
if h3:
folder_name = h3.get_text(strip=True)
new_folder_path = f"{folder_path}/{folder_name}" if folder_path else folder_name
                # Recurse into the folder's own DL
dl = dt.find('dl', recursive=False)
if dl:
self._parse_dl_tag_legacy(dl, bookmarks, new_folder_path)
            # Is this DT a bookmark link?
a = dt.find('a', recursive=False)
if a and a.get('href'):
url = a['href']
name = a.get_text(strip=True)
bookmarks.append({
'name': name,
'url': url,
'folder': folder_path
})
def parse_url_list(self, text: str) -> List[Dict[str, str]]:
"""
解析纯文本URL列表
Args:
text: 文本内容,每行一个URL
Returns:
List[Dict]: URL列表
"""
urls = []
lines = text.strip().split('\n')
for line in lines:
line = line.strip()
if not line or line.startswith('#'):
continue
            # Basic URL validation
if re.match(r'^https?://', line):
urls.append({
'url': line,
                    'name': '',  # left empty; the title is fetched automatically later
'folder': ''
})
return urls
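
    # Example (hypothetical input): blank lines and '#' comments are skipped,
    # and only http(s) URLs are kept:
    #   parse_url_list("https://example.com\n# note\nftp://x")
    #   -> [{'url': 'https://example.com', 'name': '', 'folder': ''}]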
@staticmethod
def clean_title(title: str) -> str:
"""清理网站标题,提取网站名称"""
if not title:
return ''
        # Decode HTML entities such as &amp; and &lt;
        title = html.unescape(title)
        # Common title separators
separators = [' - ', ' | ', ' · ', '·', '|', ' — ']
for sep in separators:
if sep in title:
parts = title.split(sep)
                # Drop parts that are generic boilerplate
filtered_parts = []
skip_keywords = ['官网', '首页', 'official', 'home', 'page', 'website']
for part in parts:
part = part.strip()
if part and not any(kw in part.lower() for kw in skip_keywords):
filtered_parts.append(part)
if filtered_parts:
                    # Return the shortest part, which is usually the site name
return min(filtered_parts, key=len)
        # Strip common trailing boilerplate words
suffixes = [
r'\s*官网\s*$', r'\s*首页\s*$',
r'\s*Official Site\s*$', r'\s*Home Page\s*$',
r'\s*Homepage\s*$', r'\s*Website\s*$'
]
for suffix in suffixes:
title = re.sub(suffix, '', title, flags=re.IGNORECASE)
return title.strip()
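
    # Illustrative examples of the cleaning rules above:
    #   clean_title('GitHub - Where the world builds software')  -> 'GitHub'
    #   clean_title('Example 官网')                              -> 'Example'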
@staticmethod
def extract_domain(url: str) -> str:
"""从URL提取域名"""
match = re.search(r'https?://([^/]+)', url)
if match:
domain = match.group(1)
            # Strip the www. prefix
domain = re.sub(r'^www\.', '', domain)
return domain
return url
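

if __name__ == '__main__':
    # Minimal smoke test with a made-up OneNav-style export; the sample data
    # is illustrative, not a real OneNav file.
    sample = (
        '<DL><p>'
        '<DT><H3>Tools</H3>'
        '<DL><p><DT><A HREF="https://example.com" ADD_DATE="0">Example - 官网</A></DL><p>'
        '</DL><p>'
    )
    parser = BookmarkParser()
    result = parser.parse_html_file(sample, debug=True)
    print(result['categories'])   # expected: ['Tools']
    print(result['sites'])        # expected: one site attributed to 'Tools'
    print(BookmarkParser.clean_title('Example - 官网'))              # 'Example'
    print(BookmarkParser.extract_domain('https://www.example.com'))  # 'example.com'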