"""OneNav/Chrome书签HTML文件解析工具""" from bs4 import BeautifulSoup from typing import List, Dict import re class BookmarkParser: """解析OneNav/Chrome导出的书签HTML文件""" def parse_html_file(self, html_content: str, debug=False) -> Dict[str, any]: """ 解析OneNav/Chrome书签HTML文件 Args: html_content: HTML文件内容 debug: 是否打印调试信息 Returns: Dict: 包含 categories(标签列表) 和 sites(网站列表) """ soup = BeautifulSoup(html_content, 'html.parser') categories = set() # 使用set去重 sites = [] # 找到第一个DL标签作为起点 first_dl = soup.find('dl') if first_dl: # 递归解析书签,收集分类和网站 self._parse_dl_tag(first_dl, categories, sites, current_category=None, debug=debug) return { 'categories': sorted(list(categories)), # 转为排序的列表 'sites': sites } def parse_html_file_legacy(self, html_content: str) -> List[Dict[str, str]]: """ 解析Chrome书签HTML文件(旧版格式) Args: html_content: HTML文件内容 Returns: List[Dict]: 书签列表,每个书签包含 name, url, folder """ soup = BeautifulSoup(html_content, 'html.parser') bookmarks = [] # 递归解析书签 self._parse_dl_tag_legacy(soup, bookmarks, folder_path="") return bookmarks def _parse_dl_tag(self, element, categories: set, sites: List[Dict], current_category: str, debug=False): """递归解析DL标签(OneNav格式)""" # 查找所有DT标签(不限制为直接子元素,因为可能在p标签内) dt_list = element.find_all('dt') if debug and dt_list: print(f"Found {len(dt_list)} DT tags total") for dt in dt_list: # 检查是否是文件夹/分类 h3 = dt.find('h3', recursive=False) if h3: category_name = h3.get_text(strip=True) # 跳过根节点和默认分类 if category_name not in ['OneNav', 'OneNav默认分类']: categories.add(category_name) if debug: print(f" Category: {category_name}") # 检查是否是书签链接(并且不在子分类的DL中) a = dt.find('a', recursive=False) if a and a.get('href'): # 找到这个DT所属的最近的H3分类 parent_category = None # 向上查找同级或父级的H3 prev = dt.find_previous('h3') if prev: parent_category = prev.get_text(strip=True) # 跳过根节点和默认分类 if parent_category in ['OneNav', 'OneNav默认分类']: parent_category = None if parent_category: url = a['href'] title = a.get_text(strip=True) sites.append({ 'title': title, 'url': url, 'category': parent_category, 'add_date': a.get('add_date', '') }) if debug: print(f" Site: {title} -> {parent_category}") def _parse_dl_tag_legacy(self, element, bookmarks: List[Dict], folder_path: str): """递归解析DL标签(Chrome旧格式)""" # 查找所有DT标签(书签项) for dt in element.find_all('dt', recursive=False): # 检查是否是文件夹 h3 = dt.find('h3', recursive=False) if h3: folder_name = h3.get_text(strip=True) new_folder_path = f"{folder_path}/{folder_name}" if folder_path else folder_name # 递归解析子文件夹 dl = dt.find('dl', recursive=False) if dl: self._parse_dl_tag_legacy(dl, bookmarks, new_folder_path) # 检查是否是书签链接 a = dt.find('a', recursive=False) if a and a.get('href'): url = a['href'] name = a.get_text(strip=True) bookmarks.append({ 'name': name, 'url': url, 'folder': folder_path }) def parse_url_list(self, text: str) -> List[Dict[str, str]]: """ 解析纯文本URL列表 Args: text: 文本内容,每行一个URL Returns: List[Dict]: URL列表 """ urls = [] lines = text.strip().split('\n') for line in lines: line = line.strip() if not line or line.startswith('#'): continue # 简单的URL验证 if re.match(r'^https?://', line): urls.append({ 'url': line, 'name': '', # 名称留空,后续自动获取 'folder': '' }) return urls @staticmethod def clean_title(title: str) -> str: """清理网站标题,提取网站名称""" if not title: return '' # 去除HTML实体 title = re.sub(r'&', '&', title) title = re.sub(r'<', '<', title) title = re.sub(r'>', '>', title) title = re.sub(r'"', '"', title) # 常见的分隔符 separators = [' - ', ' | ', ' · ', '·', '|', ' — '] for sep in separators: if sep in title: parts = title.split(sep) # 过滤掉常见的无用部分 filtered_parts = [] skip_keywords = ['官网', '首页', 'official', 
'home', 'page', 'website'] for part in parts: part = part.strip() if part and not any(kw in part.lower() for kw in skip_keywords): filtered_parts.append(part) if filtered_parts: # 返回最短的部分(通常是网站名) return min(filtered_parts, key=len) # 去除一些常见的后缀词 suffixes = [ r'\s*官网\s*$', r'\s*首页\s*$', r'\s*Official Site\s*$', r'\s*Home Page\s*$', r'\s*Homepage\s*$', r'\s*Website\s*$' ] for suffix in suffixes: title = re.sub(suffix, '', title, flags=re.IGNORECASE) return title.strip() @staticmethod def extract_domain(url: str) -> str: """从URL提取域名""" match = re.search(r'https?://([^/]+)', url) if match: domain = match.group(1) # 去除www前缀 domain = re.sub(r'^www\.', '', domain) return domain return url
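

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module's API):
# the sample HTML below is a hypothetical, well-formed fragment mimicking a
# OneNav export. Real exports usually omit closing </DT>/</DL> tags, which the
# find_previous('h3') lookup above is meant to tolerate.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    sample_html = """
    <DL><p>
        <DT><H3>OneNav</H3>
            <DL><p>
                <DT><H3>Dev Tools</H3>
                    <DL><p>
                        <DT><A HREF="https://www.example.com/docs" ADD_DATE="1700000000">Example - Official</A></DT>
                    </DL><p>
                </DT>
            </DL><p>
        </DT>
    </DL><p>
    """

    parser = BookmarkParser()
    result = parser.parse_html_file(sample_html)
    print(result['categories'])                          # ['Dev Tools']
    for site in result['sites']:
        print(site['title'], '->', site['category'])     # Example - Official -> Dev Tools

    # The static helpers can also be used on their own
    print(BookmarkParser.clean_title('Example - Official'))              # Example
    print(BookmarkParser.extract_domain('https://www.example.com/x'))    # example.com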