from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import random
import time
from bs4 import BeautifulSoup
import os
import re
import html2text
import requests
#import base64


# Custom HTML converter, subclassing html2text.HTML2Text
class CustomHTML2Text(html2text.HTML2Text):
    def handle_tag(self, tag, attrs, start):
        if tag == 'sub':
            # Drop <sub> markers; the enclosed text still passes through
            if start:
                self.o('')
            else:
                self.o('')
        elif tag == 'sup':
            # Drop <sup> markers; the enclosed text still passes through
            if start:
                self.o('')
            else:
                self.o('')
        elif tag == 'span' and attrs:
            style = dict(attrs).get('style', '')
            # Normalize whitespace so "text-decoration:line-through" matches with or without spaces
            if 'text-decoration:line-through' in style.replace(' ', ''):
                print(f"Detected line-through span: {attrs}")  # debug output
                if start:
                    self.o('~~')
                else:
                    self.o('~~')
            else:
                super().handle_tag(tag, attrs, start)
        else:
            super().handle_tag(tag, attrs, start)


def sanitize_filename(filename):
    # Strip characters that are not allowed in file names
    sanitized = re.sub(r'[\\/*?:"<>|]', '', filename)
    return sanitized


# Pre-process the HTML: replace spans styled with "text-decoration: line-through"
# by their text wrapped in ~~ so the strikethrough survives the Markdown conversion
def process_strikethrough(element):
    for span in element.find_all('span', style=lambda s: s and 'text-decoration:line-through' in s.replace(' ', '')):
        # Collect the span's inner content
        contents = []
        for content in span.contents:
            if isinstance(content, str):
                text = content.strip()
                if text:
                    contents.append(text)
            elif content.name == 'br':
                contents.append('\n')
            elif content.name == 'span':
                # Nested spans: keep only their plain text
                inner_text = content.get_text().strip()
                if inner_text:
                    contents.append(inner_text)
            else:
                inner_text = str(content).strip()
                if inner_text:
                    contents.append(inner_text)

        # Re-assemble the processed content
        if contents:
            # Split on line breaks and wrap each part in its own strikethrough
            parts = '\n'.join([f"~~{part.strip()}~~" for part in ' '.join(contents).split('\n') if part.strip()])
            new_content = BeautifulSoup(parts, 'html.parser')
            span.replace_with(new_content)
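# A minimal sketch (not part of the scraping flow) of how the pieces above interact,
# using a hypothetical HTML snippet; uncomment to try it standalone:
#
#   sample = '<p>old price <span style="text-decoration: line-through">100</span> new price 80</p>'
#   sample_soup = BeautifulSoup(sample, 'html.parser')
#   process_strikethrough(sample_soup)          # span becomes plain "~~100~~" text
#   demo_converter = CustomHTML2Text()
#   demo_converter.body_width = 0
#   print(demo_converter.handle(str(sample_soup)))  # roughly: "old price ~~100~~ new price 80"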
def download_image(url, save_path):
    # Request headers (the Referer is needed so the site serves its images)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Referer": "https://www.soujianzhu.cn/",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "Windows"
    }

    # Turn site-relative URLs into absolute ones
    if not url.startswith(('http://', 'https://')):
        if 'www.soujianzhu.cn' not in url:
            url = 'https://www.soujianzhu.cn' + url

    try:
        # Send the HTTP request to download the image
        response = requests.get(url, headers=headers, timeout=30)

        # Check the response status code
        if response.status_code != 200:
            print(f"Download failed, status code: {response.status_code}")
            return False

        # Write the image content to disk
        with open(save_path, 'wb') as f:
            f.write(response.content)

        # Verify the file was saved successfully
        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            return True
        else:
            print("Failed to save file")
            return False
    except Exception as e:
        print(f"Error while downloading image: {str(e)}")
        return False


def save_lemma_content_as_markdown(driver, title_name):
    # Get the rendered HTML
    html = driver.page_source

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Extract the main text content of the page
    content_div = soup.find('div', class_='lemma-main-content')
    if content_div:
        # Create a folder for downloaded images
        os.makedirs('images', exist_ok=True)

        # Download the images and rewrite their src attributes
        downloaded_images = []
        for img in content_div.find_all('img'):
            img_url = img.get('src')
            if not img_url:
                continue
            img_name = os.path.basename(img_url)
            img_path = os.path.join('images', img_name)
            try:
                if download_image(img_url, img_path):
                    img['src'] = img_path
                    downloaded_images.append((img_url, img_path))
                else:
                    print(f"Failed to save image {img_url}")
            except Exception as e:
                print(f"Failed to save image {img_url}: {e}")

        # Find all <a> tags
        links = content_div.find_all('a')

        # Collect their title and href attributes
        links_info = []
        for link in links:
            title = link.get('title')
            href = link.get('href')
            if title and href:
                links_info.append({'title': title, 'href': href})

        # Print the collected links
        for info in links_info:
            print(f"Title: {info['title']}, Href: {info['href']}")

        # Replace strikethrough spans before converting content_div
        process_strikethrough(content_div)

        # Convert the extracted content to Markdown
        html_content = content_div.prettify()

        # Create an instance of the custom html2text converter
        converter = CustomHTML2Text()
        converter.ignore_links = False   # keep links
        converter.ignore_images = False  # keep images
        converter.body_width = 0         # 0 disables automatic line wrapping

        markdown_content = converter.handle(html_content)

        # Rewrite image URLs in the Markdown output to point at the local copies
        for original_url, new_url in downloaded_images:
            markdown_content = markdown_content.replace(original_url, new_url)

        with open(title_name + '.md', 'w', encoding='utf-8') as file:
            file.write(markdown_content)

        print(f'Content successfully saved as Markdown file: {title_name}.md')
    else:
        print('No div element with the specified class name was found.')


# Configure Chrome options to attach to an already-running browser instance
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# Path to the Chrome driver
chrome_driver_path = r'C:\path\to\chromedriver.exe'  # replace with your chromedriver path

# Start the driver and take over the running Chrome instance
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Wait for the page to finish loading
try:
    WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.ID, 'sideToolbar'))
    )
except Exception as e:
    print(f"Page load timed out: {e}")
    driver.quit()
    exit()

# Get the page title and sanitize it for use as a file name
title_name = driver.title
sanitized_title_name = sanitize_filename(title_name)

# Save the page content as a Markdown file
save_lemma_content_as_markdown(driver, sanitized_title_name)

# Close the browser
driver.quit()
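# Usage note (the port and profile path below are examples, adjust to your setup).
# The script does not launch Chrome itself; it attaches to an instance started
# beforehand with remote debugging enabled, e.g.:
#   chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\chrome-debug-profile"
# Open the target lemma page in that browser, then run this script; it connects
# through the "debuggerAddress" option above and saves the page as <page title>.md.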