add parser
This commit is contained in:
		| @@ -0,0 +1,5 @@ | ||||
| ### 接管打开的chrome | ||||
|  | ||||
| C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe --remote-debugging-port=9222 | ||||
|  | ||||
|  | ||||
							
								
								
									
										225
									
								
								download2markdown - one.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										225
									
								
								download2markdown - one.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,225 @@ | ||||
| from selenium import webdriver | ||||
| from selenium.webdriver.chrome.service import Service | ||||
| from selenium.webdriver.chrome.options import Options | ||||
| from selenium.webdriver.common.by import By | ||||
| from selenium.webdriver.support.ui import WebDriverWait | ||||
| from selenium.webdriver.support import expected_conditions as EC | ||||
| from selenium.webdriver.common.action_chains import ActionChains | ||||
| import random | ||||
| import time | ||||
| from bs4 import BeautifulSoup | ||||
| import os | ||||
| import re | ||||
| import html2text | ||||
| import requests | ||||
| #import base64 | ||||
|  | ||||
# Custom HTML-to-Markdown converter: keeps <sub>/<sup> as raw HTML tags
# (Markdown has no native sub/superscript) and emits Markdown ~~strikethrough~~
# for spans styled with text-decoration: line-through.
class CustomHTML2Text(html2text.HTML2Text):
    def handle_tag(self, tag, attrs, start):
        """Override tag handling for sub/sup and struck-through spans.

        tag/attrs/start follow the html2text.HTML2Text.handle_tag contract;
        anything we do not special-case is delegated to the base class.
        """
        if tag == 'sub':
            # Pass the HTML tag through verbatim.
            self.o(r'<sub>' if start else r'</sub>')
        elif tag == 'sup':
            self.o(r'<sup>' if start else r'</sup>')
        elif tag == 'span' and attrs:
            style = dict(attrs).get('style', '')
            # BUG FIX: normalize whitespace so both "line-through" and
            # "line- through"-style spellings match, consistent with
            # process_strikethrough() elsewhere in this file (the old check
            # required exactly 'text-decoration: line-through' with a space).
            if 'text-decoration:line-through' in style.replace(' ', ''):
                print(f"Detected line-through span: {attrs}")  # debug output
                # The same ~~ delimiter both opens and closes strikethrough.
                self.o('~~')
            else:
                super().handle_tag(tag, attrs, start)
        else:
            super().handle_tag(tag, attrs, start)
|  | ||||
|  | ||||
def sanitize_filename(filename):
    """Return *filename* with characters that are illegal in Windows file names removed."""
    forbidden = '\\/*?:"<>|'
    return filename.translate(str.maketrans('', '', forbidden))
|  | ||||
# Pre-process the HTML tree: replace every <span> styled with
# text-decoration: line-through by plain text wrapped in Markdown ~~ markers.
def process_strikethrough(element):
    """Rewrite struck-through spans inside *element* (the tree is mutated in place)."""

    def _has_line_through(style):
        # Normalize whitespace so the property matches with or without spaces.
        return bool(style) and 'text-decoration:line-through' in style.replace(' ', '')

    for span in element.find_all('span', style=_has_line_through):
        # Collect the span's textual pieces; <br> becomes a '\n' marker.
        fragments = []
        for node in span.contents:
            if isinstance(node, str):
                text = node.strip()
                if text:
                    fragments.append(text)
            elif node.name == 'br':
                fragments.append('\n')
            elif node.name == 'span':
                # Flatten nested spans to their text content.
                nested = node.get_text().strip()
                if nested:
                    fragments.append(nested)
            else:
                raw = str(node).strip()
                if raw:
                    fragments.append(raw)

        if not fragments:
            continue

        # Strike each line separately: join the pieces, split on the <br>
        # markers, and wrap every non-empty line in ~~...~~.
        lines = [seg.strip() for seg in ' '.join(fragments).split('\n') if seg.strip()]
        markdown = '\n'.join(f"~~{seg}~~" for seg in lines)
        span.replace_with(BeautifulSoup(markdown, 'html.parser'))
|  | ||||
| # ...existing code... | ||||
|  | ||||
|  | ||||
def download_image(url, save_path):
    """Download the image at *url* to *save_path*.

    Returns True when the file was written and is non-empty, False otherwise.
    All failures are reported via print and swallowed (best-effort download).
    """
    # Browser-like headers: the image host appears to reject requests
    # without a matching Referer / User-Agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Referer": "https://www.soujianzhu.cn/",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "Windows"
    }
    # Normalize relative and protocol-relative URLs to absolute ones.
    if url.startswith('//'):
        # BUG FIX: protocol-relative URLs were previously left without a
        # scheme, so requests.get() would fail on them.
        url = 'https:' + url
    elif not url.startswith(('http://', 'https://')):
        if 'www.soujianzhu.cn' not in url:
            url = 'https://www.soujianzhu.cn' + url
    try:
        # BUG FIX: the headers dict was built but never sent; pass it so the
        # anti-hotlinking Referer/User-Agent actually reach the server.
        response = requests.get(url, headers=headers, timeout=30)

        # Anything other than 200 is treated as a failed download.
        if response.status_code != 200:
            print(f"下载失败,状态码: {response.status_code}")
            return False

        # Write the image bytes to disk.
        with open(save_path, 'wb') as f:
            f.write(response.content)

        # Verify the file landed and is non-empty.
        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            return True
        print("文件保存失败")
        return False

    except Exception as e:
        print(f"下载图片时发生错误: {str(e)}")
        return False
|  | ||||
|  | ||||
|  | ||||
def save_lemma_content_as_markdown(driver, title_name):
    """Convert the lemma page currently loaded in *driver* to a Markdown file.

    driver: a selenium WebDriver whose current page is already rendered.
    title_name: sanitized file name (without extension) for the output .md.
    Images are mirrored into ./images/ and the Markdown references the local copies.
    """
    # Grab the JavaScript-rendered HTML, not the raw server response.
    html = driver.page_source

    soup = BeautifulSoup(html, 'html.parser')

    # The article body lives in <div class="lemma-main-content">.
    content_div = soup.find('div', class_='lemma-main-content')

    if content_div:
        # Folder for locally mirrored images.
        os.makedirs('images', exist_ok=True)

        # Download every image and repoint its src at the local copy.
        downloaded_images = []
        for img in content_div.find_all('img'):
            # BUG FIX: use .get() so an <img> without a src attribute is
            # skipped instead of raising an uncaught KeyError.
            img_url = img.get('src')
            if not img_url:
                continue
            img_name = os.path.basename(img_url)
            img_path = os.path.join('images', img_name)
            try:
                if download_image(img_url, img_path):
                    img['src'] = img_path
                    downloaded_images.append((img_url, img_path))
                else:
                    print(f"Failed to save image {img_url}")
            except Exception as e:
                print(f"Failed to save image {img_url}: {e}")

        # Collect and log every link (title + href) found in the article.
        links = content_div.find_all('a')
        links_info = []
        for link in links:
            title = link.get('title')
            href = link.get('href')
            if title and href:
                links_info.append({'title': title, 'href': href})
        for info in links_info:
            print(f"Title: {info['title']}, Href: {info['href']}")

        # Rewrite struck-through spans before handing the HTML to html2text.
        process_strikethrough(content_div)

        html_content = content_div.prettify()

        # html2text converter tuned to keep links/images and avoid hard wrapping.
        converter = CustomHTML2Text()
        converter.ignore_links = False   # keep hyperlinks
        converter.ignore_images = False  # keep image references
        converter.body_width = 0         # 0 disables automatic line wrapping

        markdown_content = converter.handle(html_content)

        # Point any remaining URL references at the downloaded local files.
        for original_url, new_url in downloaded_images:
            markdown_content = markdown_content.replace(original_url, new_url)

        with open(title_name + '.md', 'w', encoding='utf-8') as file:
            file.write(markdown_content)

        print(f'内容已成功保存为 Markdown 文件: {title_name}.md')
    else:
        print('未找到具有指定类名的 div 元素。')
|  | ||||
|  | ||||
# --- Script entry: attach to an already-running Chrome instance. ---
# Chrome must have been started with --remote-debugging-port=9222
# (see the note at the top of this repository's README).
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# Path to the chromedriver binary.
chrome_driver_path = r'C:\path\to\chromedriver.exe'  # replace with your chromedriver path

# Launch the driver and take over the running browser instance.
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Wait up to 60s for the page to finish loading (the side toolbar appears).
try:
    WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.ID, 'sideToolbar'))
    )
except Exception as e:
    print(f"页面加载超时: {e}")
    driver.quit()
    exit()  # NOTE(review): sys.exit() would be safer than the site builtin exit()

# Derive a legal file name from the page title.
title_name = driver.title
sanitized_title_name = sanitize_filename(title_name)

# Save the page content as a Markdown file.
save_lemma_content_as_markdown(driver, sanitized_title_name)

# Close the browser session.
driver.quit()
		Reference in New Issue
	
	Block a user