add parser
This commit is contained in:
		@@ -0,0 +1,5 @@
 | 
				
			|||||||
 | 
					### 接管打开的chrome
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe --remote-debugging-port=9222
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										225
									
								
								download2markdown - one.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										225
									
								
								download2markdown - one.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,225 @@
 | 
				
			|||||||
 | 
					from selenium import webdriver
 | 
				
			||||||
 | 
					from selenium.webdriver.chrome.service import Service
 | 
				
			||||||
 | 
					from selenium.webdriver.chrome.options import Options
 | 
				
			||||||
 | 
					from selenium.webdriver.common.by import By
 | 
				
			||||||
 | 
					from selenium.webdriver.support.ui import WebDriverWait
 | 
				
			||||||
 | 
					from selenium.webdriver.support import expected_conditions as EC
 | 
				
			||||||
 | 
					from selenium.webdriver.common.action_chains import ActionChains
 | 
				
			||||||
 | 
					import random
 | 
				
			||||||
 | 
					import time
 | 
				
			||||||
 | 
					from bs4 import BeautifulSoup
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					import re
 | 
				
			||||||
 | 
					import html2text
 | 
				
			||||||
 | 
					import requests
 | 
				
			||||||
 | 
					#import base64
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Custom HTML converter class, derived from html2text.HTML2Text.
					class CustomHTML2Text(html2text.HTML2Text):
					    """html2text converter that keeps sub/superscripts and strikethrough.

					    * ``<sub>`` / ``<sup>`` are passed through as literal HTML tags.
					    * ``<span>`` elements whose inline style contains
					      ``text-decoration: line-through`` are wrapped in ``~~``.

					    Every other tag is delegated to ``html2text.HTML2Text.handle_tag``.
					    """

					    def handle_tag(self, tag, attrs, start):
					        """Emit Markdown/HTML for one opening or closing tag.

					        Args:
					            tag: Lower-case tag name.
					            attrs: Attributes of the tag. Closing tags arrive with empty
					                attributes, so the strikethrough decision made on the
					                opening tag is remembered on a stack.
					            start: True for an opening tag, False for a closing tag.
					        """
					        if tag in ('sub', 'sup'):
					            self.o(f'<{tag}>' if start else f'</{tag}>')
					            return

					        if tag == 'span':
					            # Created lazily so no __init__ override is needed.
					            stack = getattr(self, '_strike_spans', None)
					            if stack is None:
					                stack = self._strike_spans = []

					            if start:
					                # A valueless ``style`` attribute yields None; treat as ''.
					                style = (dict(attrs).get('style') if attrs else '') or ''
					                # Ignore spaces so "text-decoration:line-through" and
					                # "text-decoration: line-through" both match.
					                struck = 'text-decoration:line-through' in style.replace(' ', '')
					                stack.append(struck)
					            else:
					                # Guard against unbalanced markup.
					                struck = stack.pop() if stack else False

					            if struck:
					                self.o('~~')
					                return

					        super().handle_tag(tag, attrs, start)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def sanitize_filename(filename):
					    """Return ``filename`` made safe for use as a file name.

					    Removes the characters Windows forbids in file names
					    (``\\ / * ? : " < > |``) and ASCII control characters, then trims
					    surrounding whitespace and trailing dots/spaces.

					    Args:
					        filename: Proposed name, e.g. a page title.

					    Returns:
					        The cleaned name, or ``'untitled'`` if nothing usable remains.
					    """
					    # Reserved characters plus control characters (newlines, tabs, ...).
					    sanitized = re.sub(r'[\\/*?:"<>|\x00-\x1f]', '', filename)
					    # Names ending in a dot or space are not usable on Windows.
					    sanitized = sanitized.strip().rstrip('. ')
					    return sanitized or 'untitled'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Pre-process the HTML: replace every span styled with
					# text-decoration: line-through by text wrapped in ~~.
					def process_strikethrough(element):
					    """Replace line-through ``<span>`` elements under ``element`` in place.

					    For each matching span the direct children are flattened to text
					    fragments: strings are stripped, ``<br>`` becomes a newline, nested
					    spans contribute their text, and any other tag contributes its markup.
					    Each resulting non-empty line is wrapped in ``~~`` and the span is
					    replaced by the parsed result. Spans that yield no fragments are left
					    untouched.

					    Args:
					        element: BeautifulSoup tag whose descendants are searched.
					    """
					    def _is_struck(style):
					        # Compare with spaces removed so spacing variants all match.
					        return style and 'text-decoration:line-through' in style.replace(' ', '')

					    for span in element.find_all('span', style=_is_struck):
					        fragments = []
					        for child in span.contents:
					            if isinstance(child, str):
					                piece = child.strip()
					            elif child.name == 'br':
					                fragments.append('\n')
					                continue
					            elif child.name == 'span':
					                # Nested span: keep only its text.
					                piece = child.get_text().strip()
					            else:
					                piece = str(child).strip()
					            if piece:
					                fragments.append(piece)

					        if not fragments:
					            continue

					        # Strike each line separately so the markers never span a line break.
					        struck_lines = []
					        for line in ' '.join(fragments).split('\n'):
					            line = line.strip()
					            if line:
					                struck_lines.append(f"~~{line}~~")

					        span.replace_with(BeautifulSoup('\n'.join(struck_lines), 'html.parser'))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ...existing code...
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def download_image(url, save_path):
					    """Download an image and write it to ``save_path``.

					    URLs without an ``http://`` / ``https://`` scheme are resolved against
					    ``https://www.soujianzhu.cn/``. The request is sent with browser-like
					    headers, including a ``Referer`` for that site.

					    Args:
					        url: Absolute or site-relative image URL.
					        save_path: Destination file path.

					    Returns:
					        True if the response had status 200 and a non-empty body that was
					        written to ``save_path``; False otherwise. Request and file errors
					        are printed, not raised.
					    """
					    from urllib.parse import urljoin

					    # Request headers.
					    headers = {
					        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
					        "Referer": "https://www.soujianzhu.cn/",
					        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
					        "Sec-Ch-Ua-Mobile": "?0",
					        "Sec-Ch-Ua-Platform": "Windows"
					    }

					    if not url.startswith(('http://', 'https://')):
					        if url.startswith('www.soujianzhu.cn'):
					            # Host given without a scheme.
					            url = 'https://' + url
					        else:
					            # Handles "/path", "path" and protocol-relative "//host/path".
					            url = urljoin('https://www.soujianzhu.cn/', url)

					    try:
					        # The headers were previously built but never sent.
					        response = requests.get(url, headers=headers, timeout=30)

					        # Check the response status code.
					        if response.status_code != 200:
					            print(f"下载失败,状态码: {response.status_code}")
					            return False

					        # Refuse an empty body before touching the disk, so no
					        # zero-byte file is left behind.
					        if not response.content:
					            print("文件保存失败")
					            return False

					        with open(save_path, 'wb') as f:
					            f.write(response.content)
					        return True

					    except (requests.RequestException, OSError) as e:
					        print(f"下载图片时发生错误: {str(e)}")
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def save_lemma_content_as_markdown(driver, title_name):
					    """Save the page's ``div.lemma-main-content`` as ``<title_name>.md``.

					    Images inside the content are downloaded into ``images/`` and their
					    ``src`` is rewritten to the local copy. Links that have both a title
					    and an href are printed. Line-through spans are pre-processed, and the
					    HTML is then converted with ``CustomHTML2Text``.

					    Args:
					        driver: Selenium WebDriver whose ``page_source`` is read.
					        title_name: Output file name without the ``.md`` extension.
					    """
					    from urllib.parse import urlparse

					    # Parse the rendered HTML.
					    soup = BeautifulSoup(driver.page_source, 'html.parser')
					    content_div = soup.find('div', class_='lemma-main-content')
					    if not content_div:
					        print('未找到具有指定类名的 div 元素。')
					        return

					    # Folder for downloaded images.
					    os.makedirs('images', exist_ok=True)

					    # Download images and point their src at the local copy.
					    downloaded_images = []
					    for img in content_div.find_all('img'):
					        img_url = img.get('src')
					        # Skip images without a src and inline data: URIs, which have
					        # nothing to download.
					        if not img_url or img_url.startswith('data:'):
					            continue

					        # Build the file name from the URL path only (no query string or
					        # fragment) and drop characters that are illegal in file names.
					        raw_name = os.path.basename(urlparse(img_url).path)
					        img_name = sanitize_filename(raw_name) if raw_name else ''
					        if not img_name:
					            print(f"Failed to save image {img_url}")
					            continue

					        img_path = os.path.join('images', img_name)
					        # Markdown paths use forward slashes on every platform.
					        markdown_src = f'images/{img_name}'
					        try:
					            if download_image(img_url, img_path):
					                img['src'] = markdown_src
					                downloaded_images.append((img_url, markdown_src))
					            else:
					                print(f"Failed to save image {img_url}")
					        except Exception as e:
					            print(f"Failed to save image {img_url}: {e}")

					    # Report every link that has both a title and an href.
					    for link in content_div.find_all('a'):
					        title = link.get('title')
					        href = link.get('href')
					        if title and href:
					            print(f"Title: {title}, Href: {href}")

					    # Rewrite strikethrough spans before conversion.
					    process_strikethrough(content_div)

					    # Serialize with str() rather than prettify(): prettify() adds
					    # newlines around inline tags, which alters the converted text.
					    html_content = str(content_div)

					    converter = CustomHTML2Text()
					    converter.ignore_links = False   # keep links
					    converter.ignore_images = False  # keep images
					    converter.body_width = 0         # 0 disables automatic line wrapping

					    markdown_content = converter.handle(html_content)

					    # Repoint any remaining references to a downloaded image's original
					    # URL (e.g. a link wrapping the image) at the local copy.
					    for original_url, new_url in downloaded_images:
					        markdown_content = markdown_content.replace(original_url, new_url)

					    with open(title_name + '.md', 'w', encoding='utf-8') as file:
					        file.write(markdown_content)

					    print(f'内容已成功保存为 Markdown 文件: {title_name}.md')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Script entry point: attach to a Chrome instance started with
					# --remote-debugging-port=9222 and save its current page as Markdown.
					def main():
					    """Attach to the running Chrome and export the current page.

					    Waits up to 60 seconds for the element with id ``sideToolbar``, then
					    saves the page under a file name derived from the page title. The
					    WebDriver session is always ended, including when saving fails.

					    Raises:
					        SystemExit: With status 1 if waiting for the page fails.
					    """
					    import sys

					    # Configure Chrome options: attach to the existing debugging port.
					    chrome_options = Options()
					    chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

					    # Path to the Chrome driver.
					    chrome_driver_path = r'C:\path\to\chromedriver.exe'  # replace with your chromedriver path

					    service = Service(chrome_driver_path)
					    driver = webdriver.Chrome(service=service, options=chrome_options)

					    try:
					        # Wait for the page to finish loading.
					        try:
					            WebDriverWait(driver, 60).until(
					                EC.presence_of_element_located((By.ID, 'sideToolbar'))
					            )
					        except Exception as e:
					            print(f"页面加载超时: {e}")
					            # Non-zero status signals the failure; the finally clause
					            # below still ends the session.
					            sys.exit(1)

					        # Use the page title as the (sanitized) output file name.
					        title_name = driver.title
					        sanitized_title_name = sanitize_filename(title_name)

					        save_lemma_content_as_markdown(driver, sanitized_title_name)
					    finally:
					        # End the WebDriver session even if saving raised.
					        driver.quit()


					if __name__ == "__main__":
					    main()
 | 
				
			||||||
		Reference in New Issue
	
	Block a user