add parser
This commit is contained in:
		| @@ -0,0 +1,5 @@ | ||||
| ### 接管打开的chrome | ||||
|  | ||||
| C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe --remote-debugging-port=9222 | ||||
|  | ||||
|  | ||||
							
								
								
									
										225
									
								
								download2markdown - one.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										225
									
								
								download2markdown - one.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,225 @@ | ||||
| from selenium import webdriver | ||||
| from selenium.webdriver.chrome.service import Service | ||||
| from selenium.webdriver.chrome.options import Options | ||||
| from selenium.webdriver.common.by import By | ||||
| from selenium.webdriver.support.ui import WebDriverWait | ||||
| from selenium.webdriver.support import expected_conditions as EC | ||||
| from selenium.webdriver.common.action_chains import ActionChains | ||||
| import random | ||||
| import time | ||||
| from bs4 import BeautifulSoup | ||||
| import os | ||||
| import re | ||||
| import html2text | ||||
| import requests | ||||
| #import base64 | ||||
|  | ||||
# Custom HTML-to-Markdown converter: keeps <sub>/<sup> as raw HTML tags
# (Markdown has no native sub/superscript) and emits Markdown ~~strikethrough~~
# for spans styled with text-decoration: line-through.
class CustomHTML2Text(html2text.HTML2Text):
    def handle_tag(self, tag, attrs, start):
        """Override tag handling for sub/sup and struck-through spans.

        tag/attrs/start follow the html2text.HTML2Text.handle_tag contract;
        anything we do not special-case is delegated to the base class.
        """
        if tag == 'sub':
            # Pass the HTML tag through verbatim.
            self.o(r'<sub>' if start else r'</sub>')
        elif tag == 'sup':
            self.o(r'<sup>' if start else r'</sup>')
        elif tag == 'span' and attrs:
            style = dict(attrs).get('style', '')
            # BUG FIX: normalize whitespace so both "line-through" and
            # "line- through"-style spellings match, consistent with
            # process_strikethrough() elsewhere in this file (the old check
            # required exactly 'text-decoration: line-through' with a space).
            if 'text-decoration:line-through' in style.replace(' ', ''):
                print(f"Detected line-through span: {attrs}")  # debug output
                # The same ~~ delimiter both opens and closes strikethrough.
                self.o('~~')
            else:
                super().handle_tag(tag, attrs, start)
        else:
            super().handle_tag(tag, attrs, start)
|  | ||||
|  | ||||
def sanitize_filename(filename):
    """Return *filename* with characters that are illegal in Windows file names removed."""
    forbidden = '\\/*?:"<>|'
    return filename.translate(str.maketrans('', '', forbidden))
|  | ||||
# Pre-process the HTML tree: replace every <span> styled with
# text-decoration: line-through by plain text wrapped in Markdown ~~ markers.
def process_strikethrough(element):
    """Rewrite struck-through spans inside *element* (the tree is mutated in place)."""

    def _has_line_through(style):
        # Normalize whitespace so the property matches with or without spaces.
        return bool(style) and 'text-decoration:line-through' in style.replace(' ', '')

    for span in element.find_all('span', style=_has_line_through):
        # Collect the span's textual pieces; <br> becomes a '\n' marker.
        fragments = []
        for node in span.contents:
            if isinstance(node, str):
                text = node.strip()
                if text:
                    fragments.append(text)
            elif node.name == 'br':
                fragments.append('\n')
            elif node.name == 'span':
                # Flatten nested spans to their text content.
                nested = node.get_text().strip()
                if nested:
                    fragments.append(nested)
            else:
                raw = str(node).strip()
                if raw:
                    fragments.append(raw)

        if not fragments:
            continue

        # Strike each line separately: join the pieces, split on the <br>
        # markers, and wrap every non-empty line in ~~...~~.
        lines = [seg.strip() for seg in ' '.join(fragments).split('\n') if seg.strip()]
        markdown = '\n'.join(f"~~{seg}~~" for seg in lines)
        span.replace_with(BeautifulSoup(markdown, 'html.parser'))
|  | ||||
| # ...existing code... | ||||
|  | ||||
|  | ||||
def download_image(url, save_path):
    """Download the image at *url* to *save_path*.

    Returns True when the file was written and is non-empty, False otherwise.
    All failures are reported via print and swallowed (best-effort download).
    """
    # Browser-like headers: the image host appears to reject requests
    # without a matching Referer / User-Agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Referer": "https://www.soujianzhu.cn/",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "Windows"
    }
    # Normalize relative and protocol-relative URLs to absolute ones.
    if url.startswith('//'):
        # BUG FIX: protocol-relative URLs were previously left without a
        # scheme, so requests.get() would fail on them.
        url = 'https:' + url
    elif not url.startswith(('http://', 'https://')):
        if 'www.soujianzhu.cn' not in url:
            url = 'https://www.soujianzhu.cn' + url
    try:
        # BUG FIX: the headers dict was built but never sent; pass it so the
        # anti-hotlinking Referer/User-Agent actually reach the server.
        response = requests.get(url, headers=headers, timeout=30)

        # Anything other than 200 is treated as a failed download.
        if response.status_code != 200:
            print(f"下载失败,状态码: {response.status_code}")
            return False

        # Write the image bytes to disk.
        with open(save_path, 'wb') as f:
            f.write(response.content)

        # Verify the file landed and is non-empty.
        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            return True
        print("文件保存失败")
        return False

    except Exception as e:
        print(f"下载图片时发生错误: {str(e)}")
        return False
|  | ||||
|  | ||||
|  | ||||
def save_lemma_content_as_markdown(driver, title_name):
    """Convert the lemma page currently loaded in *driver* to a Markdown file.

    driver: a selenium WebDriver whose current page is already rendered.
    title_name: sanitized file name (without extension) for the output .md.
    Images are mirrored into ./images/ and the Markdown references the local copies.
    """
    # Grab the JavaScript-rendered HTML, not the raw server response.
    html = driver.page_source

    soup = BeautifulSoup(html, 'html.parser')

    # The article body lives in <div class="lemma-main-content">.
    content_div = soup.find('div', class_='lemma-main-content')

    if content_div:
        # Folder for locally mirrored images.
        os.makedirs('images', exist_ok=True)

        # Download every image and repoint its src at the local copy.
        downloaded_images = []
        for img in content_div.find_all('img'):
            # BUG FIX: use .get() so an <img> without a src attribute is
            # skipped instead of raising an uncaught KeyError.
            img_url = img.get('src')
            if not img_url:
                continue
            img_name = os.path.basename(img_url)
            img_path = os.path.join('images', img_name)
            try:
                if download_image(img_url, img_path):
                    img['src'] = img_path
                    downloaded_images.append((img_url, img_path))
                else:
                    print(f"Failed to save image {img_url}")
            except Exception as e:
                print(f"Failed to save image {img_url}: {e}")

        # Collect and log every link (title + href) found in the article.
        links = content_div.find_all('a')
        links_info = []
        for link in links:
            title = link.get('title')
            href = link.get('href')
            if title and href:
                links_info.append({'title': title, 'href': href})
        for info in links_info:
            print(f"Title: {info['title']}, Href: {info['href']}")

        # Rewrite struck-through spans before handing the HTML to html2text.
        process_strikethrough(content_div)

        html_content = content_div.prettify()

        # html2text converter tuned to keep links/images and avoid hard wrapping.
        converter = CustomHTML2Text()
        converter.ignore_links = False   # keep hyperlinks
        converter.ignore_images = False  # keep image references
        converter.body_width = 0         # 0 disables automatic line wrapping

        markdown_content = converter.handle(html_content)

        # Point any remaining URL references at the downloaded local files.
        for original_url, new_url in downloaded_images:
            markdown_content = markdown_content.replace(original_url, new_url)

        with open(title_name + '.md', 'w', encoding='utf-8') as file:
            file.write(markdown_content)

        print(f'内容已成功保存为 Markdown 文件: {title_name}.md')
    else:
        print('未找到具有指定类名的 div 元素。')
|  | ||||
|  | ||||
# --- Script entry: attach to an already-running Chrome instance. ---
# Chrome must have been started with --remote-debugging-port=9222
# (see the note at the top of this repository's README).
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# Path to the chromedriver binary.
chrome_driver_path = r'C:\path\to\chromedriver.exe'  # replace with your chromedriver path

# Launch the driver and take over the running browser instance.
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Wait up to 60s for the page to finish loading (the side toolbar appears).
try:
    WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.ID, 'sideToolbar'))
    )
except Exception as e:
    print(f"页面加载超时: {e}")
    driver.quit()
    exit()  # NOTE(review): sys.exit() would be safer than the site builtin exit()

# Derive a legal file name from the page title.
title_name = driver.title
sanitized_title_name = sanitize_filename(title_name)

# Save the page content as a Markdown file.
save_lemma_content_as_markdown(driver, sanitized_title_name)

# Close the browser session.
driver.quit()
		Reference in New Issue
	
	Block a user