"""Save the article shown in an already-running Chrome tab as Markdown.

The script attaches to a Chrome instance that was started with remote
debugging enabled, for example::

    chrome.exe --remote-debugging-port=9222

It converts the ``lemma-main-content`` element of the current page to
Markdown and stores the images it references next to the output file.
"""

# Provenance: reconstructed from patch a2f89fa5f011629bdfd4c48581fcb593106e3973
# ("add parser", 2025-01-22), which added "download2markdown - one.py".

import os
import random
import re
import time

import html2text
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Characters that are removed from file names (illegal on Windows).
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|]')


class CustomHTML2Text(html2text.HTML2Text):
    """html2text converter that keeps sub/superscripts and strikethrough.

    * ``<sub>`` / ``<sup>`` are passed through as inline HTML, because
      Markdown has no native syntax for them.
    * ``<span style="text-decoration: line-through">`` is rendered as
      ``~~text~~``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One flag per currently open <span>: True when that span is a
        # strikethrough span. End tags arrive without attributes, so this is
        # the only way to know whether a closing </span> needs a "~~".
        self._strike_stack = []

    def handle_tag(self, tag, attrs, start):
        """Emit custom output for sub/sup/strikethrough, else defer to html2text.

        Args:
            tag: Lower-case tag name.
            attrs: Attributes of the tag (empty for end tags).
            start: True for an opening tag, False for a closing tag.
        """
        if tag in ('sub', 'sup'):
            self.o(f'<{tag}>' if start else f'</{tag}>')
            return

        if tag == 'span':
            if start:
                style = (dict(attrs) if attrs else {}).get('style') or ''
                # Ignore spaces so "text-decoration:line-through" and
                # "text-decoration: line-through" are both recognised.
                is_strike = 'text-decoration:line-through' in style.replace(' ', '')
                self._strike_stack.append(is_strike)
            else:
                is_strike = self._strike_stack.pop() if self._strike_stack else False
            if is_strike:
                self.o('~~')
                return

        super().handle_tag(tag, attrs, start)


def sanitize_filename(filename):
    """Return *filename* with the characters ``\\ / * ? : " < > |`` removed."""
    return _ILLEGAL_FILENAME_CHARS.sub('', filename)


def process_strikethrough(element):
    """Replace strikethrough spans inside *element* with ``~~text~~`` in place.

    Every ``<span>`` whose style contains ``text-decoration: line-through``
    (spacing ignored) is replaced by its content wrapped in ``~~``. A ``<br>``
    inside the span splits the content, and each resulting line is wrapped
    separately. Spans without any non-blank content are left untouched.

    Args:
        element: BeautifulSoup tag that is searched and modified in place.
    """
    from html import escape

    def is_strike_style(style):
        return bool(style) and 'text-decoration:line-through' in style.replace(' ', '')

    for span in element.find_all('span', style=is_strike_style):
        pieces = []
        for child in span.contents:
            if isinstance(child, str):
                text = child.strip()
                if text:
                    # The pieces are re-parsed as HTML below, so plain text
                    # must be escaped or "<" / "&" would be misread as markup.
                    pieces.append(escape(text, quote=False))
            elif child.name == 'br':
                pieces.append('\n')
            elif child.name == 'span':
                # Nested spans are flattened to their text.
                text = child.get_text().strip()
                if text:
                    pieces.append(escape(text, quote=False))
            else:
                # Any other tag is kept as markup.
                markup = str(child).strip()
                if markup:
                    pieces.append(markup)

        if not pieces:
            continue

        lines = (line.strip() for line in ' '.join(pieces).split('\n'))
        struck = '\n'.join(f'~~{line}~~' for line in lines if line)
        span.replace_with(BeautifulSoup(struck, 'html.parser'))
# Site that relative image URLs are resolved against.
IMAGE_BASE_URL = 'https://www.soujianzhu.cn/'


def resolve_image_url(url, base_url=IMAGE_BASE_URL):
    """Return an absolute http(s) URL for *url*.

    Handles absolute URLs (returned unchanged), protocol-relative URLs
    (``//host/path``), host-prefixed URLs without a scheme
    (``www.soujianzhu.cn/path``) and site-relative paths.
    """
    from urllib.parse import urljoin, urlsplit

    url = url.strip()
    if url.startswith(('http://', 'https://')):
        return url
    base = urlsplit(base_url)
    if url.startswith(base.netloc):
        return f'{base.scheme}://{url}'
    return urljoin(base_url, url)


def image_file_name(img_url, index):
    """Derive a safe local file name for *img_url*.

    Only the last path segment is used (query string and fragment are
    ignored), percent-escapes are decoded and characters that are illegal in
    file names are removed. Falls back to ``image_<index>`` when nothing
    usable remains.
    """
    from urllib.parse import unquote, urlsplit

    last_segment = urlsplit(img_url).path.rsplit('/', 1)[-1]
    name = re.sub(r'[\\/*?:"<>|]', '', unquote(last_segment)).strip()
    return name or f'image_{index}'


def download_image(url, save_path):
    """Download the image at *url* to *save_path*.

    Args:
        url: Absolute or site-relative image URL.
        save_path: File path the image bytes are written to.

    Returns:
        True when a non-empty image was written, otherwise False. Failures
        are reported on stdout and never raised.
    """
    # Request headers that make the request look like it comes from the site.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Referer": "https://www.soujianzhu.cn/",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "Windows"
    }
    url = resolve_image_url(url)
    try:
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code != 200:
            print(f"下载失败,状态码: {response.status_code}")
            return False

        # Refuse empty bodies before touching the disk so no zero-byte file
        # is left behind.
        if not response.content:
            print("文件保存失败")
            return False

        with open(save_path, 'wb') as f:
            f.write(response.content)
        return True

    except (requests.RequestException, OSError) as e:
        print(f"下载图片时发生错误: {str(e)}")
        return False


def save_lemma_content_as_markdown(driver, title_name):
    """Write the page's ``lemma-main-content`` div to ``<title_name>.md``.

    Images referenced by the content are downloaded into ``images/`` and the
    Markdown is rewritten to point at the local copies. Links that carry both
    a title and an href are printed. Nothing is written when the div is
    missing.

    Args:
        driver: Selenium WebDriver whose current page is converted.
        title_name: Output file name without the ``.md`` extension.
    """
    page_html = driver.page_source
    soup = BeautifulSoup(page_html, 'html.parser')
    content_div = soup.find('div', class_='lemma-main-content')

    if not content_div:
        print('未找到具有指定类名的 div 元素。')
        return

    os.makedirs('images', exist_ok=True)

    # Download images and point each <img> at its local copy.
    local_paths = {}  # original src -> path used inside the Markdown file
    used_names = set()
    for index, img in enumerate(content_div.find_all('img')):
        img_url = img.get('src')
        if not img_url:
            # e.g. lazy-loaded images that only carry data-* attributes
            continue
        if img_url in local_paths:
            img['src'] = local_paths[img_url]
            continue

        img_name = image_file_name(img_url, index)
        if img_name in used_names:
            # Different URLs with the same file name must not overwrite
            # each other.
            img_name = f'{index}_{img_name}'

        try:
            saved = download_image(img_url, os.path.join('images', img_name))
        except Exception as e:
            # Best effort: one broken image must not abort the export.
            print(f"Failed to save image {img_url}: {e}")
            continue
        if not saved:
            print(f"Failed to save image {img_url}")
            continue

        used_names.add(img_name)
        # Markdown paths use forward slashes on every platform.
        markdown_path = f'images/{img_name}'
        img['src'] = markdown_path
        local_paths[img_url] = markdown_path

    # Report every link that has both a title and a target.
    for link in content_div.find_all('a'):
        title = link.get('title')
        href = link.get('href')
        if title and href:
            print(f"Title: {title}, Href: {href}")

    # Strikethrough spans must be rewritten before the HTML is serialised.
    process_strikethrough(content_div)

    # str() instead of prettify(): prettify() inserts newlines around inline
    # tags, which shows up as stray spaces in the converted text.
    html_content = str(content_div)

    converter = CustomHTML2Text()
    converter.ignore_links = False  # keep links
    converter.ignore_images = False  # keep images
    converter.body_width = 0  # 0 disables automatic line wrapping

    markdown_content = converter.handle(html_content)

    # Remaining references to a downloaded image (e.g. a link around it)
    # are redirected to the local copy as well.
    for original_url, markdown_path in local_paths.items():
        markdown_content = markdown_content.replace(original_url, markdown_path)

    with open(title_name + '.md', 'w', encoding='utf-8') as file:
        file.write(markdown_content)

    print(f'内容已成功保存为 Markdown 文件: {title_name}.md')


# Path of the chromedriver executable; replace with your own location.
chrome_driver_path = r'C:\path\to\chromedriver.exe'


def main():
    """Attach to the running Chrome instance and export the current page."""
    import sys

    # Take over the Chrome instance listening on the remote-debugging port.
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Wait until the page has finished rendering.
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.ID, 'sideToolbar'))
            )
        except Exception as e:
            print(f"页面加载超时: {e}")
            sys.exit(1)

        # An empty title would otherwise produce a file named just ".md".
        title_name = sanitize_filename(driver.title) or 'untitled'
        save_lemma_content_as_markdown(driver, title_name)
    finally:
        # Always release the driver session, including on errors and exit.
        driver.quit()


if __name__ == "__main__":
    main()