add parser
This commit is contained in:
parent
c761948e7b
commit
a2f89fa5f0
@ -0,0 +1,5 @@
|
|||||||
|
### 接管打开的chrome
|
||||||
|
|
||||||
|
C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe --remote-debugging-port=9222
|
||||||
|
|
||||||
|
![image-20250122083408387](http://192.168.107.248:18089/i/2025/01/22/67903d0a6c61f.png)
|
225
download2markdown - one.py
Normal file
225
download2markdown - one.py
Normal file
@ -0,0 +1,225 @@
|
|||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import html2text
|
||||||
|
import requests
|
||||||
|
#import base64
|
||||||
|
|
||||||
|
# 自定义 HTML 转换器类,继承自 html2text.HTML2Text
|
||||||
|
class CustomHTML2Text(html2text.HTML2Text):
    """html2text converter with custom handling for sub/sup and struck-through spans.

    * ``<sub>`` / ``<sup>`` are written through as inline HTML instead of
      being handed to the base converter.
    * ``<span style="text-decoration: line-through">`` is rendered as a
      Markdown ``~~...~~`` run.

    Every other tag is delegated to ``html2text.HTML2Text.handle_tag``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One entry per currently open <span>; True when that span emitted an
        # opening "~~".  html2text does not pass the start tag's attributes
        # again on the end tag, so the decision has to be remembered here to
        # be able to emit the matching closing "~~".
        self._strike_stack = []

    @staticmethod
    def _is_line_through(attrs):
        """Return True when *attrs* carries a line-through text-decoration style."""
        # ``style`` may be missing or present without a value (None).
        style = dict(attrs or {}).get('style') or ''
        # Ignore spaces so "text-decoration:line-through" and
        # "text-decoration: line-through" both match.
        return 'text-decoration:line-through' in style.replace(' ', '')

    def handle_tag(self, tag, attrs, start):
        """Emit Markdown/HTML for one tag event.

        Args:
            tag: Lower-case tag name.
            attrs: Attributes of the tag (empty/None for end tags).
            start: True for a start tag, False for an end tag.
        """
        if tag in ('sub', 'sup'):
            self.o(f'<{tag}>' if start else f'</{tag}>')
            return

        if tag == 'span':
            if start:
                struck = self._is_line_through(attrs)
                self._strike_stack.append(struck)
            else:
                # An unmatched </span> (empty stack) is treated as a plain span.
                struck = self._strike_stack.pop() if self._strike_stack else False
            if struck:
                self.o('~~')
                return

        super().handle_tag(tag, attrs, start)
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_filename(filename):
    """Return *filename* with characters that are illegal in file names removed.

    Removes ``\\ / * ? : " < > |`` and ASCII control characters (newlines,
    tabs, ...).  If nothing but whitespace remains, ``'untitled'`` is returned
    so that callers appending an extension never end up with a nameless file
    such as ``.md``.

    Args:
        filename: Proposed file name (without directory).

    Returns:
        The cleaned name, or ``'untitled'`` when the cleaned name is blank.
    """
    sanitized = re.sub(r'[\\/*?:"<>|\x00-\x1f]', '', filename)
    return sanitized if sanitized.strip() else 'untitled'
|
||||||
|
|
||||||
|
# 预处理 HTML 内容,将带有 text-decoration: line-through 样式的 span 标签替换为包含 ~~ 的文本
|
||||||
|
def process_strikethrough(element):
    """Replace line-through ``<span>`` elements under *element* with ``~~text~~``.

    Each span whose ``style`` contains ``text-decoration:line-through``
    (spaces ignored) is flattened into text fragments: stripped text nodes,
    a newline for ``<br>``, the stripped text of nested spans, and the
    stripped markup of any other child tag.  The fragments are joined with
    spaces, split on newlines, and every non-empty line is wrapped in
    ``~~``.  The span is then replaced in the tree by the re-parsed result.
    Spans that yield no fragments are left untouched.

    Args:
        element: BeautifulSoup tag whose descendants are modified in place.
    """
    def _has_line_through(style):
        return style and 'text-decoration:line-through' in style.replace(' ', '')

    for struck_span in element.find_all('span', style=_has_line_through):
        fragments = []
        for child in struck_span.contents:
            if isinstance(child, str):
                fragment = child.strip()
            elif child.name == 'br':
                # Line breaks are kept verbatim so each line gets its own ~~ pair.
                fragments.append('\n')
                continue
            elif child.name == 'span':
                # Nested spans contribute their text only.
                fragment = child.get_text().strip()
            else:
                fragment = str(child).strip()

            if fragment:
                fragments.append(fragment)

        if not fragments:
            continue

        # Wrap every non-empty line separately in strikethrough markers.
        lines = ' '.join(fragments).split('\n')
        marked = '\n'.join(f"~~{line.strip()}~~" for line in lines if line.strip())
        struck_span.replace_with(BeautifulSoup(marked, 'html.parser'))
|
||||||
|
|
||||||
|
# ...existing code...
|
||||||
|
|
||||||
|
|
||||||
|
def download_image(url, save_path):
    """Download the image at *url* and write it to *save_path*.

    URLs without a scheme are resolved against ``https://www.soujianzhu.cn``.
    The request is sent with browser-like headers, including a ``Referer``
    for that site.

    Args:
        url: Absolute, protocol-relative (``//host/path``) or site-relative
            image URL.
        save_path: Destination file path; the parent directory must exist.

    Returns:
        True when the server answered 200 and a non-empty file was written,
        False otherwise (the reason is printed).
    """
    # Request headers sent with the download.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Referer": "https://www.soujianzhu.cn/",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "Windows"
    }

    if url.startswith('//'):
        # Protocol-relative URL: only the scheme is missing.
        url = 'https:' + url
    elif not url.startswith(('http://', 'https://')):
        if url.startswith('www.soujianzhu.cn'):
            # Host is present, scheme is not.
            url = 'https://' + url
        else:
            # Site-relative path, with or without a leading slash.
            url = 'https://www.soujianzhu.cn/' + url.lstrip('/')

    try:
        # The headers must actually be sent with the request.
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code != 200:
            print(f"下载失败,状态码: {response.status_code}")
            return False

        with open(save_path, 'wb') as f:
            f.write(response.content)

        # Verify that a non-empty file really landed on disk.
        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            return True

        print("文件保存失败")
        return False

    except (requests.RequestException, OSError) as e:
        # Network failures and file-system errors are reported, not raised.
        print(f"下载图片时发生错误: {str(e)}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def save_lemma_content_as_markdown(driver, title_name):
    """Export the ``div.lemma-main-content`` of the current page as Markdown.

    Steps:
      1. Parse ``driver.page_source`` and locate the content div.
      2. Download every ``<img>`` into ``images/`` and point its ``src`` at
         the local copy.
      3. Print the title/href of every link that has both.
      4. Convert struck-through spans, then the whole div, to Markdown and
         write it to ``<title_name>.md`` (UTF-8).

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.
        title_name: Output file name without the ``.md`` extension.
    """
    from urllib.parse import urlsplit

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    content_div = soup.find('div', class_='lemma-main-content')

    if content_div is None:
        print('未找到具有指定类名的 div 元素。')
        return

    # Folder that receives the downloaded images.
    os.makedirs('images', exist_ok=True)

    # Download images and rewrite their addresses to the local copies.
    downloaded_images = []
    for img in content_div.find_all('img'):
        img_url = img.get('src')
        # Images without a src, or with inline data, cannot be downloaded.
        if not img_url or img_url.startswith('data:'):
            continue

        # Name the file after the URL path only, so a "?query" or
        # "#fragment" suffix does not end up in the file name.
        img_name = os.path.basename(urlsplit(img_url).path)
        if not img_name:
            print(f"Failed to save image {img_url}")
            continue
        img_path = os.path.join('images', img_name)

        try:
            if download_image(img_url, img_path):
                img['src'] = img_path
                downloaded_images.append((img_url, img_path))
            else:
                print(f"Failed to save image {img_url}")
        except Exception as e:
            # One broken image must not abort the whole export.
            print(f"Failed to save image {img_url}: {e}")

    # Collect and print title/href for every link that has both.
    links_info = [
        {'title': link.get('title'), 'href': link.get('href')}
        for link in content_div.find_all('a')
        if link.get('title') and link.get('href')
    ]
    for info in links_info:
        print(f"Title: {info['title']}, Href: {info['href']}")

    # Struck-through spans must be rewritten before the HTML is serialised.
    process_strikethrough(content_div)
    html_content = content_div.prettify()

    converter = CustomHTML2Text()
    converter.ignore_links = False   # keep links
    converter.ignore_images = False  # keep images
    converter.body_width = 0         # 0 disables automatic line wrapping

    markdown_content = converter.handle(html_content)

    # Replace any remaining original image URLs with the local paths.
    for original_url, new_url in downloaded_images:
        markdown_content = markdown_content.replace(original_url, new_url)

    with open(title_name + '.md', 'w', encoding='utf-8') as file:
        file.write(markdown_content)

    print(f'内容已成功保存为 Markdown 文件: {title_name}.md')
|
||||||
|
|
||||||
|
|
||||||
|
# Attach to an already running Chrome instance through its remote-debugging
# port (Chrome must have been started with --remote-debugging-port=9222).
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# Path to the chromedriver executable -- replace with your own location.
chrome_driver_path = r'C:\path\to\chromedriver.exe'

# Start the driver and take over the running browser.
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    # Wait (up to 60 s) until the element with id "sideToolbar" is present.
    try:
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.ID, 'sideToolbar'))
        )
    except Exception as e:
        print(f"页面加载超时: {e}")
        # Exit with a failure status; the finally clause still quits the driver.
        raise SystemExit(1)

    # Use the page title, stripped of illegal characters, as the file name.
    title_name = driver.title
    sanitized_title_name = sanitize_filename(title_name)

    # Save the page content as a Markdown file.
    save_lemma_content_as_markdown(driver, sanitized_title_name)
finally:
    # Always quit the driver, even when the export fails.
    driver.quit()
|
Loading…
Reference in New Issue
Block a user