处理lazy加载的图片，上传图片的markdown更新为新文件

2025-01-26 16:49:16 +08:00
parent 4ffeb3ef50
commit 3df14930e4
2 changed files with 42 additions and 38 deletions
--- a/dialogue_download_change.py
+++ b/dialogue_download_change.py
@@ -62,6 +62,7 @@ def download_images():
    url_mapping = {}  # 存储URL到本地路径的映射

    for url in image_urls:
+
        try:
            response = requests.get(url)
            if response.status_code == 200:
@@ -130,7 +131,9 @@ def upload_images():

    for url in image_urls:
        try:
-            # 下载图片
+            if not url.startswith(('http://', 'https://')):
+                if 'image.lqsjy.cn' not in url:            
+            

                
                    # 上传到图床
@@ -151,8 +154,12 @@ def upload_images():
        for old_url, new_url in url_mapping.items():
            new_content = new_content.replace(old_url, new_url)
        
-        # 保存更新后的文件
-        with open(markdown_file_path, 'w', encoding='utf-8') as file:
+        # 构造新的文件名
+        file_name, file_ext = os.path.splitext(markdown_file_path)
+        new_file_path = f"{file_name}_lsky{file_ext}"
+        
+        # 保存为新文件
+        with open(new_file_path, 'w', encoding='utf-8') as file:
            file.write(new_content)
        
        messagebox.showinfo("完成", f"成功处理 {len(url_mapping)} 张图片")
--- a/one.py
+++ b/one.py
@@ -12,7 +12,7 @@ import os
 import re
 import html2text
 import requests
-#import base64
+import base64

 # 自定义 HTML 转换器类，继承自 html2text.HTML2Text
 class CustomHTML2Text(html2text.HTML2Text):
@@ -79,7 +79,10 @@ def process_strikethrough(element):


 def download_image(url, save_path):
-        # 设置请求头
+
+
+    
+    # 处理普通URL图片
    headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
                "Referer": "https://www.soujianzhu.cn/",
@@ -87,31 +90,20 @@ def download_image(url, save_path):
                "Sec-Ch-Ua-Mobile": "?0",
                "Sec-Ch-Ua-Platform": "Windows"
                }
+    
    if not url.startswith(('http://', 'https://')):
        if 'www.soujianzhu.cn' not in url:
            url = 'https://www.soujianzhu.cn' + url
+            
    try:
-        # 发送HTTP请求下载图片
-        response = requests.get(url, timeout=30)
-        
-        # 检查响应状态码
-        if response.status_code != 200:
-            print(f"下载失败，状态码: {response.status_code}")
-            return False
-            
-        # 将图片内容写入文件
+        response = requests.get(url, headers=headers, timeout=30)
+        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
-            
-        # 验证文件是否成功保存
-        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            return True
-        else:
-            print("文件保存失败")
-            return False
-            
    except Exception as e:
-        print(f"下载图片时发生错误: {str(e)}")
+        print(f"下载图片失败: {url}\n错误: {str(e)}")
+        return False
    return False


@@ -135,7 +127,12 @@ def save_lemma_content_as_markdown(driver, title_name):
        # 下载图片并更新图片地址
        downloaded_images = []
        for img in content_div.find_all('img'):
-            img_url = img['src']
+            img_url = img.get('data-original') or img.get('src')
+            
+            # 确保 img_url 不为空
+            if not img_url:
+                print(f"找不到图片URL: {img}")
+                continue
            img_name = os.path.basename(img_url)
            img_path = os.path.join('images', img_name)
            try: