feat: add a Markdown file processor with chunked processing support
Adds markdown_processor.py, which provides chunked processing of Markdown files. The processor splits content into chunks at level-1 headers, processes each chunk with an LLM, and saves the processed content to a new Markdown file. This is useful for large files that would otherwise exceed the LLM's input limit.
markdown_processor.py (new file, 137 lines)
@@ -0,0 +1,137 @@
import re
import argparse
import openai
from typing import List

# Configure the OpenAI client (pre-1.0 openai API; LM-Studio exposes an OpenAI-compatible endpoint)
openai.api_key = "sk-no-key-required"
openai.api_base = "http://localhost:1234/v1"  # LM-Studio default address

def read_markdown_file(file_path: str) -> str:
    """Read the contents of a Markdown file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return ""

def split_by_headers(content: str, max_length: int) -> List[str]:
    """Split content into chunks at level-1 headers (#), keeping each chunk under max_length."""
    # Find the positions of all level-1 headers
    pattern = r'^# .+$'
    headers = [(m.start(), m.group()) for m in re.finditer(pattern, content, re.MULTILINE)]

    if not headers:
        # No level-1 headers: treat the whole content as a single chunk
        return [content] if len(content) <= max_length else chunk_content(content, max_length)

    chunks = []
    current_chunk = ""

    # Walk through the headers in order
    for i, (pos, header) in enumerate(headers):
        if i == 0 and pos > 0:
            # Content between the start of the file and the first header
            current_chunk = content[:pos]

        # The section owned by the current header runs up to the next header (or end of file)
        section_end = headers[i+1][0] if i+1 < len(headers) else len(content)
        section = content[pos:section_end]

        # Check whether appending this section would exceed the maximum length
        if len(current_chunk) + len(section) <= max_length:
            current_chunk += section
        else:
            # Flush the current chunk if it is non-empty
            if current_chunk:
                chunks.append(current_chunk)

            # A single section longer than max_length must be split further
            if len(section) > max_length:
                sub_chunks = chunk_content(section, max_length)
                chunks.extend(sub_chunks)
                current_chunk = ""
            else:
                current_chunk = section

    # Append the final chunk
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def chunk_content(content: str, max_length: int) -> List[str]:
    """Split content into fixed-size chunks."""
    chunks = []
    for i in range(0, len(content), max_length):
        chunks.append(content[i:i + max_length])
    return chunks

def process_chunk_with_llm(chunk: str, model: str = "gpt-3.5-turbo") -> str:
    """Process a single chunk with the LLM."""
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Please process the following Markdown content."},
                {"role": "user", "content": chunk}
            ],
            temperature=0.7,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error processing chunk: {e}")
        return chunk  # Fall back to the original content on error

def save_markdown_file(content: str, output_path: str) -> None:
    """Save the processed Markdown content to a file."""
    try:
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(content)
        print(f"Saved processed file to: {output_path}")
    except Exception as e:
        print(f"Error saving file: {e}")

def main():
    parser = argparse.ArgumentParser(description='Process a Markdown file')
    parser.add_argument('input_file', help='Path to the input Markdown file')
    parser.add_argument('output_file', help='Path to the output Markdown file')
    parser.add_argument('--max_length', type=int, default=4000, help='Maximum length of each chunk')
    parser.add_argument('--model', default='gpt-3.5-turbo', help='Name of the LLM model to use')
    parser.add_argument('--api_base', default='http://localhost:1234/v1', help='API base URL')

    args = parser.parse_args()

    # Point the client at the requested API base URL
    openai.api_base = args.api_base

    # Read the input file
    content = read_markdown_file(args.input_file)
    if not content:
        return

    # Split into chunks
    chunks = split_by_headers(content, args.max_length)
    print(f"File split into {len(chunks)} chunks")

    # Process each chunk
    processed_chunks = []
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        processed_chunk = process_chunk_with_llm(chunk, args.model)
        processed_chunks.append(processed_chunk)

    # Join the processed chunks
    final_content = '\n'.join(processed_chunks)

    # Save the result
    save_markdown_file(final_content, args.output_file)

if __name__ == "__main__":
    main()
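A typical invocation, as a sketch (the file names here are illustrative; the defaults assume a local LM-Studio server at http://localhost:1234/v1):

python markdown_processor.py input.md output.md --max_length 4000 --model gpt-3.5-turbo

The --max_length default of 4000 characters is a rough budget for the model's context window; lower it if the model rejects chunks, and pass --api_base to target a different OpenAI-compatible endpoint.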