import requests
import io
import os
import re
from bisect import bisect_right

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTText, LTChar, LTAnno


def parse_char_layout(layout):
    """Walk a pdfminer page layout and collect it character by character.

    pdfminer bbox convention:
        x0 - distance from the left of the page to the left edge of the box
        y0 - distance from the bottom of the page to the lower edge of the box
        x1 - distance from the left of the page to the right edge of the box
        y1 - distance from the bottom of the page to the upper edge of the box

    Returns:
        list[dict]: one ``{'x': left, 'y': top, 'char': text}`` dict per
        character, in reading order as produced by the layout analysis.
        LTAnno objects (virtual characters such as inserted spaces and line
        breaks) carry no geometry, so they are recorded at (0, 0).
    """
    char_list = []
    for textbox in layout:
        if isinstance(textbox, LTText):
            for line in textbox:
                for char in line:
                    if isinstance(char, LTAnno):
                        # LTAnno is a virtual char (space / newline) inserted
                        # by layout analysis; it has no bbox attribute.
                        char_list.append({
                            'x': char.bbox[0] if hasattr(char, 'bbox') else 0,
                            'y': char.bbox[3] if hasattr(char, 'bbox') else 0,
                            'char': char.get_text(),
                        })
                    elif isinstance(char, LTChar):
                        char_list.append({
                            'x': char.bbox[0],
                            'y': char.bbox[3],
                            'char': char.get_text(),
                        })
    return char_list


def normalize_text(text):
    """Strip the text, then collapse every whitespace run to a single space."""
    return re.sub(r'\s+', ' ', text.strip())


def _normalize_chars(char_list):
    """Normalize the concatenated characters, keeping an index map.

    Applies the same transformation as :func:`normalize_text` (strip the
    ends, collapse internal whitespace runs to one space) but works on the
    per-character list, so that every position in the normalized string can
    be traced back to the entry of ``char_list`` that produced it.

    This mapping is essential for coordinate lookup: a hit position found in
    the *normalized* text is generally NOT a valid index into ``char_list``,
    because normalization removes characters (line breaks, repeated spaces).

    Args:
        char_list (list[dict]): output of :func:`parse_char_layout`.

    Returns:
        tuple[str, list[int]]: ``(normalized_text, index_map)`` where
        ``index_map[i]`` is the ``char_list`` index behind
        ``normalized_text[i]``.
    """
    pieces = []
    index_map = []
    prev_space = True  # start as True so leading whitespace is dropped
    for idx, info in enumerate(char_list):
        ch = info['char']
        if ch.isspace():
            if prev_space:
                continue  # collapse runs / drop leading whitespace
            pieces.append(' ')
            index_map.append(idx)
            prev_space = True
        else:
            pieces.append(ch)
            index_map.append(idx)
            prev_space = False
    if pieces and pieces[-1] == ' ':
        # Mirror str.strip(): drop the trailing collapsed space.
        pieces.pop()
        index_map.pop()
    return ''.join(pieces), index_map


def find_text_positions(pdf_path, target_text):
    """Search the whole PDF for ``target_text`` and return its coordinates.

    The target and the extracted page text are whitespace-normalized before
    matching, so the target may span line breaks in the PDF. Every
    (possibly overlapping) occurrence is reported.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to look for

    Returns:
        list: one dict per match with keys
        ``page, text, start_x, start_y, end_x, end_y`` (pdfminer page
        coordinates: x from the left edge, y from the bottom edge).

    Raises:
        FileNotFoundError: if ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    normalized_target = normalize_text(target_text)
    if not normalized_target:
        # ''.find('') matches at every offset and would loop forever below.
        return []

    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        all_chars = []           # characters of every page, concatenated in order
        page_start_indices = []  # index into all_chars where each page begins

        for page in PDFPage.create_pages(doc):
            page_start_indices.append(len(all_chars))
            interpreter.process_page(page)
            all_chars.extend(parse_char_layout(device.get_result()))

    # Normalize with an index map so a hit position in the normalized text
    # can be translated back to a real character (and its coordinates).
    normalized_full_text, index_map = _normalize_chars(all_chars)

    found_positions = []
    start = 0
    while True:
        pos = normalized_full_text.find(normalized_target, start)
        if pos == -1:
            break

        first_idx = index_map[pos]
        last_idx = index_map[pos + len(normalized_target) - 1]
        start_char = all_chars[first_idx]
        end_char = all_chars[last_idx]

        # bisect_right over the page start offsets yields the 1-based page
        # number of the character that starts the match.
        page_num = bisect_right(page_start_indices, first_idx)

        found_positions.append({
            'page': page_num,
            'text': normalized_target,
            'start_x': start_char['x'],
            'start_y': start_char['y'],
            'end_x': end_char['x'],
            'end_y': end_char['y'],
        })

        start = pos + 1

    return found_positions


def find_text_in_pdf_per_page(pdf_path, target_text):
    """Search the PDF page by page for ``target_text``.

    Unlike :func:`find_text_positions`, a match that spans a page boundary
    is not found, and at most one match (the first) is reported per page.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to look for

    Returns:
        list: one dict per matching page with keys
        ``page, text, start_x, start_y, end_x, end_y``.

    Raises:
        FileNotFoundError: if ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    normalized_target = normalize_text(target_text)
    if not normalized_target:
        return []

    found_positions = []

    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            char_list = parse_char_layout(device.get_result())

            # Per-page normalization with an index map back to char_list.
            normalized_page_text, index_map = _normalize_chars(char_list)

            pos = normalized_page_text.find(normalized_target)
            if pos == -1:
                continue

            start_char = char_list[index_map[pos]]
            end_char = char_list[index_map[pos + len(normalized_target) - 1]]

            found_positions.append({
                'page': page_num,
                'text': normalized_target,
                'start_x': start_char['x'],
                'start_y': start_char['y'],
                'end_x': end_char['x'],
                'end_y': end_char['y'],
            })

    return found_positions


if __name__ == '__main__':
    # Use a local PDF file (adjust the path to your own file).
    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'
    target_text = '''执行方式:
• 在当前 chat 中,已有上下文,但可能混乱
• 新开一个 chat,干净的上下文,需要填充'''

    try:
        print("方法1:全文搜索")
        positions = find_text_positions(pdf_file_path, target_text)
        if positions:
            print(f"找到文本在以下位置:")
            for pos in positions:
                print(f"页面: {pos['page']}, 起始坐标: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), 结束坐标: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
        else:
            print("未找到文本")

        print("\n方法2:逐页搜索")
        positions = find_text_in_pdf_per_page(pdf_file_path, target_text)
        if positions:
            print(f"找到文本在以下位置:")
            for pos in positions:
                print(f"页面: {pos['page']}, 起始坐标: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), 结束坐标: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
        else:
            print("未找到文本")

    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        print(f"处理PDF时出错: {e}")