From 866fa22461f929fa977fc7d8a629836d9f905da8 Mon Sep 17 00:00:00 2001 From: jdysya <1912377458@qq.com> Date: Thu, 27 Mar 2025 18:37:31 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=80=E4=BA=9B=E9=A2=84?= =?UTF-8?q?=E5=A4=84=E7=90=86=E7=9A=84=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- script/draw_relation.py | 217 ++++++++++++++ script/extract_images.py | 44 +++ script/pdf_text_extract.py | 482 +++++++++++++++++++++++++++++++ script/pdf_whole_text_extract.py | 43 +++ 4 files changed, 786 insertions(+) create mode 100644 script/draw_relation.py create mode 100644 script/extract_images.py create mode 100644 script/pdf_text_extract.py create mode 100644 script/pdf_whole_text_extract.py diff --git a/script/draw_relation.py b/script/draw_relation.py new file mode 100644 index 0000000..ccc2a52 --- /dev/null +++ b/script/draw_relation.py @@ -0,0 +1,217 @@ +from graphviz import Digraph + +""" +绘制家族树 +""" + +# 创建有向图 +dot = Digraph(comment='家族树', node_attr={'fontname': 'WenQuanYi Zen Hei'}, + edge_attr={'fontname': 'WenQuanYi Zen Hei'}) +dot.attr(rankdir='TB', dpi='300') + +# 重名 +dot.node('1', "发昌") +dot.node('2', "发昌") +dot.node('3', "长松") +dot.node('4', "长松") + +# 女性人物 +female_nodes = ["德萍", "长香", "发双", "发娜", "发雅", "发桢", "发雯", "发莲", "发花", "发宇", "发洵", "其晞", "其园", + "其楠", "其晴", "其菲","长婷","长欢","长娟","长怡","发兰"] + +# 定义父子关系 +parent_child_relations = [ + ('可观', '光友'), + ('光友', '世法'), + ('世法', '道胜'), ('世法', '道启'), ('世法', '道佑'), + ('道佑', '尚铎'), ('道佑', '尚助'), ('道佑', '尚达'), ('道佑', '尚朝'), ('道佑', '尚国'), + ('尚铎', '士进'), ('尚铎', '士忠'), ('尚铎', '士富'), + ('尚助', '士元'), ('尚助', '士怀'), ('尚助', '士纲'), ('尚助', '士位'), ('尚助', '士恺'), + ('尚达', '士贵'), + ('尚朝', '士恭'), + ('尚国', '士宽'), ('尚国', '士信'), ('尚国', '士敏'), ('尚国', '士惠'), + ('士进', '邦龙'), ('士进', '邦乐'), + ('士忠', '邦志'), ('士忠', '邦正'), + ('士富', '邦秀'), + ('士怀', '邦朝'), ('士怀', '邦田'), ('士怀', '邦奇'), ('士怀', '邦学'), ('士怀', '邦强'), + ('士纲', '邦彦'), ('士纲', '邦清'), + ('士位', '邦宗'), + ('士贵', '邦国'), ('士贵', '邦胜'), + ('士恭', '邦应'), ('士恭', '邦全'), + ('士信', '邦和'), ('士信', '邦云'), + ('士敏', '邦元'), + ('士惠', '邦霖'), + ('邦乐', '载万'), ('邦乐', '载有'), ('邦乐', '载彦'), + ('邦秀', '载禄'), ('邦秀', '载厚'), ('邦秀', '载元'), + ('邦田', '载祥'), ('邦田', '载春'), + ('邦彦', '载阳'), + ('邦国', '载兴'), ('邦国', '载清'), ('邦国', '载荣'), ('邦国', '载祜'), + ('邦胜', '载赓'), ('邦胜', '载歌'), + ('邦应', '载福'), ('邦应', '载寿'), ('邦应', '载恩'), + ('邦全', '载仁'), + ('邦和', '载道'), ('邦和', '载富'), + ('邦元', '载德'), ('邦元', '载义'), ('邦元', '载礼'), + ('邦霖', '载让'), ('邦霖', '载成'), ('邦霖', '载宝'), + ('载彦', '元年'), ('载彦', '元春'), ('载彦', '元长'), + ('载阳', '元泰'), ('载阳', '元唱'), ('载阳', '元庆'), ('载阳', '元魁'), ('载阳', '元丰'), + ('载清', '元炳'), ('载清', '元济'), ('载清', '元勋'), ('载清', '元功'), ('载清', '元修'), + ('载荣', '元鼎'), + ('载荣', '元钟'), + ('载荣', '元彛'), + ('载荣', '元琛'), + ('载祜', '元抡'), ('载祜', '元会'), ('载祜', '元松'), + ('载赓', '元良'), + ('载歌', '元恺'), ('载歌', '元选'), ('载歌', '元举'), ('载歌', '元宪'), + ('载福', '元臣'), ('载福', '元定'), ('载福', '元顺'), + ('载恩', '元凤'), ('载恩', '元龙'), + ('载仁', '元勤'), ('载仁', '元俭'), ('载仁', '元学'), + ('载道', '元端'), ('载道', '元章'), + ('载让', '元甫'), + ('元春', '善全'), + ('元唱', '善怀'), ('元唱', '善元'), + ('元庆', '善良'), ('元庆', '善和'), + ('元丰', '善琪'), + ('元炳', '善辅'), ('元炳', '善举'), ('元炳', '善传'), ('元炳', '善富'), + ('元勋', '善宏'), + ('元修', '善行'), ('元修', '善言'), + ('元抡', '善彰'), ('元抡', '善扬'), + ('元会', '善藏'), ('元会', '善身'), ('元会', '善维'), ('元会', '善宝'), + ('元良', '善述'), + ('元恺', '善荫'), + ('元举', '善炽'), ('元举', '善祯'), ('元举', '善进'), ('元举', '善从'), ('元举', '善持'), + ('元臣', '善道'), ('元臣', '善为'), ('元臣', '善庆'), ('元臣', '善保'), ('元臣', '善守'), + ('元学', '善祥'), ('元学', '善士'), + ('善全', '德昌'), + ('善怀', '德大'), ('善怀', '德恒'), + ('善元', '德安'), ('善元', '德溥'), + ('善良', '德太'), ('善良', '德定'), ('善良', '德友'), ('善良', '德仁'), + ('善和', '德文'), ('善和', '德武'), + ('善琪', '德金'), ('善琪', '德艳'), ('善琪', '德映'), + ('善辅', '德远'), ('善辅', '德麟'), + ('善传', '德长'), + ('善富', '德兴'), ('善富', '德海'), ('善富', '德怀'), ('善富', '德周'), ('善富', '德齐'), + ('善宏', '德政'), + ('善行', '德明'), ('善行', '德辅'), + ('善言', '德庆'), ('善言', '德广'), ('善言', '德林'), + ('善扬', '德敬'), ('善扬', '德钰'), + ('善身', '德备'), ('善身', '德胜'), + ('善维', '德峰'), ('善维', '德萍'), + ('善宝', '德忠'), ('善宝', '德成'), + ('善述', '德聘'), ('善述', '德炎'), ('善述', '德种'), + ('善进', '德元'), ('善进', '德利'), + ('善持', '德福'), ('善持', '德泳'), + ('德昌', '长辉'), ('德昌', '长怡'), + ('德安', '长江'), ('德安', '长恩'), ('德安', '长有'), + ('德定', '长泳'), + ('德仁', '长明'), ('德仁', '长伦'), + ('德文', '长元'), ('德文', '长绵'), ('德文', '长新'), ('德文', '长生'), + ('德艳', '长益'), + ('德映', '长威'), ('德映', '长翔'), + ('德远', '3'), ('德远', '长青'), ('德远', '长学'), + ('德兴', '长存'), ('德兴', '长友'), ('德兴', '长胜'), + ('德海', '长文'), ('德海', '长峰'), + ('德庆', '长润'), ('德庆', '长柏'), + ('德广', '长华'), ('德广', '长国'), + ('德林', '长旺'), ('德林', '长平'), + ('德敬', '4'), ('德敬', '长征'), + ('德胜', '长宏'), ('德胜', '长香'), + ('德峰', '长婷'), + ('德萍', '长伟'), + ('德忠', '长磊'), ('德忠', '长欢'), + ('德成', '长乐'), ('德成', '长娟'), + ('德炎', '长福'), ('德炎', '长贵'), ('德炎', '长安'), ('德炎', '长寿'), + ('德种', '长林'), + ('德元', '长刚'), ('德元', '长龙'), + ('长江', '发双'), + ('长有', '发逸'), ('长有', '发娜'), + ('长泳', '发瑞'), ('长泳', '发雅'), + ('长明', '发东'), + ('长绵', '发旺'), ('长绵', '发辉'), + ('长新', '发洋'), + ('长生', '发桢'), + ('长威', '发雯'), + ('长青', '发良'), ('长青', '发莲'), + ('长友', '发家'), ('长友', '发花'), + ('长胜', '发传'), ('长胜', '发涛'), + ('长润', '发翔'), + ('长柏', '发波'), + ('长华', '发松'), + ('长旺', '1'), + ('长平', '发鹏'), ('长平', '发磊'), + ('长香', '2'), ('长香', '发广'), ('长香', '发宇'), + ('长伟', '发洵'), + ('长福', '发启'), ('长福', '发明'), + ('长贵', '发文'), ('长贵', '发祥'), + ('长寿', '发平'), + ('长林', '发義'), ('长林', '发兰'), + ('发良', '其非'), ('发良', '其凡'), + ('发翔', '其晞'), + ('发文', '其旺'), ('发文', '其园'), + ('发祥', '其达'), ('发祥', '其楠'), + ('发平', '其红'), ('发平', '其晴'), ('发平', '其菲'), + ('发義', '其昊') +] + +# 定义过继关系 +adoption_relations = [ + ('士宽', '邦云'), + ('邦朝', '载祥'), + ('邦奇', '载祥'), + ('邦学', '载春'), + ('邦强', '载春'), + ('邦清', '载阳'), + ('邦宗', '载阳'), + ('邦宗', '元魁'), + ('邦清', '元庆'), + ('邦云', '载富'), + ('载有', '元俭'), + ('载禄', '元俭'), + ('载元', '元俭'), + ('载祥', '元丰'), + ('载春', '元丰'), + ('载兴', '元济'), + ('载富', '元章'), + ('元俭', '善和'), + ('元济', '善传'), + ('元鼎', '善言'), + ('元钟', '善言'), + ('元宪', '善祯'), + ('元顺', '善为'), + ('元松', '善宝'), + ('善举', '德兴'), + ('善彰', '德敬'), + ('善荫', '德种'), + ('德武', '长新'), + ('德政', '长青'), + ('德齐', '长胜') +] + +# 添加节点和父子关系边 +# for parent, child in parent_child_relations: +# dot.node(parent) +# dot.node(child) +# dot.edge(parent, child) + +# 添加节点和父子关系边 +for parent, child in parent_child_relations: + if parent in female_nodes: + dot.node(parent, shape='ellipse', style='filled', color='pink') + else: + dot.node(parent, shape='box', style='filled', color='lightblue') + + if child in female_nodes: + dot.node(child, shape='ellipse', style='filled', color='pink') + else: + dot.node(child, shape='box', style='filled', color='lightblue') + + dot.edge(parent, child) + +# 添加过继关系边 +for adoptive_parent, adoptee in adoption_relations: + dot.node(adoptive_parent) + # dot.node(adoptee) + dot.node(adoptee, shape='diamond', style='filled', color='lightgreen') + dot.edge(adoptive_parent, adoptee, style='dashed', label='继', color="blue") + +# 渲染图形 +dot.render('family_tree.gv', view=True) diff --git a/script/extract_images.py b/script/extract_images.py new file mode 100644 index 0000000..d448fee --- /dev/null +++ b/script/extract_images.py @@ -0,0 +1,44 @@ +import fitz # PyMuPDF +import os +from PIL import Image +import io + +""" +提取PDF中的图片 +""" + +def extract_images_from_pdf(pdf_path, output_folder): + # 打开PDF文件 + document = fitz.open(pdf_path) + # 确保输出文件夹存在 + os.makedirs(output_folder, exist_ok=True) + + # 遍历每一页 + for page_number in range(len(document)): + page = document.load_page(page_number) + # 获取页面中的图片 + images = page.get_images(full=True) + + for image_index, img in enumerate(images): + xref = img[0] + # 提取图片 + base_image = document.extract_image(xref) + image_bytes = base_image["image"] + image_ext = base_image["ext"] + + # 使用Pillow处理图片 + image = Image.open(io.BytesIO(image_bytes)) + # 构建图片文件名 + image_filename = f"page_{page_number + 1}_img_{image_index + 1}.{image_ext}" + image_path = os.path.join(output_folder, image_filename) + # 保存图片 + image.save(image_path) + print(f"保存图片: {image_path}") + + # 关闭文档 + document.close() + +# 使用示例 +pdf_path = "/Users/xiangyu/Documents/余氏宗谱(新洲区等支族)/第一册/(5)余氏彩页 P17-40.pdf" # 替换为你的PDF文件路径 +output_folder = "extracted_images" # 替换为你想要保存图片的文件夹 +extract_images_from_pdf(pdf_path, output_folder) \ No newline at end of file diff --git a/script/pdf_text_extract.py b/script/pdf_text_extract.py new file mode 100644 index 0000000..c03e405 --- /dev/null +++ b/script/pdf_text_extract.py @@ -0,0 +1,482 @@ +import cv2 +import numpy as np +import fitz # PyMuPDF +import tkinter as tk +from tkinter import ttk +from PIL import Image, ImageTk +import tkinter.messagebox as messagebox +import datetime + +""" +提取PDF中的竖排文本(针对分隔区域) +""" + +class PDFProcessor: + def __init__(self): + self.image = None + self.regions = [] + + def convert_pdf_to_image(self, pdf_path, page_num=0): + # 打开PDF文件 + doc = fitz.open(pdf_path) + page = doc[page_num] + + # 将PDF页面转换为图片 + pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2倍缩放以获得更好的质量 + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + + # 转换为OpenCV格式 + self.image = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + return self.image + + def detect_regions(self): + if self.image is None: + return [] + + # 转换为灰度图 + gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY) + + # 使用高斯模糊减少噪声 + blurred = cv2.GaussianBlur(gray, (5, 5), 0) + + # 自适应阈值处理 + binary = cv2.adaptiveThreshold( + blurred, + 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY_INV, + 11, + 2 + ) + + # 膨胀操作 + kernel = np.ones((3,3), np.uint8) + dilated = cv2.dilate(binary, kernel, iterations=1) + + # 查找所有轮廓 + contours, hierarchy = cv2.findContours( + dilated, + cv2.RETR_TREE, + cv2.CHAIN_APPROX_SIMPLE + ) + + # 收集所有可能的矩形区域 + candidates = [] + min_area = 1000 + + for cnt in contours: + peri = cv2.arcLength(cnt, True) + approx = cv2.approxPolyDP(cnt, 0.02 * peri, True) + + x, y, w, h = cv2.boundingRect(cnt) + area = w * h + aspect_ratio = w / float(h) if h != 0 else 0 + + if (area > min_area and + w > 30 and h > 30 and + 0.1 < aspect_ratio < 10): + + if len(approx) >= 4: + candidates.append((x, y, w, h)) + + # 按宽度从大到小排序 + candidates.sort(key=lambda x: x[2], reverse=True) + + # 查找宽高一致的五个区域 + target_regions = [] + for i, (x1, y1, w1, h1) in enumerate(candidates): + similar_regions = [(x1, y1, w1, h1)] + + # 在剩余区域中查找宽高相似的区域 + for x2, y2, w2, h2 in candidates[i+1:]: + # 检查宽高是否相似(允许2%的误差) + if (abs(w2 - w1) / w1 < 0.02 and + abs(h2 - h1) / h1 < 0.02): + similar_regions.append((x2, y2, w2, h2)) + + # 如果找到至少5个相似区域 + if len(similar_regions) >= 5: + # 按y坐标排序(从上到下) + similar_regions.sort(key=lambda x: x[1]) + target_regions = similar_regions[:5] + break + + self.regions = target_regions + return self.regions + +class PDFViewer: + def __init__(self, root): + self.root = root + self.root.title("PDF区域检测器") + + # 设置窗口初始大小 + self.root.geometry("1024x768") + + # 创建主框架 + self.main_frame = ttk.Frame(self.root) + self.main_frame.pack(fill=tk.BOTH, expand=True) + + # 创建工具栏 + self.toolbar = ttk.Frame(self.main_frame) + self.toolbar.pack(side=tk.TOP, fill=tk.X) + + # 添加缩放控制 + self.zoom_label = ttk.Label(self.toolbar, text="缩放: ") + self.zoom_label.pack(side=tk.LEFT, padx=5) + + self.zoom_scale = ttk.Scale( + self.toolbar, + from_=10, + to=200, + orient=tk.HORIZONTAL, + length=200, + command=self._on_zoom_change + ) + self.zoom_scale.set(30) # 设置初始缩放为30% + self.zoom_scale.pack(side=tk.LEFT, padx=5) + + # 创建框架来容纳画布和滚动条 + self.frame = ttk.Frame(self.main_frame) + self.frame.pack(fill=tk.BOTH, expand=True) + + # 创建水平和垂直滚动条 + self.v_scrollbar = ttk.Scrollbar(self.frame, orient="vertical") + self.h_scrollbar = ttk.Scrollbar(self.frame, orient="horizontal") + self.v_scrollbar.pack(side=tk.RIGHT, fill=tk.Y) + self.h_scrollbar.pack(side=tk.BOTTOM, fill=tk.X) + + # 创建画布并配置滚动 + self.canvas = tk.Canvas( + self.frame, + yscrollcommand=self.v_scrollbar.set, + xscrollcommand=self.h_scrollbar.set, + bg='gray90' # 添加背景色以便于区分 + ) + self.canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) + + # 配置滚动条 + self.v_scrollbar.config(command=self.canvas.yview) + self.h_scrollbar.config(command=self.canvas.xview) + + # 绑定鼠标事件 + self.canvas.bind('', self._on_mousewheel_y) + self.canvas.bind('', self._on_mousewheel_x) + self.canvas.bind('', self._on_mousewheel_y) + self.canvas.bind('', self._on_mousewheel_y) + self.canvas.bind('', self._on_mousewheel_x) + self.canvas.bind('', self._on_mousewheel_x) + self.canvas.bind('', self._start_drag) + self.canvas.bind('', self._drag) + + # 添加Ctrl+鼠标滚轮缩放 + self.canvas.bind('', self._on_zoom_wheel) + + self.processor = PDFProcessor() + self.current_image = None + self.current_regions = None + self.zoom_factor = 0.3 # 改为30%的初始缩放 + + # 添加适应窗口大小的按钮 + self.fit_button = ttk.Button( + self.toolbar, + text="适应窗口", + command=self._fit_to_window + ) + self.fit_button.pack(side=tk.LEFT, padx=5) + + # 绑定窗口大小改变事件 + self.root.bind('', self._on_window_resize) + + # 添加页面控制 + self.page_frame = ttk.Frame(self.toolbar) + self.page_frame.pack(side=tk.LEFT, padx=5) + + self.page_label = ttk.Label(self.page_frame, text="页码:") + self.page_label.pack(side=tk.LEFT) + + self.current_page = tk.StringVar(value="1") + self.total_pages = 1 + + self.page_entry = ttk.Entry(self.page_frame, textvariable=self.current_page, width=5) + self.page_entry.pack(side=tk.LEFT, padx=2) + + self.total_pages_label = ttk.Label(self.page_frame, text="/1") + self.total_pages_label.pack(side=tk.LEFT) + + self.prev_button = ttk.Button(self.page_frame, text="上一页", command=self._prev_page) + self.prev_button.pack(side=tk.LEFT, padx=2) + + self.next_button = ttk.Button(self.page_frame, text="下一页", command=self._next_page) + self.next_button.pack(side=tk.LEFT, padx=2) + + # 修改确认按钮文本 + self.confirm_button = ttk.Button( + self.toolbar, + text="提取所有页面文字", + command=self._extract_text + ) + self.confirm_button.pack(side=tk.LEFT, padx=5) + + self.pdf_path = None + self.doc = None + + # 加载默认PDF文件 + default_pdf = "test.pdf" + try: + self.load_pdf(default_pdf) + except Exception as e: + messagebox.showerror("错误", f"无法加载默认PDF文件:{str(e)}") + + def load_pdf(self, pdf_path): + """加载PDF并显示""" + self.pdf_path = pdf_path + self.doc = fitz.open(pdf_path) + self.total_pages = len(self.doc) + self.total_pages_label.configure(text=f"/{self.total_pages}") + + self._load_current_page() + + def _load_current_page(self): + """加载当前页面""" + try: + page_num = int(self.current_page.get()) - 1 + if 0 <= page_num < self.total_pages: + image = self.processor.convert_pdf_to_image(self.pdf_path, page_num) + regions = self.processor.detect_regions() + self.current_image = image + self.current_regions = regions + self._update_display() + except ValueError: + messagebox.showerror("错误", "请输入有效的页码") + + def _prev_page(self): + """显示上一页""" + try: + current = int(self.current_page.get()) + if current > 1: + self.current_page.set(str(current - 1)) + self._load_current_page() + except ValueError: + pass + + def _next_page(self): + """显示下一页""" + try: + current = int(self.current_page.get()) + if current < self.total_pages: + self.current_page.set(str(current + 1)) + self._load_current_page() + except ValueError: + pass + + def _update_display(self): + """更新显示""" + if self.current_image is None: + return + + # 获取原始尺寸 + height, width = self.current_image.shape[:2] + + # 计算缩放后的尺寸 + new_width = int(width * self.zoom_factor) + new_height = int(height * self.zoom_factor) + + # 缩放图片 + resized_image = cv2.resize(self.current_image, (new_width, new_height)) + + # 转换并显示图片 + image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB) + image_pil = Image.fromarray(image) + self.photo = ImageTk.PhotoImage(image=image_pil) + + # 清除画布 + self.canvas.delete("all") + + # 显示图片 + self.canvas.create_image(0, 0, image=self.photo, anchor=tk.NW) + + # 绘制区域 + if self.current_regions: + for i, (x, y, w, h) in enumerate(self.current_regions): + # 缩放坐标和尺寸 + scaled_x = int(x * self.zoom_factor) + scaled_y = int(y * self.zoom_factor) + scaled_w = int(w * self.zoom_factor) + scaled_h = int(h * self.zoom_factor) + + rect_id = self.canvas.create_rectangle( + scaled_x, scaled_y, + scaled_x + scaled_w, + scaled_y + scaled_h, + outline="red", + width=max(1, int(2 * self.zoom_factor)), + tags=f"region_{i+1}" + ) + + self.canvas.create_text( + scaled_x + scaled_w//2, + scaled_y - 10 * self.zoom_factor, + text=f"目标区域 {i+1} ({w}x{h})", + fill="red", + tags=f"region_{i+1}" + ) + + # 绑定鼠标事件 + self.canvas.tag_bind( + f"region_{i+1}", + '', + lambda e, rid=rect_id: self._highlight_region(rid) + ) + self.canvas.tag_bind( + f"region_{i+1}", + '', + lambda e, rid=rect_id: self._unhighlight_region(rid) + ) + + # 更新画布滚动区域 + self.canvas.config(scrollregion=self.canvas.bbox(tk.ALL)) + + def _on_zoom_wheel(self, event): + """处理Ctrl+滚轮缩放""" + if event.delta > 0: + self.zoom_scale.set(min(200, self.zoom_scale.get() + 10)) + else: + self.zoom_scale.set(max(10, self.zoom_scale.get() - 10)) + + def _on_zoom_change(self, value): + """处理缩放变化""" + self.zoom_factor = float(value) / 100 + self._update_display() + + def _on_mousewheel_y(self, event): + """处理垂直方向的鼠标滚轮事件""" + if event.num == 4 or event.delta > 0: + self.canvas.yview_scroll(-1, "units") + elif event.num == 5 or event.delta < 0: + self.canvas.yview_scroll(1, "units") + + def _on_mousewheel_x(self, event): + """处理水平方向的鼠标滚轮事件""" + if event.num == 4 or event.delta > 0: + self.canvas.xview_scroll(-1, "units") + elif event.num == 5 or event.delta < 0: + self.canvas.xview_scroll(1, "units") + + def _start_drag(self, event): + """开始拖动""" + self.canvas.scan_mark(event.x, event.y) + + def _drag(self, event): + """拖动画布""" + self.canvas.scan_dragto(event.x, event.y, gain=1) + + def _highlight_region(self, region_id): + """高亮显示区域""" + self.canvas.itemconfig(region_id, width=max(2, int(3 * self.zoom_factor)), outline="yellow") + + def _unhighlight_region(self, region_id): + """取消高亮显示""" + self.canvas.itemconfig(region_id, width=max(1, int(2 * self.zoom_factor)), outline="red") + + def _fit_to_window(self): + """调整缩放以适应窗口大小""" + if self.current_image is None: + return + + # 获取窗口和图像尺寸 + window_width = self.canvas.winfo_width() + window_height = self.canvas.winfo_height() + image_height, image_width = self.current_image.shape[:2] + + # 计算合适的缩放比例 + width_ratio = window_width / image_width + height_ratio = window_height / image_height + + # 选择较小的比例以确保完全显示 + new_zoom = min(width_ratio, height_ratio) * 0.9 # 留出一些边距 + + # 更新缩放 + self.zoom_scale.set(new_zoom * 100) + + def _on_window_resize(self, event): + """窗口大小改变时的处理""" + # 仅当事件来自主窗口时才处理 + if event.widget == self.root: + # 更新画布滚动区域 + self.canvas.config(scrollregion=self.canvas.bbox(tk.ALL)) + + def _extract_text(self): + """提取所有页面的文字并分别保存到五个文件""" + if not self.doc: + messagebox.showwarning("警告", "请先加载PDF文件!") + return + + try: + # 创建五个输出文件(使用追加模式) + base_path = self.pdf_path.rsplit('.', 1)[0] + output_files = [open(f"{base_path}_region{i+1}.txt", 'a', encoding='utf-8') for i in range(5)] + + # 写入分隔符(带时间戳) + timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + for i, f in enumerate(output_files): + f.write(f"\n\n=== 新的提取任务 {timestamp} ===\n\n") + + # 处理每一页 + for page_num in range(self.total_pages): + # 更新当前页面以获取区域 + self.current_page.set(str(page_num + 1)) + self._load_current_page() + + if not self.current_regions or len(self.current_regions) < 5: + messagebox.showwarning("警告", f"第 {page_num+1} 页未检测到足够区域,跳过该页") + continue + + page = self.doc[page_num] + + # 按y坐标排序处理每个区域 + sorted_regions = sorted(self.current_regions, key=lambda x: x[1]) + for region_idx, region in enumerate(sorted_regions[:5]): # 只取前五个区域 + x, y, w, h = region + # 将OpenCV的坐标转换为PDF坐标(除以2因为之前放大了2倍) + pdf_x = x / 2 + pdf_y = y / 2 + pdf_w = w / 2 + pdf_h = h / 2 + + rect = fitz.Rect(pdf_x, pdf_y, pdf_x + pdf_w, pdf_y + pdf_h) + text = page.get_text(clip=rect, sort=True) + + if text.strip(): + output_files[region_idx].write( + f"=== 第 {page_num + 1} 页 ===\n" + f"区域坐标: ({x}, {y}) 尺寸: {w}x{h}\n" + f"{text}\n" + "-------------------\n\n" + ) + + # 关闭所有文件 + for f in output_files: + f.close() + + messagebox.showinfo("成功", f"文字已分别保存到:\n" + f"{base_path}_region1.txt\n" + f"{base_path}_region2.txt\n" + f"{base_path}_region3.txt\n" + f"{base_path}_region4.txt\n" + f"{base_path}_region5.txt") + + except Exception as e: + messagebox.showerror("错误", f"提取文字时发生错误:{str(e)}") + finally: + # 确保文件被关闭 + for f in output_files: + if not f.closed: + f.close() + +def main(): + root = tk.Tk() + app = PDFViewer(root) + root.mainloop() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/script/pdf_whole_text_extract.py b/script/pdf_whole_text_extract.py new file mode 100644 index 0000000..8803003 --- /dev/null +++ b/script/pdf_whole_text_extract.py @@ -0,0 +1,43 @@ +import fitz # PyMuPDF + +""" +提取PDF中的竖排文本(针对整页文本) +""" + +def extract_vertical_text(pdf_path): + """ + 提取竖排文本(从右至左阅读顺序)并包含页码信息 + """ + doc = fitz.open(pdf_path) + full_text = [] + + for page in doc: + # 添加页码标识 + full_text.append(f"=== 第{page.number + 1}页 ===") + + # 旋转页面以适应竖排文本阅读方向 + page.set_rotation(270) + + # 获取并排序文本块 + blocks = page.get_text("blocks", flags=fitz.TEXT_PRESERVE_LIGATURES) + blocks.sort(key=lambda b: (-b[2], b[1])) + + # 收集当前页文本 + page_text = [] + for b in blocks: + text = b[4].strip() + if text: + page_text.append(text) + full_text.append('\n'.join(page_text)) + + return '\n\n'.join(full_text) + +# 修改后的使用示例 +if __name__ == "__main__": + import sys + text = extract_vertical_text('/Users/xiangyu/Documents/余氏宗谱(新洲区等支族)/第一册/(6)余氏宗谱 (四諫堂) 卷之首一 (三校).pdf') + + # 写入文件 + with open('output.txt', 'w', encoding='utf-8') as f: + f.write(text) + print("文本已保存到 output.txt")