Family-Tree/script/pdf_text_extract.py

482 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import cv2
import numpy as np
import fitz # PyMuPDF
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk
import tkinter.messagebox as messagebox
import datetime
"""
Extract vertically typeset text from a PDF (targeting the delimited regions).
"""
class PDFProcessor:
def __init__(self):
self.image = None
self.regions = []
def convert_pdf_to_image(self, pdf_path, page_num=0):
# 打开PDF文件
doc = fitz.open(pdf_path)
page = doc[page_num]
# 将PDF页面转换为图片
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2倍缩放以获得更好的质量
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
# 转换为OpenCV格式
self.image = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
return self.image
def detect_regions(self):
if self.image is None:
return []
# 转换为灰度图
gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
# 使用高斯模糊减少噪声
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
# 自适应阈值处理
binary = cv2.adaptiveThreshold(
blurred,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY_INV,
11,
2
)
# 膨胀操作
kernel = np.ones((3,3), np.uint8)
dilated = cv2.dilate(binary, kernel, iterations=1)
# 查找所有轮廓
contours, hierarchy = cv2.findContours(
dilated,
cv2.RETR_TREE,
cv2.CHAIN_APPROX_SIMPLE
)
# 收集所有可能的矩形区域
candidates = []
min_area = 1000
for cnt in contours:
peri = cv2.arcLength(cnt, True)
approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)
x, y, w, h = cv2.boundingRect(cnt)
area = w * h
aspect_ratio = w / float(h) if h != 0 else 0
if (area > min_area and
w > 30 and h > 30 and
0.1 < aspect_ratio < 10):
if len(approx) >= 4:
candidates.append((x, y, w, h))
# 按宽度从大到小排序
candidates.sort(key=lambda x: x[2], reverse=True)
# 查找宽高一致的五个区域
target_regions = []
for i, (x1, y1, w1, h1) in enumerate(candidates):
similar_regions = [(x1, y1, w1, h1)]
# 在剩余区域中查找宽高相似的区域
for x2, y2, w2, h2 in candidates[i+1:]:
# 检查宽高是否相似允许2%的误差)
if (abs(w2 - w1) / w1 < 0.02 and
abs(h2 - h1) / h1 < 0.02):
similar_regions.append((x2, y2, w2, h2))
# 如果找到至少5个相似区域
if len(similar_regions) >= 5:
# 按y坐标排序从上到下
similar_regions.sort(key=lambda x: x[1])
target_regions = similar_regions[:5]
break
self.regions = target_regions
return self.regions
class PDFViewer:
    """Tkinter window that previews PDF pages and exports text from detected regions.

    The viewer renders one page at a time through ``PDFProcessor``, draws the
    detected regions over the page image, supports zooming, scrolling and
    drag-panning, and can append the text found inside each region of every
    page to one output file per region.
    """

    # Number of regions exported per page (one output file each).
    REGION_COUNT = 5
    # Image pixels per PDF point. Must equal the scale PDFProcessor renders
    # with (2x), otherwise the clip rectangles used for extraction are off.
    RENDER_SCALE = 2
    # Zoom slider limits, in percent.
    ZOOM_MIN = 10
    ZOOM_MAX = 200

    def __init__(self, root, pdf_path="test.pdf"):
        """Build the UI inside ``root`` and try to load ``pdf_path``.

        Args:
            root: The Tk root (or toplevel) window to populate.
            pdf_path: PDF opened at start-up; a load failure is reported in
                an error dialog instead of being raised.
        """
        self.root = root
        self.root.title("PDF区域检测器")
        self.root.geometry("1024x768")

        # Initialise all state BEFORE creating widgets: setting the zoom
        # slider's value invokes its command callback, which reaches
        # _update_display() and therefore reads current_image.
        self.processor = PDFProcessor()
        self.current_image = None
        self.current_regions = None
        self.zoom_factor = 0.3  # 30% initial zoom
        self.photo = None  # keeps the Tk image alive while it is displayed
        self.pdf_path = None
        self.doc = None
        self.total_pages = 1

        self.main_frame = ttk.Frame(self.root)
        self.main_frame.pack(fill=tk.BOTH, expand=True)

        self._build_toolbar()
        self._build_canvas()
        self._bind_events()

        try:
            self.load_pdf(pdf_path)
        except Exception as e:
            messagebox.showerror("错误", f"无法加载默认PDF文件{str(e)}")

    def _build_toolbar(self):
        """Create the toolbar: zoom slider, fit button, page controls, extract button."""
        self.toolbar = ttk.Frame(self.main_frame)
        self.toolbar.pack(side=tk.TOP, fill=tk.X)

        self.zoom_label = ttk.Label(self.toolbar, text="缩放: ")
        self.zoom_label.pack(side=tk.LEFT, padx=5)
        self.zoom_scale = ttk.Scale(
            self.toolbar,
            from_=self.ZOOM_MIN,
            to=self.ZOOM_MAX,
            orient=tk.HORIZONTAL,
            length=200,
            command=self._on_zoom_change,
        )
        self.zoom_scale.set(30)  # initial zoom of 30%
        self.zoom_scale.pack(side=tk.LEFT, padx=5)

        self.fit_button = ttk.Button(
            self.toolbar,
            text="适应窗口",
            command=self._fit_to_window,
        )
        self.fit_button.pack(side=tk.LEFT, padx=5)

        self.page_frame = ttk.Frame(self.toolbar)
        self.page_frame.pack(side=tk.LEFT, padx=5)
        self.page_label = ttk.Label(self.page_frame, text="页码:")
        self.page_label.pack(side=tk.LEFT)
        self.current_page = tk.StringVar(value="1")
        self.page_entry = ttk.Entry(
            self.page_frame, textvariable=self.current_page, width=5
        )
        self.page_entry.pack(side=tk.LEFT, padx=2)
        self.total_pages_label = ttk.Label(self.page_frame, text="/1")
        self.total_pages_label.pack(side=tk.LEFT)
        self.prev_button = ttk.Button(
            self.page_frame, text="上一页", command=self._prev_page
        )
        self.prev_button.pack(side=tk.LEFT, padx=2)
        self.next_button = ttk.Button(
            self.page_frame, text="下一页", command=self._next_page
        )
        self.next_button.pack(side=tk.LEFT, padx=2)

        self.confirm_button = ttk.Button(
            self.toolbar,
            text="提取所有页面文字",
            command=self._extract_text,
        )
        self.confirm_button.pack(side=tk.LEFT, padx=5)

    def _build_canvas(self):
        """Create the scrollable canvas that shows the page image."""
        self.frame = ttk.Frame(self.main_frame)
        self.frame.pack(fill=tk.BOTH, expand=True)

        self.v_scrollbar = ttk.Scrollbar(self.frame, orient="vertical")
        self.h_scrollbar = ttk.Scrollbar(self.frame, orient="horizontal")
        self.v_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.h_scrollbar.pack(side=tk.BOTTOM, fill=tk.X)

        self.canvas = tk.Canvas(
            self.frame,
            yscrollcommand=self.v_scrollbar.set,
            xscrollcommand=self.h_scrollbar.set,
            bg='gray90',  # background colour to tell the page from empty space
        )
        self.canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

        self.v_scrollbar.config(command=self.canvas.yview)
        self.h_scrollbar.config(command=self.canvas.xview)

    def _bind_events(self):
        """Wire up mouse, keyboard and resize events."""
        # <MouseWheel> carries event.delta; <Button-4>/<Button-5> are how
        # X11 reports wheel movement (event.num is 4 or 5).
        self.canvas.bind('<MouseWheel>', self._on_mousewheel_y)
        self.canvas.bind('<Shift-MouseWheel>', self._on_mousewheel_x)
        self.canvas.bind('<Button-4>', self._on_mousewheel_y)
        self.canvas.bind('<Button-5>', self._on_mousewheel_y)
        self.canvas.bind('<Shift-Button-4>', self._on_mousewheel_x)
        self.canvas.bind('<Shift-Button-5>', self._on_mousewheel_x)
        self.canvas.bind('<ButtonPress-1>', self._start_drag)
        self.canvas.bind('<B1-Motion>', self._drag)
        # Ctrl + wheel zooms, on every platform.
        self.canvas.bind('<Control-MouseWheel>', self._on_zoom_wheel)
        self.canvas.bind('<Control-Button-4>', self._on_zoom_wheel)
        self.canvas.bind('<Control-Button-5>', self._on_zoom_wheel)
        # Pressing Return in the page entry jumps to the typed page.
        self.page_entry.bind('<Return>', lambda _event: self._load_current_page())
        self.root.bind('<Configure>', self._on_window_resize)

    def load_pdf(self, pdf_path):
        """Open ``pdf_path`` and display the page currently selected in the entry.

        Raises:
            Whatever ``fitz.open`` raises when the file cannot be opened; in
            that case the previously loaded document stays in place.
        """
        new_doc = fitz.open(pdf_path)
        if self.doc is not None:
            self.doc.close()  # do not leak the previously opened document
        self.pdf_path = pdf_path
        self.doc = new_doc
        self.total_pages = len(self.doc)
        self.total_pages_label.configure(text=f"/{self.total_pages}")
        self._load_current_page()

    def _load_current_page(self):
        """Render the page named in the page entry and run region detection.

        A non-numeric entry triggers an error dialog; an out-of-range number
        is ignored.
        """
        try:
            page_num = int(self.current_page.get()) - 1
        except ValueError:
            messagebox.showerror("错误", "请输入有效的页码")
            return
        if 0 <= page_num < self.total_pages:
            self.current_image = self.processor.convert_pdf_to_image(
                self.pdf_path, page_num
            )
            self.current_regions = self.processor.detect_regions()
            self._update_display()

    def _step_page(self, delta):
        """Move ``delta`` pages from the page in the entry, staying in range."""
        try:
            target = int(self.current_page.get()) + delta
        except ValueError:
            # Entry holds non-numeric text: navigation deliberately does nothing.
            return
        if 1 <= target <= self.total_pages:
            self.current_page.set(str(target))
            self._load_current_page()

    def _prev_page(self):
        """Show the previous page, if there is one."""
        self._step_page(-1)

    def _next_page(self):
        """Show the next page, if there is one."""
        self._step_page(1)

    def _update_display(self):
        """Redraw the page image and region overlays at the current zoom."""
        if self.current_image is None:
            return

        height, width = self.current_image.shape[:2]
        # Clamp to one pixel: cv2.resize rejects a zero-sized target.
        new_width = max(1, int(width * self.zoom_factor))
        new_height = max(1, int(height * self.zoom_factor))
        resized = cv2.resize(self.current_image, (new_width, new_height))

        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        # Stored on self so the image is not garbage-collected while shown.
        self.photo = ImageTk.PhotoImage(image=Image.fromarray(rgb))

        self.canvas.delete("all")
        self.canvas.create_image(0, 0, image=self.photo, anchor=tk.NW)

        for index, (x, y, w, h) in enumerate(self.current_regions or [], start=1):
            tag = f"region_{index}"
            left = int(x * self.zoom_factor)
            top = int(y * self.zoom_factor)
            box_w = int(w * self.zoom_factor)
            box_h = int(h * self.zoom_factor)
            rect_id = self.canvas.create_rectangle(
                left, top,
                left + box_w,
                top + box_h,
                outline="red",
                width=max(1, int(2 * self.zoom_factor)),
                tags=tag,
            )
            # Label (with the unscaled size) centred just above the box.
            self.canvas.create_text(
                left + box_w // 2,
                top - 10 * self.zoom_factor,
                text=f"目标区域 {index} ({w}x{h})",
                fill="red",
                tags=tag,
            )
            # rid is bound as a default so each handler keeps its own rectangle.
            self.canvas.tag_bind(
                tag,
                '<Enter>',
                lambda e, rid=rect_id: self._highlight_region(rid),
            )
            self.canvas.tag_bind(
                tag,
                '<Leave>',
                lambda e, rid=rect_id: self._unhighlight_region(rid),
            )

        self.canvas.config(scrollregion=self.canvas.bbox(tk.ALL))

    @staticmethod
    def _wheel_direction(event):
        """Map a wheel event to a scroll step: -1 (up), 1 (down) or 0."""
        if event.num == 4 or event.delta > 0:
            return -1
        if event.num == 5 or event.delta < 0:
            return 1
        return 0

    def _on_zoom_wheel(self, event):
        """Zoom in or out by 10 percentage points on Ctrl + wheel."""
        if self._wheel_direction(event) < 0:
            self.zoom_scale.set(min(self.ZOOM_MAX, self.zoom_scale.get() + 10))
        else:
            self.zoom_scale.set(max(self.ZOOM_MIN, self.zoom_scale.get() - 10))

    def _on_zoom_change(self, value):
        """Slider callback: ``value`` is the new zoom in percent."""
        self.zoom_factor = float(value) / 100
        self._update_display()

    def _on_mousewheel_y(self, event):
        """Scroll the canvas vertically by one unit per wheel step."""
        step = self._wheel_direction(event)
        if step:
            self.canvas.yview_scroll(step, "units")

    def _on_mousewheel_x(self, event):
        """Scroll the canvas horizontally by one unit per wheel step."""
        step = self._wheel_direction(event)
        if step:
            self.canvas.xview_scroll(step, "units")

    def _start_drag(self, event):
        """Remember where a drag-pan gesture started."""
        self.canvas.scan_mark(event.x, event.y)

    def _drag(self, event):
        """Pan the canvas so it follows the mouse during a drag."""
        self.canvas.scan_dragto(event.x, event.y, gain=1)

    def _highlight_region(self, region_id):
        """Draw the given rectangle thicker and in yellow."""
        self.canvas.itemconfig(
            region_id, width=max(2, int(3 * self.zoom_factor)), outline="yellow"
        )

    def _unhighlight_region(self, region_id):
        """Restore the given rectangle's normal red outline."""
        self.canvas.itemconfig(
            region_id, width=max(1, int(2 * self.zoom_factor)), outline="red"
        )

    def _fit_to_window(self):
        """Pick the zoom at which the whole page fits inside the canvas."""
        if self.current_image is None:
            return
        canvas_width = self.canvas.winfo_width()
        canvas_height = self.canvas.winfo_height()
        image_height, image_width = self.current_image.shape[:2]

        # The smaller ratio makes both dimensions fit; 0.9 leaves a margin.
        fit = min(canvas_width / image_width, canvas_height / image_height) * 0.9
        percent = min(self.ZOOM_MAX, max(self.ZOOM_MIN, fit * 100))
        # Setting the slider triggers _on_zoom_change, which redraws.
        self.zoom_scale.set(percent)

    def _on_window_resize(self, event):
        """Refresh the scroll region when the main window is resized."""
        # <Configure> bound on the root also fires for child widgets.
        if event.widget is self.root:
            self.canvas.config(scrollregion=self.canvas.bbox(tk.ALL))

    @staticmethod
    def _region_output_paths(pdf_path, count):
        """Return the ``count`` output paths ``<pdf without ext>_region<i>.txt``.

        ``os.path.splitext`` is used so that only a real file extension is
        stripped; a dot inside a directory name is left alone.
        """
        import os

        base_path = os.path.splitext(pdf_path)[0]
        return [f"{base_path}_region{i}.txt" for i in range(1, count + 1)]

    def _extract_text(self):
        """Append the text of every page's regions to one file per region.

        Pages on which fewer than ``REGION_COUNT`` regions are detected are
        skipped with a warning. Output files are opened in append mode and a
        time-stamped separator is written first. As a side effect the viewer
        ends up showing the last page of the document.
        """
        if self.doc is None:
            messagebox.showwarning("警告", "请先加载PDF文件")
            return

        # Defined before the try so the finally clause can always iterate
        # it, even when opening one of the files fails.
        output_files = []
        try:
            output_paths = self._region_output_paths(self.pdf_path, self.REGION_COUNT)
            for path in output_paths:
                output_files.append(open(path, 'a', encoding='utf-8'))

            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            for out in output_files:
                out.write(f"\n\n=== 新的提取任务 {timestamp} ===\n\n")

            for page_num in range(self.total_pages):
                # Loading the page refreshes current_regions for it.
                self.current_page.set(str(page_num + 1))
                self._load_current_page()
                if (not self.current_regions
                        or len(self.current_regions) < self.REGION_COUNT):
                    messagebox.showwarning("警告", f"{page_num+1} 页未检测到足够区域,跳过该页")
                    continue

                page = self.doc[page_num]
                # Top-to-bottom order decides which file a region goes to;
                # zip() stops after REGION_COUNT regions.
                ordered = sorted(self.current_regions, key=lambda r: r[1])
                for out, (x, y, w, h) in zip(output_files, ordered):
                    # Image pixels -> PDF coordinates (undo the render scale).
                    pdf_x = x / self.RENDER_SCALE
                    pdf_y = y / self.RENDER_SCALE
                    pdf_w = w / self.RENDER_SCALE
                    pdf_h = h / self.RENDER_SCALE
                    rect = fitz.Rect(pdf_x, pdf_y, pdf_x + pdf_w, pdf_y + pdf_h)
                    text = page.get_text(clip=rect, sort=True)
                    if text.strip():
                        out.write(
                            f"=== 第 {page_num + 1} 页 ===\n"
                            f"区域坐标: ({x}, {y}) 尺寸: {w}x{h}\n"
                            f"{text}\n"
                            "-------------------\n\n"
                        )
        except Exception as e:
            messagebox.showerror("错误", f"提取文字时发生错误:{str(e)}")
            return
        finally:
            # Runs on success and failure alike, before any success dialog.
            for out in output_files:
                if not out.closed:
                    out.close()

        messagebox.showinfo("成功", "文字已分别保存到:\n" + "\n".join(output_paths))
def main():
    """Create the Tk root window, attach a PDFViewer to it and run the event loop."""
    window = tk.Tk()
    # Bound to a local so the viewer stays referenced while the loop runs.
    viewer = PDFViewer(window)
    window.mainloop()


if __name__ == "__main__":
    main()