Family-Tree/script/extract_images.py

44 lines
1.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import fitz # PyMuPDF
import os
from PIL import Image
import io
"""
提取PDF中的图片
"""
def extract_images_from_pdf(pdf_path, output_folder):
    """Extract every image embedded in a PDF into ``output_folder``.

    Each image is saved as ``page_<P>_img_<I>.<ext>``, where ``P`` is the
    1-based page number, ``I`` is the 1-based position of the image in that
    page's image list, and ``ext`` is the extension reported by PyMuPDF for
    the extracted image.

    The bytes returned by ``Document.extract_image`` are written to disk
    verbatim instead of being decoded and re-encoded, so lossy formats are
    not recompressed and formats the imaging library cannot write are still
    saved.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the images; it is created
            when it does not exist yet.
    """
    document = fitz.open(pdf_path)
    try:
        # Create the target directory only after the PDF opened successfully.
        os.makedirs(output_folder, exist_ok=True)

        for page_number, page in enumerate(document, start=1):
            page_images = page.get_images(full=True)
            for image_number, image_info in enumerate(page_images, start=1):
                # The first item of each image entry is its xref number.
                xref = image_info[0]
                base_image = document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                image_filename = f"page_{page_number}_img_{image_number}.{image_ext}"
                image_path = os.path.join(output_folder, image_filename)

                # Write the original encoded data untouched (no re-encoding).
                with open(image_path, "wb") as image_file:
                    image_file.write(image_bytes)
                print(f"保存图片: {image_path}")
    finally:
        # Release the document even if extraction or writing failed.
        document.close()
# Usage example: runs only when the file is executed as a script, so importing
# the module no longer triggers an extraction as a side effect.
if __name__ == "__main__":
    pdf_path = "/Users/xiangyu/Documents/余氏宗谱(新洲区等支族)/第一册/5余氏彩页 P17-40.pdf"  # replace with the path of your PDF file
    output_folder = "extracted_images"  # replace with the folder the images should be saved to
    extract_images_from_pdf(pdf_path, output_folder)