Family-Tree/script/pdf_whole_text_extract.py

44 lines
1.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import fitz # PyMuPDF
"""
提取PDF中的竖排文本(针对整页文本)
"""
def extract_vertical_text(pdf_path):
    """Extract vertically typeset text from every page of a PDF.

    Each page is set to a rotation of 270 degrees, its text blocks are
    fetched with ``page.get_text("blocks")`` and then ordered by descending
    ``b[2]`` and ascending ``b[1]`` (x1 and y0 in PyMuPDF's block tuple
    layout), which is intended to give a right-to-left reading order.
    Blocks whose text is empty after stripping whitespace are dropped.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string for the whole document. Every page contributes a
        ``=== 第N页 ===`` header (N is the 1-based page number) followed by
        that page's block texts joined with newlines; headers and page
        bodies are separated from each other by a blank line.
    """
    full_text = []
    # Open through the context manager so the document handle is released
    # even when extraction fails part-way through (it was never closed
    # before).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Page marker; page.number is 0-based, hence the +1.
            full_text.append(f"=== 第{page.number + 1}页 ===")
            # Rotate the page before extraction to suit the vertical layout.
            # The document is never saved here, so the file on disk is not
            # modified by this call.
            page.set_rotation(270)
            blocks = page.get_text("blocks", flags=fitz.TEXT_PRESERVE_LIGATURES)
            # Largest b[2] first (rightmost column), ties broken by smaller
            # b[1] (higher on the page).
            blocks.sort(key=lambda b: (-b[2], b[1]))
            # b[4] holds the block text; skip blocks that are whitespace only.
            stripped = (b[4].strip() for b in blocks)
            page_text = [text for text in stripped if text]
            full_text.append('\n'.join(page_text))
    return '\n\n'.join(full_text)
# Usage example: run this file as a script to dump the extracted text.
if __name__ == "__main__":
    import sys

    # Optional command-line overrides:
    #   argv[1] -> input PDF path, argv[2] -> output text file.
    # With no arguments the original hard-coded paths are used, so the
    # default behavior of the script is unchanged.
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        pdf_path = '/Users/xiangyu/Documents/余氏宗谱(新洲区等支族)/第一册/6余氏宗谱 (四諫堂) 卷之首一 (三校).pdf'
    output_path = sys.argv[2] if len(sys.argv) > 2 else 'output.txt'

    text = extract_vertical_text(pdf_path)
    # Write the result as UTF-8 so the CJK text round-trips on any platform.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    # Report the path actually written (same message as before by default).
    print(f"文本已保存到 {output_path}")