# Family-Tree/script/pdf_whole_text_extract.py

import fitz  # PyMuPDF

"""
Extract vertically typeset text from a PDF (whole-page text).
"""

def extract_vertical_text(pdf_path):
    """
    Extract vertically typeset text from every page of a PDF.

    For each page, a page-number header is emitted, the page rotation is set
    to 270 degrees, and the page's text blocks are ordered by descending
    right edge (right-to-left reading order) and then by ascending top edge
    before their stripped text is collected. Blocks whose text is empty
    after stripping are skipped.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        One string: page headers and page bodies joined by blank lines, with
        the blocks inside a page joined by single newlines.
    """
    doc = fitz.open(pdf_path)
    try:
        sections = []

        for page in doc:
            # Page header; page.number is zero-based, so shift to 1-based.
            sections.append(f"=== 第{page.number + 1}页 ===")

            # Rotate the in-memory page to suit the vertical reading
            # direction. The document is never saved here, so the file on
            # disk is left untouched.
            page.set_rotation(270)

            # Each block is a tuple whose index 1 is the top edge, index 2
            # the right edge and index 4 the text (PyMuPDF "blocks" layout).
            blocks = page.get_text("blocks", flags=fitz.TEXT_PRESERVE_LIGATURES)
            blocks.sort(key=lambda b: (-b[2], b[1]))

            stripped = (b[4].strip() for b in blocks)
            sections.append('\n'.join(t for t in stripped if t))
    finally:
        # Release the underlying file handle even if extraction fails.
        doc.close()

    return '\n\n'.join(sections)

# Command-line entry point.
# Usage: python pdf_whole_text_extract.py [PDF_PATH [OUTPUT_PATH]]
if __name__ == "__main__":
    import sys

    # Both arguments are optional; with none given, the script falls back to
    # the original hard-coded input file and writes to output.txt.
    default_pdf = '/Users/xiangyu/Documents/余氏宗谱（新洲区等支族）/第一册/（6）余氏宗谱  （四諫堂） 卷之首一   （三校）.pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf
    output_path = sys.argv[2] if len(sys.argv) > 2 else 'output.txt'

    text = extract_vertical_text(pdf_path)

    # Write the extracted text as UTF-8 so the CJK content round-trips.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"文本已保存到 {output_path}")