# Scripts/pdf/pdf_total_processor.py

import fitz  # PyMuPDF

def extract_vertical_text(pdf_path) -> str:
    """
    Extract vertically typeset text from a PDF, page by page.

    Every page contributes a header of the form ``=== 第N页 ===`` (N is the
    1-based page number) followed by the page's non-empty text blocks, one
    block per line. Blocks are ordered by descending right edge and then by
    ascending top edge, which approximates a right-to-left column reading
    order.

    Args:
        pdf_path: Location of the PDF document, passed to ``fitz.open``.

    Returns:
        The page headers and page texts joined by blank lines.
    """
    sections = []

    # The context manager closes the document even if extraction fails
    # part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Page marker; page.number is 0-based.
            sections.append(f"=== 第{page.number + 1}页 ===")

            # Set the page rotation before extraction, intended to match the
            # vertical reading direction.
            # NOTE(review): confirm that the rotation actually influences the
            # block coordinates returned by get_text().
            page.set_rotation(270)

            # Each block is a tuple whose items 1 and 2 are y0 and x1 and
            # whose item 4 is the block text (PyMuPDF "blocks" layout).
            blocks = page.get_text("blocks", flags=fitz.TEXT_PRESERVE_LIGATURES)
            blocks.sort(key=lambda b: (-b[2], b[1]))

            # Keep only blocks that still have content after trimming.
            stripped = (b[4].strip() for b in blocks)
            sections.append('\n'.join(t for t in stripped if t))

    return '\n\n'.join(sections)

# Example usage: extract the text of a PDF and save it as a UTF-8 text file.
if __name__ == "__main__":
    import os
    import sys

    # An optional first command-line argument overrides the default input PDF.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else 'origin_second.pdf'
    text = extract_vertical_text(pdf_path)

    # open() does not create missing parent directories, so make sure the
    # output directory exists before writing.
    output_path = 'output/output.txt'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Write the extracted text to the output file.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print("文本已保存到 output.txt")