Scripts/pdf/pdf_total_processor.py
2025-10-11 13:36:48 +08:00

40 lines
1.1 KiB
Python

import fitz # PyMuPDF
def extract_vertical_text(pdf_path):
    """
    Extract vertically typeset text from a PDF, page by page.

    Every page contributes a header of the form ``=== 第N页 ===`` (1-based
    page number) followed by the stripped text of its non-empty text blocks,
    one block per line. Blocks are ordered right to left (by their right
    edge ``x1``, descending) and then top to bottom (by ``y0``, ascending).

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        One string in which all page headers and page bodies are joined
        with a blank line between consecutive pieces.
    """
    full_text = []
    # Open the document as a context manager so it is closed even when
    # extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Page marker; page.number is 0-based.
            full_text.append(f"=== 第{page.number + 1}页 ===")
            # Rotate the page for the vertical reading direction.
            # NOTE(review): confirm that the rotation actually changes the
            # block coordinates used for sorting below.
            page.set_rotation(270)
            # Each block is a tuple; indices 1, 2 and 4 are y0, x1 and the
            # block text respectively.
            blocks = page.get_text("blocks", flags=fitz.TEXT_PRESERVE_LIGATURES)
            blocks.sort(key=lambda b: (-b[2], b[1]))
            # Keep only blocks that still have content after stripping.
            stripped = (b[4].strip() for b in blocks)
            full_text.append('\n'.join(t for t in stripped if t))
    return '\n\n'.join(full_text)
# Example usage: extract the text of a PDF and save it to a text file.
if __name__ == "__main__":
    import os
    import sys

    text = extract_vertical_text('origin_second.pdf')
    output_path = 'output/output.txt'
    # open(..., 'w') does not create missing parent directories, so make
    # sure the output directory exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Write the extracted text to the file.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print("文本已保存到 output.txt")