40 lines
1.1 KiB
Python
40 lines
1.1 KiB
Python
import fitz # PyMuPDF
|
|
|
|
def extract_vertical_text(pdf_path):
    """Extract vertically typeset text from a PDF, page by page.

    Each page is rotated by 270 degrees, its text blocks are fetched and
    then ordered by descending right edge and ascending top edge, so that
    columns are visited from right to left. A page-number marker is emitted
    before the text of every page.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A single string. For every page it contains the marker line
        ``=== 第N页 ===`` (N is the 1-based page number) and then the page's
        non-empty, stripped block texts joined by newlines. Markers and
        page texts are separated from each other by one blank line.
    """
    sections = []

    # Use the document as a context manager so it is closed even if
    # extraction fails part-way through (the original never closed it).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Page-number marker; page.number is 0-based.
            sections.append(f"=== 第{page.number + 1}页 ===")

            # Rotate the page to suit the vertical-text reading direction.
            # The change is in-memory only: nothing is saved back to disk here.
            page.set_rotation(270)

            blocks = page.get_text("blocks", flags=fitz.TEXT_PRESERVE_LIGATURES)
            # PyMuPDF block tuples are (x0, y0, x1, y1, text, block_no, type):
            # sort by x1 descending (right to left), then y0 ascending.
            blocks.sort(key=lambda b: (-b[2], b[1]))

            # Keep only blocks that still have content after stripping.
            stripped = (b[4].strip() for b in blocks)
            sections.append('\n'.join(t for t in stripped if t))

    return '\n\n'.join(sections)
|
|
|
|
# Revised usage example
if __name__ == "__main__":
    import os

    output_path = 'output/output.txt'
    text = extract_vertical_text('origin_second.pdf')

    # Create the target directory first; open() in 'w' mode does not create
    # missing parent directories and would raise FileNotFoundError.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Write the extracted text as UTF-8.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)

    print("文本已保存到 output.txt")
|