44 lines
1.3 KiB
Python
44 lines
1.3 KiB
Python
import fitz # PyMuPDF
|
||
|
||
"""
|
||
提取PDF中的竖排文本(针对整页文本)
|
||
"""
|
||
|
||
def extract_vertical_text(pdf_path) -> str:
    """Extract vertically typeset text from every page of a PDF.

    For each page a header line with the 1-based page number is emitted,
    the page is rotated by 270 degrees, and its text blocks are read with
    ``page.get_text("blocks")``.  Blocks are ordered right-to-left and, for
    equal right edges, top-to-bottom, which is the reading order of
    vertical text.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string in which the page header and the page body of every page
        are separated by a blank line.  Within a page body, the non-empty
        (whitespace-stripped) blocks are separated by a single newline.
        An empty document yields an empty string.
    """
    sections = []

    # Using the document as a context manager guarantees it is closed, even
    # if extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Page marker; ``page.number`` is 0-based.
            sections.append(f"=== 第{page.number + 1}页 ===")

            # Rotate the page to suit the vertical reading direction.
            page.set_rotation(270)

            blocks = page.get_text("blocks", flags=fitz.TEXT_PRESERVE_LIGATURES)
            # Per the PyMuPDF block layout (x0, y0, x1, y1, text, ...):
            # descending x1 puts the rightmost block first, ascending y0
            # breaks ties from top to bottom.
            blocks.sort(key=lambda b: (-b[2], b[1]))

            # Keep only blocks that still contain text after stripping.
            stripped = (b[4].strip() for b in blocks)
            sections.append('\n'.join(t for t in stripped if t))

    return '\n\n'.join(sections)
|
||
|
||
# 修改后的使用示例
|
||
if __name__ == "__main__":
    import sys

    # Input PDF: first command-line argument when given, otherwise the
    # original hard-coded sample file, so running with no arguments behaves
    # exactly as before.
    default_pdf = '/Users/xiangyu/Documents/余氏宗谱(新洲区等支族)/第一册/(6)余氏宗谱 (四諫堂) 卷之首一 (三校).pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf

    text = extract_vertical_text(pdf_path)

    # Write the extracted text as UTF-8 so the CJK content round-trips.
    with open('output.txt', 'w', encoding='utf-8') as f:
        f.write(text)

    print("文本已保存到 output.txt")
|