python实现pdf转word,利用python进行数据分析 第三版 pdf
终极管理员 知识笔记 43阅读
文章目录:背景需求、环境安装、完整代码、效果
背景需求

已经获取到了大量的 PDF 文件,存放在 download 文件夹中,但是我需要的是 txt 文件和 Word 文件。
环境安装

pip install pdf2docx pdfminer.six
# Complete code
# Install dependencies first: pip install pdf2docx pdfminer.six
import os
import warnings

from pdf2docx import Converter
from pdfminer.high_level import extract_text

# Silence the UserWarning messages emitted from the pdf2docx package.
warnings.filterwarnings("ignore", category=UserWarning, module="pdf2docx")


def pdf_to_txt(pdf_path, txt_path):
    """Extract the text of the PDF at *pdf_path* and write it to *txt_path*.

    The output file is written as UTF-8 and overwritten if it already exists.
    """
    text = extract_text(pdf_path)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)


def pdf_to_docx(pdf_path, docx_path):
    """Convert the PDF at *pdf_path* into a Word document at *docx_path*.

    All pages are converted (``start=0``, ``end=None``).
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path, start=0, end=None)
    finally:
        # Always release the converter, even when the conversion fails.
        cv.close()


def batch_convert(download_folder, data_folder, output_format="txt"):
    """Convert every ``.pdf`` file in *download_folder* into *data_folder*.

    Args:
        download_folder: Directory that contains the source PDF files.
        data_folder: Directory that receives the converted files; it is
            created when missing.
        output_format: ``"txt"`` for plain text or ``"docx"`` for Word.

    Raises:
        ValueError: If *output_format* is neither ``"txt"`` nor ``"docx"``.
    """
    converters = {"txt": pdf_to_txt, "docx": pdf_to_docx}
    if output_format not in converters:
        raise ValueError(
            f"unsupported output_format {output_format!r}; "
            "expected 'txt' or 'docx'"
        )
    convert = converters[output_format]

    # Make sure the output folder exists.
    os.makedirs(data_folder, exist_ok=True)

    # Walk every PDF file directly inside the download folder.
    for filename in os.listdir(download_folder):
        if not filename.endswith(".pdf"):
            continue
        pdf_path = os.path.join(download_folder, filename)
        # Same base name as the PDF, with the target extension.
        out_filename = os.path.splitext(filename)[0] + "." + output_format
        out_path = os.path.join(data_folder, out_filename)
        convert(pdf_path, out_path)


if __name__ == "__main__":
    # Convert to txt files.
    batch_convert("download", "data_txt", output_format="txt")
    # Convert to Word files.
    batch_convert("download", "data_docx", output_format="docx")
效果
标签: