python自动化pdf文档操作
使用第三方库PyPDF2
读取、写入、分割、合并PDF文件
安装:pdfplumber
读取 PDF 文件中内容和提取 PDF 中的表格
安装:!pip3 install PyPDF2 复制代码Collecting PyPDF2 Downloading PyPDF2-1.26.0.tar.gz (77 kB) [K | | 77 kB 125 kB/s [?25hBuilding wheels for collected packages: PyPDF2 Building wheel for PyPDF2 (setup.py) ... [?25ldone [?25h Created wheel for PyPDF2: filename=PyPDF2-1.26.0-py3-none-any.whl size=61085 sha256=dba5825a58d21d37cf9e417039694f1b9e716760ee7b9f0fdcca8fc65b8ef3fe Stored in directory: /Users/lichizou/Library/Caches/pip/wheels/d9/dc/ec/72da68331f30074b9950c1737c23cb8a67484e61498bc9713d Successfully built PyPDF2 Installing collected packages: PyPDF2 Successfully installed PyPDF2-1.26.0 复制代码!pip3 install pdfplumber 复制代码Collecting pdfplumber Downloading pdfplumber-0.5.28.tar.gz (45 kB) [K | | 45 kB 152 kB/s [?25hCollecting pdfminer.six==20200517 Downloading pdfminer.six-20200517-py3-none-any.whl (5.6 MB) [K | | 5.6 MB 145 kB/s [?25hCollecting Pillow>=7.0.0 Downloading Pillow-8.2.0-cp39-cp39-macosx_10_10_x86_64.whl (2.8 MB) [K | | 2.8 MB 70 kB/s [?25hCollecting Wand Downloading Wand-0.6.6-py2.py3-none-any.whl (138 kB) [K | | 138 kB 16 kB/s [?25hCollecting pycryptodome Downloading pycryptodome-3.10.1-cp35-abi3-macosx_10_9_x86_64.whl (1.5 MB) [K | | 1.5 MB 43 kB/s [?25hCollecting sortedcontainers Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB) Collecting chardet Downloading chardet-4.0.0-py2.py3-none-any.whl (178 kB) [K | | 178 kB 96 kB/s [?25hBuilding wheels for collected packages: pdfplumber Building wheel for pdfplumber (setup.py) ... 
# (Tail of the `pip3 install pdfplumber` log started in the previous cell.)
# [?25ldone [?25h Created wheel for pdfplumber: filename=pdfplumber-0.5.28-py3-none-any.whl size=32219 sha256=23afcf4aa92bdd0fb4e2a71b0974c1edd5ca9e59b37ef5e41b15fae3bffc5c30
# Stored in directory: /Users/lichizou/Library/Caches/pip/wheels/e9/cb/1b/c5dbba18cac53515263c5f26443e4617fa432c32cbaa7d8e48
# Successfully built pdfplumber
# Installing collected packages: sortedcontainers, pycryptodome, chardet, Wand, Pillow, pdfminer.six, pdfplumber
# Successfully installed Pillow-8.2.0 Wand-0.6.6 chardet-4.0.0 pdfminer.six-20200517 pdfplumber-0.5.28 pycryptodome-3.10.1 sortedcontainers-2.4.0

import os

import pandas as pd
import pdfplumber
from PyPDF2 import PdfFileReader
from PyPDF2 import PdfFileWriter

# range() with a step, the same stepping that split_pdf uses below.
for i in range(0, 10, 2):
    print(i)
# Output: 0 2 4 6 8


# --- Batch split ---
def split_pdf(filename, filepath, save_dirpath, step=5):
    """
    Split one PDF into several smaller PDFs of at most ``step`` pages each.

    The parts are written to ``save_dirpath`` as ``<filename><n>.pdf`` where
    ``n`` starts at 1.

    @param filename: base name used for the generated files
    @param filepath: path of the PDF to split
    @param save_dirpath: directory that receives the parts (created if missing)
    @param step: pages per part; by default pages 0-4 form the first file,
                 pages 5-9 the second one, and so on
    @raise ValueError: if ``step`` is smaller than 1
    @return: None
    """
    if step < 1:
        raise ValueError("step must be >= 1")
    os.makedirs(save_dirpath, exist_ok=True)
    pdf_reader = PdfFileReader(filepath)
    pages = pdf_reader.getNumPages()
    # start takes the values 0, step, 2 * step, ... below the page count.
    for part, start in enumerate(range(0, pages, step), 1):
        pdf_writer = PdfFileWriter()
        # min() keeps the last part inside the document when the page count
        # is not a multiple of step.
        for index in range(start, min(start + step, pages)):
            pdf_writer.addPage(pdf_reader.getPage(index))
        save_path = os.path.join(save_dirpath, filename + str(part) + ".pdf")
        with open(save_path, "wb") as out:
            pdf_writer.write(out)


path = "易方达中小板指数证券投资基金(LOF)2020年中期报告"  # located in the current directory
split_pdf(path, path + ".pdf", "test_pdf")

# --- Batch merge ---
# How the part number is recovered from a generated file name.
"易方达中小板指数证券投资基金(LOF)2020年中期报告1.pdf"[:-4].replace("易方达中小板指数证券投资基金(LOF)2020年中期报告", "")
# Output: "1"
"123456789"[:-4]
# Output: "12345"
os.listdir("./test_pdf/")
# Output (note that the listing is not in numeric order):
# ["易方达中小板指数证券投资基金(LOF)2020年中期报告3.pdf", "易方达中小板指数证券投资基金(LOF)2020年中期报告2.pdf",
#  "易方达中小板指数证券投资基金(LOF)2020年中期报告1.pdf", "易方达中小板指数证券投资基金(LOF)2020年中期报告5.pdf",
#  "易方达中小板指数证券投资基金(LOF)2020年中期报告4.pdf", "易方达中小板指数证券投资基金(LOF)2020年中期报告6.pdf",
#  "易方达中小板指数证券投资基金(LOF)2020年中期报告7.pdf", "易方达中小板指数证券投资基金(LOF)2020年中期报告9.pdf",
#  "易方达中小板指数证券投资基金(LOF)2020年中期报告8.pdf", "易方达中小板指数证券投资基金(LOF)2020年中期报告10.pdf"]


def concat_pdf(filename, read_dirpath, save_filepath):
    """
    Merge the numbered PDFs in a directory into a single PDF.

    Files are expected to be named ``<filename><n>.pdf`` and are merged in
    ascending order of ``n``. Directory entries whose name does not yield an
    integer after dropping the last four characters and removing ``filename``
    are skipped instead of aborting the merge.

    @param filename: common base name of the numbered files
    @param read_dirpath: directory holding the PDFs to merge
    @param save_filepath: path of the merged PDF
    @return: None
    """
    numbered = []
    for fn in os.listdir(read_dirpath):
        # Drop ".pdf", then remove the base name; what is left is the number.
        try:
            order = int(fn[:-4].replace(filename, ""))
        except ValueError:
            continue
        numbered.append((order, fn))
    numbered.sort()

    pdf_writer = PdfFileWriter()
    for _, fn in numbered:
        pdf_reader = PdfFileReader(os.path.join(read_dirpath, fn))
        # Append page by page.
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))
    with open(save_filepath, "wb") as out:
        pdf_writer.write(out)


concat_pdf("易方达中小板指数证券投资基金(LOF)2020年中期报告", "./test_pdf/", "concat.pdf")


# --- Extract text content ---
def extract_pdf_text(pdf_path, page_order=-1):
    """
    Print the text of one page, or of every page, of a PDF.

    @param pdf_path: path of the PDF
    @param page_order: zero-based page index; -1 prints all pages
    @return: None
    """
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages if page_order == -1 else [pdf.pages[page_order]]
        for page in pages:
            print(page.extract_text())


extract_pdf_text("concat.pdf", 0)
# Output:
# 易方达中小板指数证券投资基金(LOF)2020年中期报告
# 易方达中小板指数证券投资基金(LOF)
# 2020 年中期报告
# 2020 年 6 月 30 日
# 基金管理人:易方达基金管理有限公司
# 基金托管人:中国建设银行股份有限公司
# 送出日期:二〇二〇年八月二十八日


def _read_pdf_text(pdf_path, page_order):
    """Return the text of page ``page_order`` (or of all pages when -1) as one string."""
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages if page_order == -1 else [pdf.pages[page_order]]
        # extract_text() may yield None for a page without text; treat it as "".
        return "".join(page.extract_text() or "" for page in pages)


# Store the PDF text in a txt file
def save_pdf_text_as_txt(pdf_path, txt_path, page_order=-1):
    """
    Write the text of a PDF to a text file.

    @param pdf_path: path of the PDF
    @param txt_path: path of the text file to write
    @param page_order: zero-based page index; -1 writes the text of all pages
    @return: None
    """
    text = _read_pdf_text(pdf_path, page_order)
    # Explicit encoding so the result does not depend on the platform default.
    with open(txt_path, "w", encoding="utf-8") as file:
        file.write(text)


save_pdf_text_as_txt("concat.pdf", "concat_1.txt", 0)


# Note: the layout/formatting of the page is lost in the extracted text.
def get_pdf_text(pdf_path, page_order=-1):
    """
    Return the text of a PDF as a single string.

    @param pdf_path: path of the PDF
    @param page_order: zero-based page index; -1 returns the text of all pages
    @return: the extracted text
    """
    return _read_pdf_text(pdf_path, page_order)


# save_path carries the file suffix, so the target type (txt or word) can be
# derived from it.
def save_pdf_text(str, save_path):
    """Placeholder: saving extracted text by target file type is not implemented yet."""


# --- Extract table content ---
def extract_pdf_table(pdf_path, page_order=0):
    """
    Export every table found on one page of a PDF to its own CSV file.

    Each table's first row is used as the header and the remaining rows as
    data. Files are written to the current directory as
    ``pdf_table_<page>_<table>.csv`` (both numbers one-based) and each file
    name is printed.

    @param pdf_path: path of the PDF
    @param page_order: zero-based page index
    @return: None
    """
    with pdfplumber.open(pdf_path) as pdf:
        table_info = pdf.pages[page_order].extract_tables()
    # TODO: could the tables be merged into a single file?
    for index, rows in enumerate(table_info, 1):
        if not rows:
            # No header row to build a DataFrame from.
            continue
        df_table = pd.DataFrame(rows[1:], columns=rows[0])
        file_name = "pdf_table_" + str(page_order + 1) + "_" + str(index) + ".csv"
        print(file_name)
        # gbk opens correctly in Excel; utf-8 output can be viewed in VS Code.
        df_table.to_csv(file_name, index=False, encoding="gbk")


# !pip3 install Pandas
# Requirement already satisfied: Pandas in /usr/local/lib/python3.9/site-packages (1.2.4)
# Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.9/site-packages (from Pandas) (2021.1)
# Requirement already satisfied: python-dateutil>=2.7.3 in /Users/lichizou/Library/Python/3.9/lib/python/site-packages (from Pandas) (2.8.1)
# Requirement already satisfied: numpy>=1.16.5 in /usr/local/lib/python3.9/site-packages (from Pandas) (1.20.3)
# Requirement already satisfied: six>=1.5 in /Users/lichizou/Library/Python/3.9/lib/python/site-packages (from python-dateutil>=2.7.3->Pandas) (1.16.0)

extract_pdf_table("易方达中小板指数证券投资基金(LOF)2020年中期报告.pdf", 6)
# Output: pdf_table7.csv
extract_pdf_table("易方达中小板指数证券投资基金(LOF)2020年中期报告.pdf", 7)  # page 8, a single table
# Output: pdf_table8.csv
# Page 16, several tables; the first one is incomplete, so only the second
# one is exported in full.
extract_pdf_table("易方达中小板指数证券投资基金(LOF)2020年中期报告.pdf", 15)
# Output: pdf_table_16_1.csv pdf_table_16_2.csv
# Page 15, part of the table continues on page 16. Checked whether it is
# exported completely: it is not, the table is truncated (known limitation).
extract_pdf_table("易方达中小板指数证券投资基金(LOF)2020年中期报告.pdf", 14)
# Output: pdf_table_15_1.csv pdf_table_15_2.csv
# Page 20, two complete tables.
extract_pdf_table("易方达中小板指数证券投资基金(LOF)2020年中期报告.pdf", 19)
# Output: pdf_table_20_1.csv pdf_table_20_2.csv

# --- Extract image content ---
# Uses the fitz module. Some function names differ between the latest
# release (1.18.13) and older releases.
# PyMuPDF has to be installed first: pip3 install PyMuPDF
官方文档:pymupdf.readthedocs.io/en/latest/
# Overall logic for extracting images:
#   1. open the document with fitz and read its object count
#   2. walk over every object and use regular expressions to find the
#      xref numbers that belong to images
#   3. build an image from each of those xrefs with Pixmap
#   4. optionally filter out small images via the pixmap's size
#
# !pip3 install PyMuPDF
# Collecting PyMuPDF Downloading PyMuPDF-1.18.14-cp39-cp39-macosx_10_9_x86_64.whl (5.6 MB) [K | | 5.6 MB 5.1 MB/s
# [?25hInstalling collected packages: PyMuPDF
# Successfully installed PyMuPDF-1.18.14
import os
import re

import fitz

pdf = fitz.open("易方达中小板指数证券投资基金(LOF)2020年中期报告.pdf")
xref_len = pdf.xref_length()
xref_len  # number of objects in the document
# Output: 17912
len(pdf)  # actual number of pages
# Output: 49

# Directory the images are exported to.
pic_dir = "pdf_export_img"
os.makedirs(pic_dir, exist_ok=True)

# Regular expressions used to recognise image objects.
check_XObject = r"/Type(?= */XObject)"
check_Image = r"/Subtype(?= */Image)"

# Walk over the PDF objects and export the images.
img_count = 0
for index in range(1, xref_len):
    text = pdf.xref_object(index)
    is_xobject = re.search(check_XObject, text)
    is_image = re.search(check_Image, text)
    # Require BOTH markers: with "or", any other kind of XObject would be
    # handed to fitz.Pixmap, which only accepts image xrefs.
    if not (is_xobject and is_image):
        continue
    img_count += 1
    # Build the image from the xref.
    pix = fitz.Pixmap(pdf, index)
    pic_filepath = os.path.join(pic_dir, "img_" + str(img_count) + ".png")
    # pix.size reflects the amount of pixel data; plain colour blocks have a
    # low value, so a threshold (e.g. 10000) could be used here to skip them.
    # Save as PNG; pixmaps with 5 or more components per pixel are converted
    # to RGB first.
    if pix.n >= 5:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    pix.writePNG(pic_filepath)
    print("导出图片:" + str(img_count))
# The whole PDF contains a single image.
# Output: 导出图片:1

# --- Save pages as images ---
将每一页转换为一张张图片
安装pdf2image, github:github.com/Belval/pdf2image
# Install: pip3 install pdf2image
# macOS users also need poppler: brew install poppler
#
# !pip3 install pdf2image
# Collecting pdf2image Downloading pdf2image-1.15.1-py3-none-any.whl (10 kB)
# Requirement already satisfied: pillow in /usr/local/lib/python3.9/site-packages (from pdf2image) (8.2.0)
# Installing collected packages: pdf2image
# Successfully installed pdf2image-1.15.1

# There are three ways to call pdf2image; this uses the one its
# documentation recommends as the better one.
import os
import shutil
import tempfile

import send2trash
from pdf2image import convert_from_path


# Visibly slow.
def pdf_saveas_img(pdf_path, img_dir):
    """
    Render every page of a PDF to a PNG file inside ``img_dir``.

    pdf2image first writes one ``.ppm`` file per page into ``img_dir``; each
    page is then saved next to it as ``<pdf name>-<NN>.png`` (page number
    one-based, zero-padded to two digits) and the ``.ppm`` files are moved to
    the trash.

    @param pdf_path: path of the PDF to convert
    @param img_dir: directory that receives the PNG files (created if missing)
    @return: the list of page images returned by convert_from_path
    """
    # For testing it can be handy to trash img_dir first:
    # send2trash.send2trash(img_dir)
    os.makedirs(img_dir, exist_ok=True)
    # Base name of the PDF without directory and extension.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    ppm = convert_from_path(pdf_path, output_folder=img_dir)
    # Save each page image as PNG.
    for index, pm in enumerate(ppm, 1):
        img_name = pdf_name + "-" + str(index).zfill(2) + ".png"
        pm.save(os.path.join(os.path.dirname(pm.filename), img_name))

    # Remove the intermediate .ppm files.
    for root, _subdirs, files in os.walk(img_dir):
        for f in files:
            if f.endswith(".ppm"):
                send2trash.send2trash(os.path.join(root, f))
    # NOTE(review): the returned images were read while being saved above, so
    # they are expected to stay usable after the .ppm files are gone - confirm.
    return ppm


pdf_path = "易方达中小板指数证券投资基金(LOF)2020年中期报告.pdf"
img_dir = "pdf_img"
# convert_from_path produces .ppm files rather than a common image format;
# their names follow a pattern but do not contain the original PDF name.
ppm = pdf_saveas_img(pdf_path, img_dir)
#help(ppm[0])
ppm[0].save("pdf_img.png")
type(ppm[0])
# Output: PIL.PpmImagePlugin.PpmImageFile
ppm[0].filename
# Output: "pdf_img/76b22d8c-b50f-4bfe-b5ef-f8b446d7d0ae-01.ppm"
pdf_path.split(".")[0]
# Output: "易方达中小板指数证券投资基金(LOF)2020年中期报告"
"pdf_img/76b22d8c-b50f-4bfe-b5ef-f8b446d7d0ae-01.ppm"[:-4]
# Output: "pdf_img/76b22d8c-b50f-4bfe-b5ef-f8b446d7d0ae-01"

# --- Add watermark ---
使用库: github.com/2Dou/waterm…
步骤:
1.获得一个带水印pdf文件
1.1在图片添加水印,图片插入到word,word保存为pdf
1.2python处理:参考
github.com/2Dou/waterm…
mp.weixin.qq.com/s/_oJA6lbsd…
2.将水印pdf文件合并到源目标pdf文件的每一页
获取水印图片
步骤1:生成一张透明的水印
步骤2:将水印添加到一张空白图上生成水印背景
步骤3:将水印背景粘贴到原图对应的位置上去
两种水印类型
固定位置水印:容易被去除
全屏水印:不容易被去除,但可能会影响阅读
土办法
法1,直接在word,插入文本框,旋转,设置无边框,注意设置背景透明,然后复制多个。导出为pdf。
# Approach 2: use Word's own watermark feature (Design > Watermark > custom
# text or picture). Drawback: only a single watermark is possible.
from PyPDF2 import PdfFileReader
from PyPDF2 import PdfFileWriter


# Add a watermark to a PDF
def add_pdf_watermask(mask_path, pdf_path, save_path, skip_pages=(0,)):
    """
    Stamp the first page of a watermark PDF onto the pages of another PDF.

    @param mask_path: path of the PDF whose first page is the watermark
    @param pdf_path: path of the PDF to watermark
    @param save_path: path of the watermarked PDF to write
    @param skip_pages: zero-based indexes of pages that are copied without a
                       watermark; by default only the first page is skipped
    @return: None
    """
    watermask_page = PdfFileReader(mask_path).getPage(0)
    pdf_reader = PdfFileReader(pdf_path)
    pdf_writer = PdfFileWriter()
    # Merge each page with the watermark page and add it to the new PDF.
    for page_index in range(pdf_reader.getNumPages()):
        current_page = pdf_reader.getPage(page_index)
        if page_index not in skip_pages:
            current_page.mergePage(watermask_page)
            current_page.compressContentStreams()  # compress the merged content
        pdf_writer.addPage(current_page)
    # Save the new PDF.
    with open(save_path, "wb") as out:
        pdf_writer.write(out)


add_pdf_watermask("chizou.pdf", "concat.pdf", "concat_watermarker.pdf")
缺点是水印在内容pdf上方,如果设置过大,颜色不太透明,会遮挡内容。
文档加密与解密
# Decryption here is not cracking: the file is decrypted with a password
# that is already known.
from PyPDF2 import PdfFileReader
from PyPDF2 import PdfFileWriter

PASSWORD = "masaikemasaike"


def _copy_all_pages(pdf_reader):
    """Return a new PdfFileWriter holding every page of ``pdf_reader``."""
    pdf_writer = PdfFileWriter()
    for page_index in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page_index))
    return pdf_writer


# Encrypt
pdf_reader = PdfFileReader("concat.pdf")
pdf_writer = _copy_all_pages(pdf_reader)
pdf_writer.encrypt(PASSWORD)  # add all pages first, encrypt last
with open("concat_encrypt.pdf", "wb") as out:  # must be "wb": binary mode
    pdf_writer.write(out)

# Decrypt: decrypt concat_encrypt.pdf and save it under a new name
pdf_reader = PdfFileReader("concat_encrypt.pdf")
# decrypt() unlocks the whole document at once; it reports a wrong password
# through its return value (0) instead of raising, so check it explicitly.
if pdf_reader.decrypt(PASSWORD) == 0:
    raise ValueError("wrong password for concat_encrypt.pdf")
pdf_writer = _copy_all_pages(pdf_reader)
with open("concat_decrypt.pdf", "wb") as out:
    pdf_writer.write(out)
作者:秦与商
链接:https://juejin.cn/post/7069582991982329893