# PDF Manipulation using Python - fitz (PyMuPDF)
# Install the library first: pip install PyMuPDF
# 1. Extract Text from a PDF
import fitz
def extract_text(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, concatenated in page order.
    """
    # Open the document in a ``with`` block so the file handle is released
    # even if reading a page raises.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of repeated ``+=``.
        return "".join(page.get_text() for page in doc)
# Example: read the sample PDF and print all of the text it contains.
pdf_path = "clcoding.pdf"
text = extract_text(pdf_path)
print(text)
# Output: Hello World!
# 2. Extract Images from a PDF
import fitz
import os
def extract_images(pdf_path, output_dir):
    """Save every image embedded in a PDF as its own file.

    Each image is written to ``output_dir`` under the name
    ``image_<page>_<xref>.<ext>``, where ``<page>`` is the 1-based page
    number, ``<xref>`` is the image's cross-reference number and ``<ext>``
    is the extension reported by PyMuPDF for that image.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the image files. It is created
            if it does not exist yet.
    """
    # Make the function usable on its own; harmless when the caller has
    # already created the directory.
    os.makedirs(output_dir, exist_ok=True)

    # The ``with`` block guarantees the document is closed when done.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for img in page.get_images(full=True):
                # The first item of each image entry is its xref number.
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_filename = os.path.join(
                    output_dir, f"image_{page_num}_{xref}.{image_ext}"
                )
                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)
# Example: dump the images of the sample PDF into the "images" directory.
pdf_path = "clcoding.pdf"
output_dir = "images"
# Create the destination first; exist_ok=True keeps reruns from failing.
os.makedirs(output_dir, exist_ok=True)
extract_images(pdf_path, output_dir)
# 3. Merge Multiple PDFs into One
import fitz
def merge_pdfs(pdf_list, output_pdf):
    """Concatenate several PDFs into a single output file.

    The pages of each input are appended in the order the inputs appear in
    ``pdf_list``.

    Args:
        pdf_list: Iterable of paths of the PDF files to merge.
        output_pdf: Path of the merged PDF to write.
    """
    # NOTE(review): PyMuPDF is expected to refuse saving a document with no
    # pages, so an empty ``pdf_list`` likely raises at save() - confirm.
    # fitz.open() with no argument creates a new, empty document; the
    # ``with`` block closes it once the merged file has been saved.
    with fitz.open() as merged_doc:
        for pdf in pdf_list:
            with fitz.open(pdf) as doc:
                merged_doc.insert_pdf(doc)
        merged_doc.save(output_pdf)
# Example: combine the two sample PDFs into "clcodingmerged.pdf".
pdf_list = ["clcoding.pdf", "clcodingpdf.pdf"]
output_pdf = "clcodingmerged.pdf"
merge_pdfs(pdf_list, output_pdf)
# 4. Split a PDF into Individual Pages
import fitz
import os
def split_pdf(pdf_path, output_dir):
    """Write each page of a PDF to its own single-page PDF file.

    Page ``n`` (1-based) is saved as ``page_<n>.pdf`` inside ``output_dir``.

    Args:
        pdf_path: Path of the PDF file to split.
        output_dir: Directory that receives the per-page files. It is
            created if it does not exist yet.
    """
    # Make the function usable on its own; harmless when the caller has
    # already created the directory.
    os.makedirs(output_dir, exist_ok=True)

    # Both the source and every per-page document are closed by their
    # ``with`` blocks, so no handles pile up on large inputs.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            output_filename = os.path.join(
                output_dir, f"page_{page_num + 1}.pdf"
            )
            with fitz.open() as new_doc:
                # from_page == to_page copies exactly one page.
                new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
                new_doc.save(output_filename)
# Example: split the sample PDF into one file per page under "split_pages".
pdf_path = "clcodingpdf.pdf"
output_dir = "split_pages"
# Create the destination first; exist_ok=True keeps reruns from failing.
os.makedirs(output_dir, exist_ok=True)
split_pdf(pdf_path, output_dir)