0% found this document useful (0 votes)
26 views2 pages

PDF Manipulation Using Python

Uploaded by

alin76us
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
0% found this document useful (0 votes)
26 views2 pages

PDF Manipulation Using Python

Uploaded by

alin76us
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
You are on page 1/2

PDF Manipulation using Python - fitz

pip install PyMuPDF

1. Extract Text from a PDF

import fitz
def extract_text(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file; handed to ``fitz.open``.

    Returns:
        The ``get_text()`` output of each page, concatenated in page
        order. A document without pages yields an empty string.
    """
    # Open the document as a context manager so it is closed once the
    # text has been collected, even if reading a page raises.
    with fitz.open(pdf_path) as doc:
        # join() assembles the result in one pass instead of growing a
        # string with += on every page.
        return "".join(page.get_text() for page in doc)

# Example usage: read all text from "clcoding.pdf" and print it.
pdf_path = "clcoding.pdf"
text = extract_text(pdf_path)
print(text)

Hello World!

2. Extract Images from a PDF

import fitz
import os

def extract_images(pdf_path, output_dir):
    """Write every image embedded in a PDF to ``output_dir``.

    Each image is saved as ``image_<page>_<xref>.<ext>``, where ``<page>``
    is the 1-based page number, ``<xref>`` is the first element of the
    entry returned by ``page.get_images(full=True)``, and ``<ext>`` is the
    ``"ext"`` value reported by ``doc.extract_image``.

    Args:
        pdf_path: Path of the PDF file; handed to ``fitz.open``.
        output_dir: Directory that receives the image files. It is
            created if it does not exist yet.
    """
    # Make the function usable on its own: without this, the first
    # open(..., "wb") fails when the target folder is missing.
    os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the document when extraction finishes
    # or if any step raises.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for img in page.get_images(full=True):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_filename = os.path.join(
                    output_dir,
                    f"image_{page_number}_{xref}.{image_ext}",
                )
                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)

# Example usage: save the images found in "clcoding.pdf" under "images".
pdf_path = "clcoding.pdf"
output_dir = "images"
# Create the target folder first; exist_ok=True keeps re-runs from failing
# when the folder is already there.
os.makedirs(output_dir, exist_ok=True)
extract_images(pdf_path, output_dir)

3. Merge Multiple PDFs into One

import fitz

def merge_pdfs(pdf_list, output_pdf):
    """Concatenate several PDFs into a single file.

    Args:
        pdf_list: Iterable of PDF paths; their pages are appended in the
            order given.
        output_pdf: Path the merged document is saved to.
    """
    # fitz.open() with no argument creates a new, empty document. Both it
    # and every source document are context-managed so they are closed
    # even when an insert or the final save raises.
    # NOTE(review): with an empty pdf_list the merged document has no
    # pages when save() is called - confirm how PyMuPDF handles that.
    with fitz.open() as merged_doc:
        for pdf in pdf_list:
            with fitz.open(pdf) as doc:
                merged_doc.insert_pdf(doc)
        merged_doc.save(output_pdf)

# Example usage: merge two PDFs, in list order, into "clcodingmerged.pdf".
pdf_list = ["clcoding.pdf", "clcodingpdf.pdf"]


output_pdf = "clcodingmerged.pdf"
merge_pdfs(pdf_list, output_pdf)
4. Split a PDF into Individual Pages

import fitz
import os

def split_pdf(pdf_path, output_dir):
    """Save each page of a PDF as its own single-page PDF.

    Output files are named ``page_<n>.pdf`` with ``<n>`` starting at 1.

    Args:
        pdf_path: Path of the PDF file to split; handed to ``fitz.open``.
        output_dir: Directory that receives the per-page files. It is
            created if it does not exist yet.
    """
    # Make the function usable on its own when the folder is missing.
    os.makedirs(output_dir, exist_ok=True)

    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            # Close each one-page document as soon as it is saved so open
            # handles do not pile up on long inputs.
            with fitz.open() as new_doc:
                # from_page/to_page are 0-based here; the file name below
                # uses page_num + 1 for a 1-based number.
                new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
                output_filename = os.path.join(
                    output_dir, f"page_{page_num + 1}.pdf"
                )
                new_doc.save(output_filename)

# Example usage: split "clcodingpdf.pdf" into one file per page under
# "split_pages".
pdf_path = "clcodingpdf.pdf"
output_dir = "split_pages"
# Create the target folder first; exist_ok=True keeps re-runs from failing
# when the folder is already there.
os.makedirs(output_dir, exist_ok=True)
split_pdf(pdf_path, output_dir)

You might also like