My goal is to extract specific page ranges from PDF documents stored in a source MediaSet, split them into separate PDF files, and save each extracted PDF into a new destination MediaSet.
I have a dataset (`certificate_data`) that contains:
- `pdf_media_item_rid`: the RID of the source PDF
- `same_certificate_pages`: an array of the page numbers that belong to one certificate (e.g., [175] or [165, 166, 167])
Approach 1: Using Palantir’s built-in slice_document()
Issue 1: I get `Invalid page range` errors in some cases.
Issue 2: Multiple writes to the same MediaSet seem to overwrite each other. Each `split_pdf_mediaset.write(transform)` call appears to replace the previous output rather than add to it, so only the last processed PDF is saved.
def compute(ctx, source_pdf_mediaset, certificate_data, split_pdf_mediaset):
    """Slice each certificate's page range out of its source PDF and write it.

    One ``slice_document`` transform is built per row of ``certificate_data``
    and handed to ``split_pdf_mediaset.write``.

    Args:
        ctx: Transform context; not used by this function.
        source_pdf_mediaset: Media set input that holds the source PDFs.
        certificate_data: Input whose ``dataframe()`` rows carry
            ``pdf_media_item_rid`` and ``same_certificate_pages``.
        split_pdf_mediaset: Media set output that receives the slices.
    """
    records = certificate_data.dataframe().collect()
    for record in records:
        source_rid = str(record['pdf_media_item_rid'])
        pages = record['same_certificate_pages']

        # Page numbers in the data are 1-indexed; the start is passed
        # 0-indexed and the end is passed as an exclusive bound, so the last
        # 1-indexed page number is used unchanged.
        # NOTE(review): assumes slice_document takes (0-indexed start,
        # exclusive end) and that ``pages`` is sorted ascending -- confirm.
        start_index = int(pages[0]) - 1
        end_index = int(pages[-1])

        sliced = source_pdf_mediaset.transform().slice_document(
            start_index,
            end_index,
            media_item_rid=source_rid,
        )
        # NOTE(review): write() is called once per row; it reportedly keeps
        # only the last result -- confirm the write semantics of the output.
        split_pdf_mediaset.write(sliced)
Approach 2: Using PyPDF2
Issue: This approach works, but is extremely slow. Processing just 95 rows took over 1.5 hours.
def _row_value(row, name, default='unknown'):
    """Return ``row[name]``, or ``default`` when the row has no such field."""
    try:
        return row[name]
    except (KeyError, ValueError):
        # Mappings raise KeyError for a missing key; pyspark Row raises
        # ValueError for a missing field name.
        return default


def compute(ctx, source_pdf_mediaset, certificate_data, split_pdf_mediaset):
    """Split certificates out of the source PDFs and upload each as its own PDF.

    Rows are grouped by source PDF so that every source document is downloaded
    and parsed once, rather than once per certificate. All PDF data is kept in
    in-memory buffers, so no temporary files are created or left behind.

    Args:
        ctx: Transform context; not used by this function.
        source_pdf_mediaset: Media set input that holds the source PDFs.
        certificate_data: Input whose ``dataframe()`` rows carry
            ``pdf_media_item_rid`` and ``same_certificate_pages`` (1-indexed
            page numbers) and optionally ``material_heat_number`` and
            ``certification_number``.
        split_pdf_mediaset: Media set output that receives one PDF per row.

    Raises:
        IndexError: If a row's page list is empty or names a page that the
            source PDF does not have.
    """
    import io

    # Group rows by source RID. A dict keeps first-seen order, so sources are
    # processed in the order they first appear in the data.
    rows_by_source = {}
    for row in certificate_data.dataframe().collect():
        source_rid = str(row['pdf_media_item_rid'])
        rows_by_source.setdefault(source_rid, []).append(row)

    for pdf_media_item_rid, rows in rows_by_source.items():
        # Fetch and parse the source once for all certificates cut from it.
        with source_pdf_mediaset.get_media_item(pdf_media_item_rid) as pdf_stream:
            source_buffer = io.BytesIO(pdf_stream.read())
        reader = PdfReader(source_buffer)

        for row in rows:
            same_cert_pages = row['same_certificate_pages']
            # Collected pyspark rows have no dict-style .get(); use a lookup
            # that falls back to 'unknown' when the column is absent.
            material_heat_number = str(_row_value(row, 'material_heat_number'))
            certification_number = str(_row_value(row, 'certification_number'))

            # Convert the 1-indexed, inclusive page numbers to 0-indexed.
            certificate_first_page = int(same_cert_pages[0]) - 1
            certificate_last_page = int(same_cert_pages[-1]) - 1

            # Create meaningful filename (page numbers shown 1-indexed).
            output_filename = f"{material_heat_number}_{certification_number}_pages_{certificate_first_page+1}-{certificate_last_page+1}.pdf"

            writer = PdfWriter()
            for page_num in range(certificate_first_page, certificate_last_page + 1):
                writer.add_page(reader.pages[page_num])

            output_buffer = io.BytesIO()
            writer.write(output_buffer)
            # Rewind so the upload reads from the start of the buffer.
            output_buffer.seek(0)
            split_pdf_mediaset.put_media_item(output_buffer, output_filename)