Calling the `transform()` method on the media set input yields the following error in preview:
A PreviewMediaSetInput object does not have an attribute transform. Please check the spelling and/or the datatype of the object.
It builds just fine though. Full code below with RIDs replaced.
from transforms.api import transform, Output, TransformOutput
from transforms.mediasets import MediaSetInput, MediaSetInputParam
@transform(
    media_input=MediaSetInput("<My_RID>"),
    output=Output("<My_RID>"),
)
def compute(ctx, media_input: MediaSetInputParam, output: TransformOutput):
    """
    Run OCR over the PDFs in the input media set and write one row per page.

    Args:
        ctx: Transform context supplied by the framework (not used here).
        media_input: The media set input to run OCR against.
        output: The output dataset that receives the OCR results.

    Expected output columns (per the OCR "page_per_row" structure):
    - media_item_rid: RID of the media item (document)
    - path: Path of the media item in the media set
    - media_reference: Reference to the media item
    - page_number: The page number (zero-indexed)
    - extracted_text: Text extracted from the page via OCR
    """
    # Typeclass metadata so that the "media_reference" column is still
    # recognised as a media reference in the written dataset.
    reference_typeclasses = {
        "media_reference": [{"kind": "reference", "name": "media_reference"}],
    }

    # NOTE: this call is reported to fail in preview with
    # "A PreviewMediaSetInput object does not have an attribute transform",
    # while a full build succeeds.
    pages_df = media_input.transform().ocr(
        return_structure="page_per_row",  # one row per page
        languages=["eng"],  # OCR in English
        suppress_errors=True,  # keep going when individual pages fail
    )

    output.write_dataframe(pages_df, column_typeclasses=reference_typeclasses)