Time sensitive! Need help extracting frames from an MP4 video in Code Repositories — please

The short version is that I am getting an error telling me my output should have should_snapshot=True. Here is the code in question:

import io
import os
import tempfile

import cv2
import pandas as pd
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

from transforms.api import transform, Input, Output, lightweight
from transforms.external.systems import use_external_systems, external_systems, Source, EgressPolicy, ExportControl
from transforms.mediasets import MediaSetInput, MediaSetOutput


@lightweight(gpu_type='NVIDIA_T4')
@external_systems(
    jrp_external_connection=Source("ri.magritte..source.7f4aad1c-f213-43b5-a266-d5f8747ed7af")
)
@transform(
    # MediaSetInput not Input
    video_input=MediaSetInput("ri.mio.main.media-set.52287ebd-f5d2-4c90-91ad-6d45146b1fdf"),
    # NOTE(review): snapshotting is only supported on transactional media sets;
    # if this output media set is transactionless, drop should_snapshot=True — confirm
    # against the media set's advanced settings.
    output=MediaSetOutput("ri.mio.main.media-set.8d7d566a-14a3-43af-aaad-fc812bb5e5c2", should_snapshot=True),
)
def compute(ctx, jrp_external_connection, video_input, output):
    """Sample every 30th frame of the input video, caption each sampled frame
    with BLIP, and write each frame as a JPEG into the output media set.

    Args:
        ctx: Transform context (unused here).
        jrp_external_connection: External system source (egress for the
            Hugging Face model download).
        video_input: Media set containing the source video.
        output: Media set that receives the extracted JPEG frames.
    """
    # Load the BLIP processor and model once, outside the frame loop.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # get_media_item() returns a file-like stream, but cv2.VideoCapture
    # requires a real filesystem path — spool the stream to a temp file first.
    video_stream = video_input.get_media_item("ri.mio.main.media-item.0198aa08-e56f-7157-a066-c92235051fcd")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
        temp_video.write(video_stream.read())
        temp_video_path = temp_video.name

    cap = cv2.VideoCapture(temp_video_path)
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Sample every 30th frame (adjust the stride as needed).
        for i in range(0, frame_count, 30):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                # Seek or decode failed for this index; skip the frame.
                continue
            # OpenCV decodes frames as BGR; convert to RGB for PIL/BLIP.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            inputs = processor(image, return_tensors="pt").to(device)
            out = model.generate(**inputs)
            # TODO: captions are currently unused; they belong in a separate
            # dataset output (media sets hold media items, not rows).
            caption = processor.decode(out[0], skip_special_tokens=True)
            # Encode the frame as JPEG in memory and add it to the media set.
            img_bytes = io.BytesIO()
            image.save(img_bytes, format='JPEG')
            img_bytes.seek(0)
            asset_name = f"astronaut_vid_frame_{i}.jpg"
            output.put_media_item(img_bytes.read(), asset_name)
    finally:
        # Always release the capture and remove the temp file, even on error.
        cap.release()
        if os.path.exists(temp_video_path):
            os.unlink(temp_video_path)

I could not find an “extract frames from mp4” transform in Pipeline Builder, so here I am trying to do it in Code Repositories. I’ve ironed out all the complaints about libraries/dependencies. The transforms library (recommended by AI Assist) was also trying to reach out to Hugging Face, so I made an egress policy, and the network connection error seems to have gone away. Here’s the actual complaint in Builder about 6 minutes into the job:

Job failed with status 1:
Traceback (most recent call last):
  File "/foundry/python_environment/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/foundry/python_environment/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/foundry/user_code/myproject/datasets/examples.py", line 25, in <module>
    def compute(ctx, jrp_external_connection, video_input, output):
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_decorators.py", line 133, in _lightweight
    return ContainerTransform(
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_transform.py", line 112, in __init__
    self.__name__ = transform.__name__
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_transform.py", line 158, in __name__
    self.compute()
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_transform.py", line 205, in compute
    self._compute()
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_transform.py", line 275, in _compute
    inputs, outputs = self._params.instantiate_params(self._incremental)
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_configuration.py", line 200, in instantiate_params
    return ParamInstantiator(self, incremental).instantiate()
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_param_instantiator.py", line 49, in instantiate
    outputs = {
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_param_instantiator.py", line 52, in <dictcomp>
    for param_name, output_obj in [self._create_output_obj(output_param)]
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_param_instantiator.py", line 79, in _create_output_obj
    return param_name, self._create_v2_incremental_output_obj(param_name, output_param)
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/api/_lightweight/_param_instantiator.py", line 120, in _create_v2_incremental_output_obj
    return instance.get_non_incremental()
  File "/foundry/python_environment/lib/python3.9/site-packages/transforms/mediasets/outputs/_output_operations.py", line 92, in get_non_incremental
    raise ValueError(
ValueError: Media set output should be snapshotted, but is not configured to do so. Resolved by setting "should_snapshot=True"

What I don’t understand is, as you can see, I’ve added should_snapshot=True to my MediaSetOutput, so why does it think it’s not configured for that?

Also if anyone has any experience doing this, does what I’m doing make sense? any help would be appreciated this is very time sensitive and I don’t want to abandon it.

PS: I realize I’m not doing anything with the caption. I realized that it can’t go into the media set — the caption will have to go into a dataset output instead — but I’m trying to get the frames first.

Is your mediaset configured to be Transactional or Transactionless?

If you have set should_snapshot=True but still receive the error, it means your output media set (ri.mio.main.media-set.8d7d566a-14a3-43af-aaad-fc812bb5e5c2) is configured as transactionless. Transactionless media sets do not support snapshotting, and attempting to enable it will always result in this error.

How to resolve:

If you need transactional guarantees (atomic writes, rollback on failure), you must create a new transactional media set and use it as your output.

If you do not need transactional guarantees, set should_snapshot=False in your MediaSetOutput configuration.

To check if a media set in Foundry is transactional or transactionless, follow these steps:

Navigate to the Media Set: In Foundry, go to the Data Catalog or the location where your media set is stored.

Open the Media Set Details: Click on the media set to open its details page.

Look for Advanced Settings or Configuration: There should be a section labeled “Advanced Settings,” “Configuration,” or similar. Here, you can find the transaction policy listed as either “transactional” or “transactionless.”

If you see options or information about transactions, snapshotting, or rollback, the media set is likely transactional.

If you see that items are immediately visible upon upload and there is no mention of transactions or snapshotting, it is likely transactionless.

See “Advanced media set settings” in the documentation for more information on transactional vs. transactionless mediasets.

Hope this helps!

Yeah, I got the same error. It’s telling me to set it to True. Is there something about my media type that implies it in fact has to be transactional/set to True? Maybe the error is indirectly saying it should be transactional? (I’m attempting to add images/frames to the media set.)

Thank you for the response, I really appreciate it. So I checked the output media set and it says transactionless. I guess I’m confused, then, by the error telling me it needs to be set to True, when I originally passed no parameter for that (it was the default, I suppose). But if it was transactionless, why did the error say I must set it to True? I did set it to True, tried again, and still got the should_snapshot=True complaint.

For now I am re-running the build and it says should_snapshot=False since i’ve confirmed its transactionless

1 Like

Hey, I saw you responded/reacted to my post. Just to be clear, I’m still getting the error, if you have any other ideas.

I’ve been able to implement what you’re trying to do (minus the blip-image-captioning model to save time) but I ran into a problem that may have also affected you.

Here’s my code:

from transforms.mediasets.inputs import LightweightMediaSetInputParam
from transforms.api import lightweight, transform 
from transforms.mediasets import MediaSetInput, MediaSetOutput
import cv2
from PIL import Image
import io
import tempfile
import os


@lightweight
@transform(
    video_input=MediaSetInput("VIDEO_MEDIASET_RID"),
    image_output=MediaSetOutput("IMAGE_MEDIASET_RID"),
)
def compute(video_input: LightweightMediaSetInputParam, image_output):
    """Extract every 30th frame of the input video and write each frame
    as a JPEG media item into the output media set.

    Args:
        video_input: Media set containing the source video.
        image_output: Media set that receives the extracted JPEG frames.
    """
    video_stream = video_input.get_media_item("VIDEO_FILE_RID")

    # get_media_item() returns a file-like stream, but cv2.VideoCapture
    # requires a real filesystem path — spool the stream to a temp file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_video:
        temp_video.write(video_stream.read())
        temp_video_path = temp_video.name

    cap = None
    try:
        cap = cv2.VideoCapture(temp_video_path)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Sample every 30th frame (adjust the stride as needed).
        for i in range(0, frame_count, 30):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                # Seek or decode failed for this index; skip the frame.
                continue

            # OpenCV decodes frames as BGR; convert to RGB for PIL.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Encode the frame as JPEG in memory.
            img_bytes = io.BytesIO()
            image.save(img_bytes, format='JPEG')
            img_bytes.seek(0)

            # Add the frame to the output media set.
            asset_name = f"shark_vid_frame_{i}.jpg"
            image_output.put_media_item(img_bytes.read(), asset_name)

    finally:
        # Release the capture even if an error occurs mid-loop (previously it
        # was only released on the success path), then remove the temp file.
        if cap is not None:
            cap.release()
        if os.path.exists(temp_video_path):
            os.unlink(temp_video_path)

The problem I ran into is that the get_media_item() method returns a file-like stream object, but cv2.VideoCapture expects a path to an actual file on disk, so I had to first write out the video file to a temporary path so cv2.VideoCapture could open it.

Hope this helps!

Thank you, this is amazing — I like the temporary-path trick you did as well. I will give this a try. I am also going to throw out the BLIP stuff for now; the frames are more important for my project than the captions.

Hey, just following up — this worked for me as well. You’re right, I’m not sure how this would have been done without the temporary-file trick. This is the solution and I’m going to tag it as such.

again, thank you so much

1 Like

This topic was automatically closed 91 days after the last reply. New replies are no longer allowed.