I would like to upload a file to my code repository (a picture, some reference data, etc.) and use it in the lightweight transforms defined by my code repository.
There is a documentation page for Spark based transforms already: https://www.palantir.com/docs/foundry/transforms-python-spark/read-files-repository
How can I do this?
Note: I also understand this is an anti-pattern, as data - even pictures or content - should be uploaded to datasets/mediasets and then be consumed in the transforms via the standard means (reading from the datasets/mediaset, etc.)
# ...
# Resolve the file relative to this module so the lookup does not depend on
# the working directory the transform happens to run in.
script_dir = os.path.dirname(os.path.abspath(__file__))
json_file_path = os.path.join(script_dir, "example_file.json")
# Read the JSON file. Pin the encoding: open() without encoding= falls back
# to the platform/locale default, which is not guaranteed to be UTF-8.
with open(json_file_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)
# ...
And here is the full code of the transform:
# import polars as pl
import json
import os
import polars as pl
from transforms.api import Input, Output, transform, LightweightInput, LightweightOutput
@transform.using(
    output_dataset=Output(
        "/path/to/json_output_dataset"
    ),
)
def compute(output_dataset: LightweightOutput) -> None:
    """Read ``example_file.json`` shipped in the code repository next to this
    module and write its contents to the output dataset as a table.

    NOTE(review): reading bundled repository files from a transform is an
    anti-pattern — data should normally live in a dataset/mediaset and be
    consumed through a transform ``Input``.
    """
    # Resolve the file relative to this module so the lookup does not depend
    # on the working directory the transform happens to run in.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    json_file_path = os.path.join(script_dir, "example_file.json")

    # Pin the encoding: open() without encoding= falls back to the
    # platform/locale default, which is not guaranteed to be UTF-8.
    with open(json_file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    print(f"JSON data loaded: {json_data}")

    # Write the DataFrame to the output dataset.
    output_dataset.write_table(_json_to_dataframe(json_data))


def _json_to_dataframe(json_data):
    """Convert arbitrary parsed JSON into a polars DataFrame.

    A top-level JSON object becomes a two-column key/value table (values are
    stringified so mixed value types fit one column); any other JSON value
    (list, scalar, ...) is wrapped into a single-row, single-column table
    holding its re-serialized form.
    """
    if isinstance(json_data, dict):
        return pl.DataFrame(
            {
                "key": list(json_data.keys()),
                "value": [str(v) for v in json_data.values()],
            }
        )
    return pl.DataFrame({"json_content": [json.dumps(json_data)]})
This writes the content of the JSON file as a table into the output dataset: