Python transforms struct datatype

mitchp · February 18, 2025, 7:40pm

Do Python Transforms support writing to a dataset column with a struct datatype? The docs do not explicitly mention Python Transforms or code repositories.

gbowman · February 18, 2025, 8:42pm

Hey Mitch,

You can write out a struct column to a dataset with transforms. Here’s a notional example using PySpark:

from pyspark.sql import types as T

from transforms.api import transform, Output


@transform(
    output=Output("ri.foundry.main.dataset...")
)
def compute(ctx, output):
    """Write a small department dataset whose ``person_info`` column is a struct.

    The rows are built in memory, turned into a Spark DataFrame with an
    explicit schema, and written to ``output``.
    """
    # Inner struct: the fields that make up the nested ``person_info`` value.
    person_fields = [
        T.StructField('name', T.StringType(), True),
        T.StructField('age', T.IntegerType(), True),
        T.StructField('height', T.DoubleType(), True),
    ]
    person_struct = T.StructType(person_fields)

    # Outer row schema; ``id`` is the only non-nullable field.
    row_fields = [
        T.StructField('id', T.StringType(), False),
        T.StructField('department', T.StringType(), True),
        T.StructField('person_info', person_struct, True),
    ]
    row_schema = T.StructType(row_fields)

    # The struct value of each row is given as a tuple, in the same order as
    # the inner struct's fields (name, age, height).
    rows = [
        ('001', 'Engineering', ('Alice', 28, 5.6)),
        ('002', 'Marketing', ('Bob', 35, 6.1)),
        ('003', 'HR', ('Charlie', 42, 5.9)),
        ('004', 'Engineering', ('David', 31, 5.8)),
    ]

    struct_df = ctx.spark_session.createDataFrame(rows, schema=row_schema)
    output.write_dataframe(struct_df)

rarifin · February 19, 2025, 2:44pm

Thanks for the quick response!

Would you happen to have an example of writing struct columns from a lightweight transform, using a library like Pandas?

gbowman · February 19, 2025, 8:16pm

Sure, here are examples for both Polars and Pandas in lightweight transforms. I’ve also included the construction of the struct column itself for the person info.

import polars as pl

from transforms.api import transform, Output, lightweight


@lightweight(
    cpu_cores=2,
    memory_gb=8
)
@transform(
    output=Output("ri.foundry.main.dataset...")
)
def compute(output):
    """Write departments joined with a ``person_info`` struct column (Polars).

    The name, age and height columns of the person frame are packed into one
    struct column, which is then left-joined onto the department frame by
    ``department_id`` and written to ``output``.
    """
    departments = pl.DataFrame({
        'department_id': ['001', '002', '003'],
        'department_name': ['Engineering', 'Marketing', 'HR'],
    })

    people = pl.DataFrame({
        'name': ['Charlie', 'Alice', 'Bob'],
        'age': [42, 28, 35],
        'height': [5.9, 5.6, 6.1],
        'department_id': ['001', '002', '003'],
    })

    # Pack the three person attributes into one struct expression.
    person_info = pl.struct(
        pl.col('name'),
        pl.col('age'),
        pl.col('height'),
    ).alias('person_info')

    # Keep only the struct and the join key.
    people_structs = people.select(person_info, pl.col('department_id'))

    joined = departments.join(people_structs, on='department_id', how='left')
    output.write_table(joined)

import pandas as pd

from transforms.api import transform, Output, lightweight


@lightweight(
    cpu_cores=1,
    memory_gb=2
)
@transform(
    output=Output("ri.foundry.main.dataset...")
)
def compute(output):
    """Write departments joined with a ``person_info`` dict column (Pandas).

    Each person row is collapsed into a dict with ``name``, ``age`` and
    ``height`` keys; the flat columns are dropped, and the result is
    left-merged onto the department frame by ``department_id`` and written
    to ``output``.
    """
    departments = pd.DataFrame.from_dict({
        'department_id': ['001', '002', '003'],
        'department_name': ['Engineering', 'Marketing', 'HR'],
    })

    people = pd.DataFrame.from_dict({
        'name': ['Charlie', 'Alice', 'Bob'],
        'age': [42, 28, 35],
        'height': [5.9, 5.6, 6.1],
        'department_id': ['001', '002', '003'],
    })

    def _to_person_info(row):
        # One dict per row; this is the value stored in the person_info column.
        return {'name': row['name'], 'age': row['age'], 'height': row['height']}

    people['person_info'] = people.apply(_to_person_info, axis=1)

    # The flat attributes now live inside person_info, so drop them.
    people = people.drop(columns=['name', 'age', 'height'])

    joined = departments.merge(people, on='department_id', how='left')
    output.write_table(joined)

Both have the same output with the person_info column being a struct datatype.

department_id	department_name	person_info
001	Engineering	{age:42,height:5.9,name:Charlie}
002	Marketing	{age:28,height:5.6,name:Alice}
003	HR	{age:35,height:6.1,name:Bob}

system · March 5, 2025, 8:16pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.