Python transforms struct datatype

Do Python Transforms support writing a column with a struct datatype to a dataset? The docs do not explicitly mention Python Transforms/code repositories.

Hey Mitch,

You can write out a struct column to a dataset with transforms. Here’s a notional example using PySpark:

from pyspark.sql import types as T

from transforms.api import transform, Output


@transform(
    output=Output("ri.foundry.main.dataset...")
)
def compute(ctx, output):
    """Write a small department table whose ``person_info`` column is a struct.

    The nested struct is declared as its own ``StructType`` and then used as
    the data type of a field in the outer schema. Each row supplies the
    struct's values as a nested tuple, in the same order as the struct fields.

    Args:
        ctx: Transform context; only ``ctx.spark_session`` is used here.
        output: Output handle that the resulting DataFrame is written to.
    """
    # Inner struct: the fields packed into every ``person_info`` value.
    person_struct = T.StructType([
        T.StructField('name', T.StringType(), nullable=True),
        T.StructField('age', T.IntegerType(), nullable=True),
        T.StructField('height', T.DoubleType(), nullable=True),
    ])

    # Outer schema: ``id`` is the only non-nullable field; ``person_info``
    # reuses the struct type declared above.
    table_schema = T.StructType([
        T.StructField('id', T.StringType(), nullable=False),
        T.StructField('department', T.StringType(), nullable=True),
        T.StructField('person_info', person_struct, nullable=True),
    ])

    # One tuple per row; the third element is the nested struct value.
    rows = [
        ('001', 'Engineering', ('Alice', 28, 5.6)),
        ('002', 'Marketing', ('Bob', 35, 6.1)),
        ('003', 'HR', ('Charlie', 42, 5.9)),
        ('004', 'Engineering', ('David', 31, 5.8)),
    ]

    departments = ctx.spark_session.createDataFrame(rows, schema=table_schema)
    output.write_dataframe(departments)

Thanks for the quick response!

Would you happen to have an example of writing a struct column from a lightweight transform, using a library like Pandas?

Sure, here are examples for both Polars and Pandas in lightweight transforms. I’ve also included the construction of the struct column itself for the person info.

import polars as pl

from transforms.api import transform, Output, lightweight


@lightweight(
    cpu_cores=2,
    memory_gb=8
)
@transform(
    output=Output("ri.foundry.main.dataset...")
)
def compute(output):
    """Build a department table with a Polars struct column and write it out.

    The ``name``, ``age`` and ``height`` columns of the person data are
    collected into a single struct column named ``person_info``, which is then
    left-joined onto the department data by ``department_id``.

    Args:
        output: Output handle that the joined table is written to.
    """
    departments = pl.DataFrame({
        'department_id': ['001', '002', '003'],
        'department_name': ['Engineering', 'Marketing', 'HR'],
    })

    people = pl.DataFrame({
        'name': ['Charlie', 'Alice', 'Bob'],
        'age': [42, 28, 35],
        'height': [5.9, 5.6, 6.1],
        'department_id': ['001', '002', '003'],
    })

    # Collect the three person columns into one struct expression.
    person_info = pl.struct(
        pl.col('name'),
        pl.col('age'),
        pl.col('height'),
    ).alias('person_info')

    # Keep only the struct and the join key; the flat columns are dropped.
    people_structs = people.select(person_info, pl.col('department_id'))

    # Left join so every department row is kept.
    joined = departments.join(people_structs, on='department_id', how='left')

    output.write_table(joined)

import pandas as pd

from transforms.api import transform, Output, lightweight


@lightweight(
    cpu_cores=1,
    memory_gb=2
)
@transform(
    output=Output("ri.foundry.main.dataset...")
)
def compute(output):
    """Build a department table with a dict-valued column and write it out.

    For each person row, the ``name``, ``age`` and ``height`` values are packed
    into one dict stored in a ``person_info`` column. The flat columns are then
    dropped and the result is left-merged onto the department data by
    ``department_id``.

    Args:
        output: Output handle that the merged table is written to.
    """
    struct_fields = ['name', 'age', 'height']

    departments = pd.DataFrame({
        'department_id': ['001', '002', '003'],
        'department_name': ['Engineering', 'Marketing', 'HR'],
    })

    people = pd.DataFrame({
        'name': ['Charlie', 'Alice', 'Bob'],
        'age': [42, 28, 35],
        'height': [5.9, 5.6, 6.1],
        'department_id': ['001', '002', '003'],
    })

    def pack_person(row):
        # One dict per row, keyed by field name in declaration order.
        return {field: row[field] for field in struct_fields}

    people['person_info'] = people.apply(pack_person, axis=1)

    # Only the join key and the packed column are carried into the merge.
    people = people.drop(columns=struct_fields)

    # Left merge so every department row is kept.
    merged = departments.merge(people, on='department_id', how='left')

    output.write_table(merged)

Both produce the same output, with the person_info column stored as a struct datatype.

department_id department_name person_info
001 Engineering {age:42,height:5.9,name:Charlie}
002 Marketing {age:28,height:5.6,name:Alice}
003 HR {age:35,height:6.1,name:Bob}
2 Likes

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.