Do Python Transforms support writing a column with a struct datatype to a
dataset? The docs do not explicitly mention Python Transforms or code repositories.
Hey Mitch,
You can write out a struct column to a dataset with transforms. Here’s a notional example using PySpark:
from pyspark.sql import types as T
from transforms.api import transform, Output
@transform(
    output=Output("ri.foundry.main.dataset..."),
)
def compute(ctx, output):
    """Write a small hard-coded department table that contains a struct column.

    The ``person_info`` column is declared as a nested ``StructType`` inside
    the table schema, and each row supplies its value as a plain tuple.

    Args:
        ctx: Transform context; only ``ctx.spark_session`` is used here.
        output: Transform output that receives the resulting DataFrame.
    """
    # Schema of the nested struct; every field is nullable.
    person_fields = [
        T.StructField('name', T.StringType(), True),
        T.StructField('age', T.IntegerType(), True),
        T.StructField('height', T.DoubleType(), True),
    ]
    person_struct = T.StructType(person_fields)

    # Top-level schema: 'id' is the only non-nullable column, and
    # 'person_info' embeds the struct defined above.
    table_schema = T.StructType([
        T.StructField('id', T.StringType(), False),
        T.StructField('department', T.StringType(), True),
        T.StructField('person_info', person_struct, True),
    ])

    # The inner tuple of each row lines up positionally with person_fields.
    rows = [
        ('001', 'Engineering', ('Alice', 28, 5.6)),
        ('002', 'Marketing', ('Bob', 35, 6.1)),
        ('003', 'HR', ('Charlie', 42, 5.9)),
        ('004', 'Engineering', ('David', 31, 5.8)),
    ]

    frame = ctx.spark_session.createDataFrame(rows, schema=table_schema)
    output.write_dataframe(frame)
Thanks for the quick response!
Would you happen to have an example of writing struct columns from a lightweight transform, using a library like Pandas?
Sure, here are examples for both Polars and Pandas in lightweight transforms. I’ve also included the construction of the struct column itself for the person info.
import polars as pl
from transforms.api import transform, Output, lightweight
@lightweight(
    cpu_cores=2,
    memory_gb=8,
)
@transform(
    output=Output("ri.foundry.main.dataset..."),
)
def compute(output):
    """Build a department table with a Polars struct column and write it out.

    The person attributes (name, age, height) are packed into a single
    ``person_info`` struct column, which is then left-joined onto the
    departments by ``department_id``.

    Args:
        output: Transform output that receives the joined Polars DataFrame.
    """
    departments = pl.DataFrame({
        'department_id': ['001', '002', '003'],
        'department_name': ['Engineering', 'Marketing', 'HR'],
    })
    people = pl.DataFrame({
        'name': ['Charlie', 'Alice', 'Bob'],
        'age': [42, 28, 35],
        'height': [5.9, 5.6, 6.1],
        'department_id': ['001', '002', '003'],
    })

    # Collapse the three person columns into one struct expression.
    person_info = pl.struct(
        pl.col('name'),
        pl.col('age'),
        pl.col('height'),
    ).alias('person_info')

    # Keep only the struct and the join key on the person side.
    people_structs = people.select(person_info, pl.col('department_id'))

    # Left join so every department row is kept.
    joined = departments.join(people_structs, on='department_id', how='left')
    output.write_table(joined)
import pandas as pd
from transforms.api import transform, Output, lightweight
@lightweight(
    cpu_cores=1,
    memory_gb=2,
)
@transform(
    output=Output("ri.foundry.main.dataset..."),
)
def compute(output):
    """Build a department table with a dict-valued column using pandas.

    Each person's name, age and height are combined into one dict per row in
    a ``person_info`` column, which is then left-merged onto the departments
    by ``department_id`` and written to the output.

    Args:
        output: Transform output that receives the merged pandas DataFrame.
    """
    department_data = {
        'department_id': ['001', '002', '003'],
        'department_name': ['Engineering', 'Marketing', 'HR'],
    }
    person_data = {
        'name': ['Charlie', 'Alice', 'Bob'],
        'age': [42, 28, 35],
        'height': [5.9, 5.6, 6.1],
        'department_id': ['001', '002', '003'],
    }
    department_df = pd.DataFrame.from_dict(department_data)
    person_df = pd.DataFrame.from_dict(person_data)

    # Columns folded into the per-row dict, in the order the keys appear.
    struct_fields = ['name', 'age', 'height']

    # to_dict(orient='records') yields one {'name', 'age', 'height'} dict per
    # row without a Python-level row-wise apply, and also copes with an empty
    # frame (it simply returns an empty list).
    person_df['person_info'] = person_df[struct_fields].to_dict(orient='records')

    # The individual columns now live inside person_info; keep only the key.
    person_df = person_df.drop(columns=struct_fields)

    # Left merge so every department row is kept.
    department_df = department_df.merge(
        person_df,
        on='department_id',
        how='left',
    )
    output.write_table(department_df)
Both produce the same output, with the person_info column stored as a struct datatype.
department_id | department_name | person_info |
---|---|---|
001 | Engineering | {age:42,height:5.9,name:Charlie} |
002 | Marketing | {age:28,height:5.6,name:Alice} |
003 | HR | {age:35,height:6.1,name:Bob} |
2 Likes
This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.