Create Excel files in a transform for export or download

I have some data (writebacks of objects and/or datasets) that I want to let my users download as Excel files. Exports from Workshop are limited to 100k rows. How can I export more?

You can create a transform in your pipeline that produces a dataset containing files, in this case Excel files, holding the rows you want. These files can be arbitrarily big, as they are generated at pipeline execution time.

The downside of this method is that the user can't control at a fine-grained level what will be included in those Excel files, since they are generated in the background on a schedule you define (with predefined filters rather than user-chosen ones).

Note that an Excel sheet only supports up to 1,048,576 rows.
Note that depending on the number of columns and their content, files can reach multiple GB even for a few hundred thousand rows. If that's the case, you might want to split the data across multiple files, as sketched below.
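For illustration, here is a minimal sketch of such a split, assuming `fs` is the output dataset's filesystem and `df` a pandas DataFrame as in the transforms below; the helper name, file names, and chunk size are all arbitrary:

import pandas as pd

# Hypothetical helper: write `df` as several Excel files of at most
# `chunk_size` rows each (file names and chunk size are illustrative).
def write_in_chunks(fs, df, chunk_size=500_000):
    for part, start in enumerate(range(0, len(df), chunk_size)):
        chunk = df.iloc[start:start + chunk_size]
        with fs.open(f"export_part_{part}.xlsx", "wb") as f:
            with pd.ExcelWriter(f, engine='xlsxwriter') as writer:
                chunk.to_excel(writer, index=False)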

Most performant approach:

from transforms.api import transform, Input, Output, lightweight
import pandas as pd
import time
import logging

NB_ROWS = 1_000_000

# Example function that would generate a row per some input.
def generate_row(row_number):
    # TODO: Replace this with your own logic to generate a row based on some input (e.g. here the row_number)
    return [f"Column {i+1} Value {row_number}" for i in range(10)]  # Assuming 10 columns

# e.g. EXECUTION TIME - 0.05s for 10k rows
def generation_1(column_names):
    # Generate all rows using a list comprehension
    all_rows = [generate_row(i) for i in range(NB_ROWS)]

    # Create the DataFrame using the generated rows
    df = pd.DataFrame(all_rows, columns=column_names)
    return df

# ~30% faster than generate_with_openpyxl
# e.g. EXECUTION TIME - 27.46s for 100k rows
def generate_with_xlsxwriter(fs, df):
    with fs.open("xlsxwrite.xlsx", "wb") as f:
        with pd.ExcelWriter(f, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)

@lightweight()
@transform(
    out=Output("/path/excel_parsing__generation"),
    source_df=Input("/path/some_input"),
)
def compute(source_df, out):
    # Initialize an empty DataFrame with column names
    column_names = [f"Column {i+1}" for i in range(10)]  # Assuming 10 columns

    # Generate the data
    start_time = time.time()  # Record the start time
    df = generation_1(column_names)
    end_time = time.time()  # Record the end time
    execution_time = end_time - start_time  # Calculate the execution time
    logging.info(f"EXECUTION TIME - {execution_time}")

    fs = out.filesystem()

    # Save the data
    start_time = time.time()  # Record the start time
    generate_with_xlsxwriter(fs, df)
    end_time = time.time()  # Record the end time
    execution_time = end_time - start_time  # Calculate the execution time
    logging.info(f"EXECUTION TIME - {execution_time}")
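As a side note, for very large sheets xlsxwriter also offers a constant_memory mode that streams rows to temporary storage instead of holding the whole workbook in memory. A hedged variant of the writer above; the engine_kwargs form assumes a recent pandas version:

# Sketch: same writer as above, but with xlsxwriter's constant_memory
# option enabled. Assumes a pandas version whose ExcelWriter accepts
# engine_kwargs (older versions used an `options` argument instead).
def generate_with_xlsxwriter_streaming(fs, df):
    with fs.open("xlsxwrite_streaming.xlsx", "wb") as f:
        with pd.ExcelWriter(
            f,
            engine='xlsxwriter',
            engine_kwargs={'options': {'constant_memory': True}},
        ) as writer:
            df.to_excel(writer, index=False)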

Here is a complete code snippet that shows the performance of the different approaches:

from transforms.api import transform, Input, Output, lightweight
import pandas as pd
import time
import logging

'''
The code below generates an arbitrarily large Excel file and saves it to the output dataset of the transform.
Note: the input dataset is not "really" used (the data is regenerated from scratch), but it is kept to show how an input dataset can be read in order to populate the Excel file.
See https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
'''

NB_ROWS = 1_000_000

# Example function that would generate a row per some input.


def generate_row(row_number):
    # TODO: Replace this with your own logic to generate a row based on some input (e.g. here the row_number)
    return [f"Column {i+1} Value {row_number}" for i in range(10)]  # Assuming 10 columns


# e.g. EXECUTION TIME - 0.05s for 10k rows
def generation_1(column_names):
    # Generate all rows using a list comprehension
    all_rows = [generate_row(i) for i in range(NB_ROWS)]

    # Create the DataFrame using the generated rows
    df = pd.DataFrame(all_rows, columns=column_names)
    return df


# ~1000x slower than generation_1
# e.g. EXECUTION TIME - 22.72s for 10k rows
def generation_2(column_names):
    df = pd.DataFrame(columns=column_names)

    # Add rows to the DataFrame using the generate_row function
    for i in range(NB_ROWS):
        row_data = generate_row(i)
        df.loc[i] = row_data
    return df


# e.g. EXECUTION TIME - 37.40s for 100k rows
def generate_with_openpyxl(fs, df):
    with fs.open("openpyxl.xlsx", "wb") as f:
        df.to_excel(f, engine='openpyxl', index=False)


# ~30% faster than generate_with_openpyxl
# e.g. EXECUTION TIME - 27.46s for 100k rows
def generate_with_xlsxwriter(fs, df):
    with fs.open("xlsxwrite.xlsx", "wb") as f:
        with pd.ExcelWriter(f, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)


@lightweight()
@transform(
    out=Output("/path/excel_parsing__generation"),
    source_df=Input("/path/some_input"),
)
def compute(source_df, out):
    # Initialize an empty DataFrame with column names
    column_names = [f"Column {i+1}" for i in range(10)]  # Assuming 10 columns

    # Generate the data
    start_time = time.time()  # Record the start time
    df = generation_1(column_names)
    end_time = time.time()  # Record the end time
    execution_time = end_time - start_time  # Calculate the execution time
    logging.info(f"EXECUTION TIME - {execution_time}")

    # start_time = time.time()  # Record the start time
    # df = generation_2(column_names)
    # end_time = time.time()  # Record the end time
    # execution_time = end_time - start_time  # Calculate the execution time
    # logging.info(f"EXECUTION TIME - {execution_time}")

    fs = out.filesystem()

    # Save the data
    # start_time = time.time()  # Record the start time
    # generate_with_openpyxl(fs, df)
    # end_time = time.time()  # Record the end time
    # execution_time = end_time - start_time  # Calculate the execution time
    # logging.info(f"EXECUTION TIME - {execution_time}")

    # Save the data
    start_time = time.time()  # Record the start time
    generate_with_xlsxwriter(fs, df)
    end_time = time.time()  # Record the end time
    execution_time = end_time - start_time  # Calculate the execution time
    logging.info(f"EXECUTION TIME - {execution_time}")

'''
* ~500-1000x time reduction on the data-structure generation by using generation_1 (i.e. build the pandas DataFrame as late as possible, from a plain Python list)
* ~30% time reduction on the actual XLSX generation by using xlsxwriter
Combining both approaches: a 3m13s build for a 65 MB Excel file of 1M rows.
'''
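Finally, to populate the Excel file from the actual input dataset rather than generated data, the input can be loaded as a DataFrame first. A minimal sketch, assuming the lightweight API exposes a pandas accessor on the input (paths and names are illustrative):

from transforms.api import transform, Input, Output, lightweight
import pandas as pd

@lightweight()
@transform(
    out=Output("/path/excel_from_input"),  # hypothetical output path
    source_df=Input("/path/some_input"),
)
def compute_from_input(source_df, out):
    # Load the input dataset as a pandas DataFrame
    # (assumes the lightweight input exposes .pandas()).
    df = source_df.pandas()

    fs = out.filesystem()
    with fs.open("export.xlsx", "wb") as f:
        with pd.ExcelWriter(f, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)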