I tried uploading a .csv file (22 GB) directly to Foundry through a manual drop and got an error saying that the file is too big.
What is the size limit? How can I upload this dataset to Foundry?
Manual uploads are limited to 1 GB per file.
If splitting the file locally is your only option, here is a Python script that splits a large CSV into parts while keeping the header row in every part:
import csv
import os

def get_file_size(file_path):
    """Returns the size of the file in MB."""
    return os.path.getsize(file_path) / (1024 * 1024.0)

def split_csv(input_file, max_size_mb=900):
    """Splits a CSV file into smaller files of specified maximum size."""
    part_number = 1
    current_size = 0
    output_file = None
    output_writer = None
    with open(input_file, mode='r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        headers = next(reader)
        global_count = 0
        for row in reader:
            # If no output file or if current file exceeds max_size_mb, create a new file
            if output_file is None or current_size >= max_size_mb:
                if output_file:
                    output_file.close()
                output_file_name = f"{input_file.rsplit('.', 1)[0]}_part{part_number}.csv"
                output_file = open(output_file_name, mode='w', newline='', encoding='utf-8')
                print(f'New File {output_file_name}')
                output_writer = csv.writer(output_file)
                output_writer.writerow(headers)  # Write headers to new file
                current_size = get_file_size(output_file_name)
                part_number += 1
                row_counter = 0  # Reset row counter for the new file
            output_writer.writerow(row)
            row_counter += 1
            global_count += 1
            if row_counter >= 10000:
                current_size = get_file_size(output_file_name)
                print(f'File {output_file_name}, rows {global_count}, size {current_size}')
                row_counter = 0  # Reset row counter after size check
    if output_file:
        output_file.close()

if __name__ == "__main__":
    input_csv_file = r'big_file.csv'
    split_csv(input_csv_file, max_size_mb=900)
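As a quick sanity check before uploading, you can confirm each generated part stays below the 1 GB manual-upload limit. A minimal sketch, assuming the input was big_file.csv as in the example above (so the parts are named big_file_part1.csv, big_file_part2.csv, and so on):

import glob
import os

# List every generated part and make sure it is under the 1 GB manual-upload limit
for part in sorted(glob.glob('big_file_part*.csv')):
    size_mb = os.path.getsize(part) / (1024 * 1024)
    print(f'{part}: {size_mb:.1f} MB')
    assert size_mb < 1024, f'{part} is too large for a manual upload'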
If it’s easier without a Python environment, you can also use awk (together with sed) to split a large CSV file into smaller files while retaining the header row. Here’s a one-liner command to do that:
awk '(NR == 1) {header = $0; next} {print $0 > ("part_" int((NR-2)/100000) ".csv")} END {for (i = 0; i <= int((NR-2)/100000); i++) {system("sed -i \"1i " header "\" part_" i ".csv")}}' large_file.csv
Explanation:
- awk '(NR == 1) {header = $0; next}': stores the first line (the header) in the variable header and skips to the next line.
- {print $0 > ("part_" int((NR-2)/100000) ".csv")}: writes each subsequent line to a new file named part_X.csv. The int((NR-2)/100000) part ensures that each file gets approximately 100000 lines (tweak this number based on your CSV to get files around 900 MB).
- END {for (i = 0; i <= int((NR-2)/100000); i++) {system("sed -i \"1i " header "\" part_" i ".csv")}}: after processing all lines, this loop inserts the header line at the beginning of each split file using sed.

Adjust the number 100000 to better fit your specific file size requirements. This number is a rough estimate and might need to be changed depending on the average size of your rows.
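If you’d rather compute a starting value than guess, you can estimate the rows-per-part as the target size divided by the average row size. A minimal sketch (it samples the first few thousand data rows, so it assumes they are roughly representative of the whole file):

import itertools

def estimate_rows_per_part(csv_path, target_mb=900, sample_rows=5000):
    """Rough rows-per-part estimate: target size divided by the average size of sampled rows."""
    with open(csv_path, 'rb') as f:
        next(f)  # skip the header line
        sample = list(itertools.islice(f, sample_rows))
    avg_row_bytes = sum(len(line) for line in sample) / len(sample)
    return int(target_mb * 1024 * 1024 / avg_row_bytes)

# Example: print a suggested line count for ~900 MB parts of large_file.csv
print(estimate_rows_per_part('large_file.csv'))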
If you are already going to split the CSV, it may be worth writing the parts as Parquet files instead: Parquet generally compresses much better than CSV, so you end up with smaller files and a much faster upload than just splitting into separate CSVs.
First run this command:
pip install dask pyarrow
We’ll use dask to handle the CSV efficiently: it reads the file in chunks, so you won’t run into memory issues even if you have less than 22 GB of RAM (loading the whole file at once most likely would). pyarrow is needed because we’re writing to Parquet.
import dask.dataframe as dd

def split_csv_to_parquet(input_csv, output_dir, chunk_size):
    # Read the large CSV file in blocks of roughly chunk_size bytes
    dask_df = dd.read_csv(input_csv, blocksize=chunk_size)
    # Save the Dask DataFrame as Parquet files in the output dir
    dask_df.to_parquet(output_dir, engine='pyarrow', compression='snappy')
    print(f"CSV file has been split and saved to Parquet files in {output_dir}")

if __name__ == "__main__":
    input_csv = 'path/to/your/large_file.csv'  # Path to CSV file
    output_dir = 'path/to/output/dir'          # Where Parquet files will be saved
    chunk_size = 100_000_000                   # Block size in bytes, i.e. ~100 MB
    split_csv_to_parquet(input_csv, output_dir, chunk_size)
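To double-check the output before uploading, you can inspect the generated Parquet parts with pyarrow. A minimal sketch (the exact part file names inside the output directory depend on your dask version, so it just globs for *.parquet):

import glob
import pyarrow.parquet as pq

# Print row and column counts for each generated Parquet part
for path in sorted(glob.glob('path/to/output/dir/*.parquet')):
    metadata = pq.ParquetFile(path).metadata
    print(f'{path}: {metadata.num_rows} rows, {metadata.num_columns} columns')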
If you are already writing Python, why not use boto3 with multipart upload and upload the file directly?
First, install Foundry DevTools:
pip install "foundry-dev-tools[s3]"
Second, create a Token and copy it.
Third, use the following code to upload the file. Make sure to paste your token into the token variable and adjust the host, path_to_file, target_dataset_rid and path_in_dataset.
from foundry_dev_tools import FoundryContext, JWTTokenProvider
# do NOT store in version control
token = "eyJw..."
host = "stack.palantirfoundry.com"
path_to_file = "/path/to/hello.csv"
target_dataset_rid = "ri.foundry.main.dataset.de0ede77-0a1a-4bd3-8817-e5b8f26b1234"
path_in_dataset = "hello.csv"
ctx = FoundryContext(token_provider=JWTTokenProvider(host=host, jwt=token))
boto3_client = ctx.s3.get_boto3_client()
boto3_client.upload_file(path_to_file, target_dataset_rid, path_in_dataset)
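Note that upload_file switches to a multipart upload automatically once the file is large enough. If you want to tune the part size or parallelism for a 22 GB file, you can pass a boto3 TransferConfig; the values below are illustrative, not required:

from boto3.s3.transfer import TransferConfig

# Illustrative settings: 64 MB parts, up to 8 parallel upload threads
transfer_config = TransferConfig(
    multipart_threshold=64 * 1024 * 1024,
    multipart_chunksize=64 * 1024 * 1024,
    max_concurrency=8,
)

boto3_client.upload_file(
    path_to_file,
    target_dataset_rid,
    path_in_dataset,
    Config=transfer_config,
)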