Here is an example of a code that will extract the text from an Excel + a picture of the excel document, and use GPT-4o (as a multi-model model) to try to parse out the relevant information from the Excel sheet.
import base64
import io
import shutil
import tempfile
from io import StringIO
import matplotlib.pyplot as plt
import pandas as pd
from language_model_service_api.languagemodelservice_api import (
Base64ImageContent,
ChatMessageContent,
ChatMessageRole,
ImageDetail,
MultiContentChatMessage,
)
from language_model_service_api.languagemodelservice_api_completion_v3 import (
GptChatCompletionResponse,
GptChatWithVisionCompletionRequest,
)
from palantir_models.models import OpenAiGptChatLanguageModel
from palantir_models.transforms import OpenAiGptChatWithVisionLanguageModelInput
from pandas.plotting import table
from pyspark.sql import DataFrame
from pyspark.sql import types as T
from transforms.api import Input, Output, transform
def get_columns_from_schema(output_schema):
# Prepr the columns names
columns_list = [{"name": field.name, "type": field.dataType.simpleString()} for field in output_schema]
# Convert to a format suitable for LLM
columns_description = [f"{col['name']} ({col['type']})" for col in columns_list]
columns_description_str = ", ".join(columns_description)
return columns_description_str
def excel_sheet_to_base64_image(excel_file, sheet_name):
# Read the Excel file
df = pd.read_excel(excel_file, sheet_name=sheet_name)
# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 4)) # Adjust figsize as needed
# Hide the axes
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
ax.set_frame_on(False)
# Create the table
tbl = table(ax, df, loc="center", cellLoc="center", colWidths=[0.1] * len(df.columns))
# Scale the table to fit the figure
tbl.auto_set_font_size(False)
tbl.set_fontsize(10)
tbl.scale(1.2, 1.2)
# Save the figure to a BytesIO buffer - Adding DPI to help with the LLM
buf = io.BytesIO()
plt.savefig(buf, format="jpeg", bbox_inches="tight", dpi=600)
plt.close()
buf.seek(0)
# Encode the image to base64
img_str = base64.b64encode(buf.read()).decode("ascii")
return img_str
output_schema = T.StructType([
T.StructField("thethingtoextract_id", T.StringType()),
T.StructField("thethingtoextract_count", T.StringType()),
T.StructField("thethingtoextract_price", T.StringType()),
T.StructField("thethingtoextract_other_info", T.StringType()),
])
# Define a call to an LLM
def get_llm_response(excel_content, picture_as_b64, model):
prompt = f"""
### Context
We have Excels containing information about [ CONTEXT ]. Those Excels have various formatting.
We need to convert those excels into structured dataframe.
We are providing you with an export version of the Excel in CSV format, as well as a picture of the original Excel table.
### Goal
You need to return a table with the provided schema, containing the relevant information parsed-out of the Excel.
### Content
Excel content as CSV:
```
{excel_content}
```
Output target schema:
```
{get_columns_from_schema(output_schema)}
```
### Expected output
ONLY RETURN THE CSV.
No quotes, no backticks, no text, no verbiage.
Do not invent data. If a value is not available, keep it empty.
ONLY THE CSV matching the output target schema.
"""
request: GptChatWithVisionCompletionRequest = GptChatWithVisionCompletionRequest(
[
MultiContentChatMessage(
contents=[
ChatMessageContent(text=prompt),
ChatMessageContent(
image=Base64ImageContent(
image_url=f"data:image/jpeg;base64,{picture_as_b64}", detail=ImageDetail.HIGH
)
),
],
role=ChatMessageRole.USER,
)
],
max_tokens=200,
temperature=0.8,
)
response: GptChatCompletionResponse = model.create_chat_completion(request)
return response.choices[0].message.content
def list_excel_sheets(excel_file):
# Use pandas to load the Excel file
excel_data = pd.ExcelFile(excel_file)
# Get the sheet names
sheet_names = excel_data.sheet_names
return sheet_names
# See /workspace/model-catalog/model/view/ri.language-model-service..language-model.gpt-4-o if you want to have pre-generated code snippet
@transform(
out=Output("<RID>"),
model=OpenAiGptChatWithVisionLanguageModelInput("ri.language-model-service..language-model.gpt-4-o"),
source_df=Input("<RID>"),
)
def compute(ctx, source_df: DataFrame, model: OpenAiGptChatLanguageModel, out):
# List the files to process
input_filesystem = source_df.filesystem()
all_files = list(input_filesystem.ls(glob="*.xlsx"))
# Note: Looping over files is inefficient for high number of files. Please refer to the below link for a more performant RDD based approach
# https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
all_dfs = []
for curr_file in all_files:
# Open the file
with input_filesystem.open(curr_file.path, "rb") as f:
with tempfile.NamedTemporaryFile() as excel_tmp_file:
shutil.copyfileobj(f, excel_tmp_file)
excel_tmp_file.seek(0)
# List the sheets of the excel file
sheet_names = list_excel_sheets(excel_tmp_file)
for sheet_name in sheet_names:
# Load the Excel file
pandas_df = pd.read_excel(excel_tmp_file, sheet_name=sheet_name, engine="openpyxl")
# Get a picture of the Excel file
picture_as_b64 = excel_sheet_to_base64_image(excel_tmp_file, sheet_name)
# Convert the DataFrame to a string or other suitable format for GPT-4
# data_string = pandas_df.to_string()
# Alternative: CSV
data_string = pandas_df.to_csv(index=False)
# Call an LLM to extract the information from the Excel
response = get_llm_response(data_string, picture_as_b64, model)
# Use StringIO to simulate a file-like object from the string
csv_data = StringIO(response)
# Read the CSV string into a Pandas DataFrame
pandas_output = pd.read_csv(csv_data, delimiter=",")
# Convert the Pandas DataFrame to a Spark DataFrame
curr_df = ctx.spark_session.createDataFrame(pandas_output)
all_dfs.append(curr_df)
out.write_dataframe(all_dfs[0])