How to extract information from unstructured Excel in Code Repos?

VincentF · September 27, 2024, 8:47am

I have excels documents, and I want to extract specific information from those.
I know the target schema of the information I want to extract. I would like an LLM to give a first pass, so I can validate the information extracted later.

How can I leverage Code Repositories, AIP or LLMs to extract this unstructured information ?

VincentF · September 27, 2024, 8:50am

Here is an example of a code that will extract the text from an Excel + a picture of the excel document, and use GPT-4o (as a multi-model model) to try to parse out the relevant information from the Excel sheet.

import base64
import io
import shutil
import tempfile
from io import StringIO

import matplotlib.pyplot as plt
import pandas as pd
from language_model_service_api.languagemodelservice_api import (
    Base64ImageContent,
    ChatMessageContent,
    ChatMessageRole,
    ImageDetail,
    MultiContentChatMessage,
)
from language_model_service_api.languagemodelservice_api_completion_v3 import (
    GptChatCompletionResponse,
    GptChatWithVisionCompletionRequest,
)
from palantir_models.models import OpenAiGptChatLanguageModel
from palantir_models.transforms import OpenAiGptChatWithVisionLanguageModelInput
from pandas.plotting import table
from pyspark.sql import DataFrame
from pyspark.sql import types as T
from transforms.api import Input, Output, transform


def get_columns_from_schema(output_schema):
    # Prepr the columns names
    columns_list = [{"name": field.name, "type": field.dataType.simpleString()} for field in output_schema]
    # Convert to a format suitable for LLM
    columns_description = [f"{col['name']} ({col['type']})" for col in columns_list]
    columns_description_str = ", ".join(columns_description)
    return columns_description_str


def excel_sheet_to_base64_image(excel_file, sheet_name):
    # Read the Excel file
    df = pd.read_excel(excel_file, sheet_name=sheet_name)

    # Create a figure and axis
    fig, ax = plt.subplots(figsize=(12, 4))  # Adjust figsize as needed

    # Hide the axes
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_frame_on(False)

    # Create the table
    tbl = table(ax, df, loc="center", cellLoc="center", colWidths=[0.1] * len(df.columns))

    # Scale the table to fit the figure
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1.2, 1.2)

    # Save the figure to a BytesIO buffer - Adding DPI to help with the LLM
    buf = io.BytesIO()
    plt.savefig(buf, format="jpeg", bbox_inches="tight", dpi=600)
    plt.close()
    buf.seek(0)

    # Encode the image to base64
    img_str = base64.b64encode(buf.read()).decode("ascii")

    return img_str


output_schema = T.StructType([
    T.StructField("thethingtoextract_id", T.StringType()),
    T.StructField("thethingtoextract_count", T.StringType()),
    T.StructField("thethingtoextract_price", T.StringType()),
    T.StructField("thethingtoextract_other_info", T.StringType()),
])


# Define a call to an LLM
def get_llm_response(excel_content, picture_as_b64, model):
    prompt = f"""
        ### Context
        We have Excels containing information about [ CONTEXT ]. Those Excels have various formatting.
        We need to convert those excels into structured dataframe.
        We are providing you with an export version of the Excel in CSV format, as well as a picture of the original Excel table.

        ### Goal
        You need to return a table with the provided schema, containing the relevant information parsed-out of the Excel.

        ### Content
        Excel content as CSV:
        ```
        {excel_content}
        ```

        Output target schema:
        ```
        {get_columns_from_schema(output_schema)}
        ```

        ### Expected output
        ONLY RETURN THE CSV.
        No quotes, no backticks, no text, no verbiage.
        Do not invent data. If a value is not available, keep it empty.
        ONLY THE CSV matching the output target schema.
    """

    request: GptChatWithVisionCompletionRequest = GptChatWithVisionCompletionRequest(
        [
            MultiContentChatMessage(
                contents=[
                    ChatMessageContent(text=prompt),
                    ChatMessageContent(
                        image=Base64ImageContent(
                            image_url=f"data:image/jpeg;base64,{picture_as_b64}", detail=ImageDetail.HIGH
                        )
                    ),
                ],
                role=ChatMessageRole.USER,
            )
        ],
        max_tokens=200,
        temperature=0.8,
    )
    response: GptChatCompletionResponse = model.create_chat_completion(request)
    return response.choices[0].message.content


def list_excel_sheets(excel_file):
    # Use pandas to load the Excel file
    excel_data = pd.ExcelFile(excel_file)
    # Get the sheet names
    sheet_names = excel_data.sheet_names
    return sheet_names


# See /workspace/model-catalog/model/view/ri.language-model-service..language-model.gpt-4-o if you want to have pre-generated code snippet
@transform(
    out=Output("<RID>"),
    model=OpenAiGptChatWithVisionLanguageModelInput("ri.language-model-service..language-model.gpt-4-o"),
    source_df=Input("<RID>"),
)
def compute(ctx, source_df: DataFrame, model: OpenAiGptChatLanguageModel, out):
    # List the files to process
    input_filesystem = source_df.filesystem()
    all_files = list(input_filesystem.ls(glob="*.xlsx"))

    # Note: Looping over files is inefficient for high number of files. Please refer to the below link for a more performant RDD based approach
    # https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
    all_dfs = []
    for curr_file in all_files:
        # Open the file
        with input_filesystem.open(curr_file.path, "rb") as f:
            with tempfile.NamedTemporaryFile() as excel_tmp_file:
                shutil.copyfileobj(f, excel_tmp_file)
                excel_tmp_file.seek(0)

                # List the sheets of the excel file
                sheet_names = list_excel_sheets(excel_tmp_file)

                for sheet_name in sheet_names:
                    # Load the Excel file
                    pandas_df = pd.read_excel(excel_tmp_file, sheet_name=sheet_name, engine="openpyxl")

                    # Get a picture of the Excel file
                    picture_as_b64 = excel_sheet_to_base64_image(excel_tmp_file, sheet_name)

                    # Convert the DataFrame to a string or other suitable format for GPT-4
                    # data_string = pandas_df.to_string()
                    # Alternative: CSV
                    data_string = pandas_df.to_csv(index=False)

                    # Call an LLM to extract the information from the Excel
                    response = get_llm_response(data_string, picture_as_b64, model)

                    # Use StringIO to simulate a file-like object from the string
                    csv_data = StringIO(response)

                    # Read the CSV string into a Pandas DataFrame
                    pandas_output = pd.read_csv(csv_data, delimiter=",")

                    # Convert the Pandas DataFrame to a Spark DataFrame
                    curr_df = ctx.spark_session.createDataFrame(pandas_output)
                    all_dfs.append(curr_df)

    out.write_dataframe(all_dfs[0])