Hello,
I am running into an issue that I can't solve:
I have been reusing one of Palantir's examples, "Cluster images for comprehensive feature analysis", and modified it for my use case, which is document clustering.
For that, I swapped the vision transformer (ViT) for SigLIP 2, which should handle textual context in addition to image features.
However, while adapting the clustering part, I am seeing strange behaviour from HDBSCAN: with the same input parameters, my Jupyter Workspace implementation and my Code Repository implementation produce different clusters.
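For reference, here is a minimal snippet to compare the library versions the two environments resolve; any mismatch would be a prime suspect, since HDBSCAN output can legitimately change across hdbscan/numpy/scikit-learn releases (assuming each package exposes __version__, which I believe they all do):

import sys
import hdbscan
import numpy
import sklearn

# Print the exact versions this environment resolves; run the same snippet
# in both the Jupyter Workspace and the repository build and compare.
print("python :", sys.version)
print("hdbscan:", hdbscan.__version__)
print("numpy  :", numpy.__version__)
print("sklearn:", sklearn.__version__)

Here is the transform code from the Code Repository: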
import logging

import hdbscan
import numpy as np
from pyspark.sql import functions as F
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
from transforms.api import transform, Input, Output, configure

logger = logging.getLogger(__name__)
@configure(profile=["KUBERNETES_NO_EXECUTORS", "DRIVER_MEMORY_MEDIUM", "DRIVER_CORES_MEDIUM", "DRIVER_MEMORY_OVERHEAD_LARGE"])
@transform(
    clustersOut=Output("ri.foundry.main.dataset."),
    imgsIn=Input("ri.foundry.main.dataset."),
    base64In=Input("ri.foundry.main.dataset."),
)
def compute(ctx, clustersOut, imgsIn, base64In):
    imagesDf = imgsIn.dataframe()
    base64Df = base64In.dataframe()
    # Collect all embeddings to the driver: clustering and t-SNE run single-node
    data = imagesDf.selectExpr("path", "embeddings").collect()
    clusterResults = getClusters(data)
    df1 = ctx.spark_session.createDataFrame(clusterResults)
    emb3dResults = getEmbeddings3d(data)
    df2 = ctx.spark_session.createDataFrame(emb3dResults)
    # Join cluster labels, 3D embeddings, original metadata, and base64 payloads by path
    df = (
        df1
        .join(F.broadcast(df2), ["path"], "left")
        .join(imagesDf, ["path"], "left")
        .join(base64Df, ["path"], "left")
    )
    clustersOut.write_dataframe(df)
def getClusters(data):
    embeddingsList = np.array([row.embeddings for row in data])
    clusters = clusterImages_HDBSCAN(
        embeddingsList,
        min_cluster_size=8,
        min_samples=1,
        metric='euclidean',
        cluster_selection_epsilon=2,
        cluster_selection_method='leaf',
    )
    # clusters = clusterImages_DBSCAN(embeddingsList, eps=0.00006, min_samples=3)
    results = [{"path": data[i].path, "cluster": "class_%s" % cls[1]} for i, cls in enumerate(clusters)]
    return results
# Assuming you have a 2D array 'embeddings' where each row is the embedding of an image
def clusterImages_HDBSCAN(embeddings, min_cluster_size, min_samples, metric, cluster_selection_epsilon, cluster_selection_method):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric=metric,  # distance metric; must be forwarded explicitly or HDBSCAN uses its own default
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )
    clusters = clusterer.fit_predict(embeddings)
    # Associate each embedding with its cluster label
    embedding_cluster_pairs = list(zip(embeddings, clusters))
    return embedding_cluster_pairs
# Assuming you have a 2D array 'embeddings' where each row is the embedding of an image
def clusterImages_DBSCAN(embeddings, eps=0.5, min_samples=5):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(embeddings)
    # The labels_ attribute of DBSCAN gives the cluster each point belongs to
    clusters = dbscan.labels_
    # Associate each embedding with its cluster label
    embedding_cluster_pairs = list(zip(embeddings, clusters))
    return embedding_cluster_pairs
def getEmbeddings3d(data):
    paths = [r.path for r in data]
    embs = [r.embeddings for r in data]
    reducedEmbs = getReducedVectors(embs)
    results = [{"path": paths[i], "embeddings_3d": e.tolist()} for i, e in enumerate(reducedEmbs)]
    return results
def getReducedVectors(embs):
    # Project the embeddings down to 3 dimensions with t-SNE for visualisation
    n_components = 3  # 3D
    embs = np.array(embs)  # convert to a numpy array
    tsne = TSNE(n_components=n_components, random_state=42, perplexity=5)
    reduced_vectors = tsne.fit_transform(embs)
    return reduced_vectors
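One thing I am unsure about and would appreciate a sanity check on: collect() does not guarantee a stable row order in Spark, and as far as I know HDBSCAN tie-breaking can depend on input order and on float precision. A variant I am considering (a hypothetical getClustersDeterministic, not what the transform above does) that sorts by path and forces float64 before clustering:

def getClustersDeterministic(data):
    # Sort rows by path so HDBSCAN always sees the same input order;
    # Spark's collect() does not guarantee a stable ordering across runs.
    data = sorted(data, key=lambda row: row.path)
    # Force float64: embeddings materialized as float32 in one environment and
    # float64 in another can shift distances enough to flip borderline points.
    embeddingsList = np.array([row.embeddings for row in data], dtype=np.float64)
    clusters = clusterImages_HDBSCAN(
        embeddingsList,
        min_cluster_size=8,
        min_samples=1,
        metric='euclidean',
        cluster_selection_epsilon=2,
        cluster_selection_method='leaf',
    )
    return [{"path": data[i].path, "cluster": "class_%s" % cls[1]} for i, cls in enumerate(clusters)]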
I have tried many different approaches, including a lightweight transform that processes everything on a single node, and I see the same issue.
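For completeness, the single-node attempt looked roughly like this (a sketch from memory; I am assuming the @lightweight decorator and the pandas accessors from the Foundry lightweight-transforms API, and reusing the getClusters helper above):

import pandas as pd
from transforms.api import transform, Input, Output, lightweight

@lightweight()
@transform(
    clustersOut=Output("ri.foundry.main.dataset."),
    imgsIn=Input("ri.foundry.main.dataset."),
)
def compute_single_node(clustersOut, imgsIn):
    # No Spark here: a lightweight transform runs in a single container and
    # exposes inputs through pandas/arrow accessors instead of Spark DataFrames.
    imagesDf = imgsIn.pandas()
    # itertuples yields rows with .path and .embeddings attributes,
    # matching what getClusters expects.
    data = list(imagesDf[["path", "embeddings"]].itertuples(index=False))
    clusterResults = getClusters(data)
    clustersOut.write_pandas(pd.DataFrame(clusterResults))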
Would you be aware of an architecture difference between the two environments that could explain this behaviour?
Best regards,