Hello,
I am running into an issue that I can't solve:
I have been reusing one of Palantir's examples, "Cluster images for comprehensive feature analysis", and modified it for my use case, which is document clustering.
For that, I swapped the vision transformer (ViT) for SigLIP 2, which should handle textual context in addition to image features.
However, while adapting the clustering part, I am seeing strange behaviour from HDBSCAN: with the same input parameters, my Jupyter Workspace implementation and my Code Repository implementation produce different clusters.
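For reference, here is a minimal snippet to compare the library versions the two environments resolve; any mismatch would be a prime suspect, since HDBSCAN output can legitimately change across hdbscan/numpy/scikit-learn releases (assuming each package exposes __version__, which I believe they all do):

import sys
import hdbscan
import numpy
import sklearn

# Print the exact versions this environment resolves; run the same snippet
# in both the Jupyter Workspace and the repository build and compare.
print("python :", sys.version)
print("hdbscan:", hdbscan.__version__)
print("numpy  :", numpy.__version__)
print("sklearn:", sklearn.__version__)

Here is the transform code from the Code Repository: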
import logging

import hdbscan
import numpy as np
from pyspark.sql import functions as F
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
from transforms.api import transform, Input, Output, configure

logger = logging.getLogger(__name__)
@configure(profile=["KUBERNETES_NO_EXECUTORS", "DRIVER_MEMORY_MEDIUM", "DRIVER_CORES_MEDIUM", "DRIVER_MEMORY_OVERHEAD_LARGE"])
@transform(
    clustersOut=Output("ri.foundry.main.dataset."),
    imgsIn=Input("ri.foundry.main.dataset."),
    base64In=Input("ri.foundry.main.dataset."),
)
def compute(ctx, clustersOut, imgsIn, base64In):
    imagesDf = imgsIn.dataframe()
    base64Df = base64In.dataframe()
    # Collect all embeddings to the driver: clustering and t-SNE run single-node
    data = imagesDf.selectExpr("path", "embeddings").collect()
    clusterResults = getClusters(data)
    df1 = ctx.spark_session.createDataFrame(clusterResults)
    emb3dResults = getEmbeddings3d(data)
    df2 = ctx.spark_session.createDataFrame(emb3dResults)
    # Join cluster labels, 3D embeddings, original metadata, and base64 payloads by path
    df = (
        df1
        .join(F.broadcast(df2), ["path"], "left")
        .join(imagesDf, ["path"], "left")
        .join(base64Df, ["path"], "left")
    )
    clustersOut.write_dataframe(df)
def getClusters(data):
    embeddingsList = np.array([row.embeddings for row in data])
    clusters = clusterImages_HDBSCAN(
        embeddingsList,
        min_cluster_size=8,
        min_samples=1,
        metric='euclidean',
        cluster_selection_epsilon=2,
        cluster_selection_method='leaf',
    )
    # clusters = clusterImages_DBSCAN(embeddingsList, eps=0.00006, min_samples=3)
    results = [{"path": data[i].path, "cluster": "class_%s" % cls[1]} for i, cls in enumerate(clusters)]
    return results
# Assuming you have a 2D array 'embeddings' where each row is the embedding of an image
def clusterImages_HDBSCAN(embeddings, min_cluster_size, min_samples, metric, cluster_selection_epsilon, cluster_selection_method):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric=metric,  # distance metric; must be forwarded explicitly or HDBSCAN uses its own default
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )
    clusters = clusterer.fit_predict(embeddings)
    # Associate each embedding with its cluster label
    embedding_cluster_pairs = list(zip(embeddings, clusters))
    return embedding_cluster_pairs
# Assuming you have a 2D array 'embeddings' where each row is the embedding of an image
def clusterImages_DBSCAN(embeddings, eps=0.5, min_samples=5):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(embeddings)
    # The labels_ attribute of DBSCAN gives the cluster each point belongs to
    clusters = dbscan.labels_
    # Associate each embedding with its cluster label
    embedding_cluster_pairs = list(zip(embeddings, clusters))
    return embedding_cluster_pairs
def getEmbeddings3d(data):
    paths = [r.path for r in data]
    embs = [r.embeddings for r in data]
    reducedEmbs = getReducedVectors(embs)
    results = [{"path": paths[i], "embeddings_3d": e.tolist()} for i, e in enumerate(reducedEmbs)]
    return results
def getReducedVectors(embs):
    # Project the embeddings down to 3 dimensions with t-SNE for visualisation
    n_components = 3  # 3D
    embs = np.array(embs)  # convert to a numpy array
    tsne = TSNE(n_components=n_components, random_state=42, perplexity=5)
    reduced_vectors = tsne.fit_transform(embs)
    return reduced_vectors
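One thing I am unsure about and would appreciate a sanity check on: collect() does not guarantee a stable row order in Spark, and as far as I know HDBSCAN tie-breaking can depend on input order and on float precision. A variant I am considering (a hypothetical getClustersDeterministic, not what the transform above does) that sorts by path and forces float64 before clustering:

def getClustersDeterministic(data):
    # Sort rows by path so HDBSCAN always sees the same input order;
    # Spark's collect() does not guarantee a stable ordering across runs.
    data = sorted(data, key=lambda row: row.path)
    # Force float64: embeddings materialized as float32 in one environment and
    # float64 in another can shift distances enough to flip borderline points.
    embeddingsList = np.array([row.embeddings for row in data], dtype=np.float64)
    clusters = clusterImages_HDBSCAN(
        embeddingsList,
        min_cluster_size=8,
        min_samples=1,
        metric='euclidean',
        cluster_selection_epsilon=2,
        cluster_selection_method='leaf',
    )
    return [{"path": data[i].path, "cluster": "class_%s" % cls[1]} for i, cls in enumerate(clusters)]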
I have tried many different approaches, including a lightweight transform that processes everything on a single node, and I see the same issue.
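For completeness, the single-node attempt looked roughly like this (a sketch from memory; I am assuming the @lightweight decorator and the pandas accessors from the Foundry lightweight-transforms API, and reusing the getClusters helper above):

import pandas as pd
from transforms.api import transform, Input, Output, lightweight

@lightweight()
@transform(
    clustersOut=Output("ri.foundry.main.dataset."),
    imgsIn=Input("ri.foundry.main.dataset."),
)
def compute_single_node(clustersOut, imgsIn):
    # No Spark here: a lightweight transform runs in a single container and
    # exposes inputs through pandas/arrow accessors instead of Spark DataFrames.
    imagesDf = imgsIn.pandas()
    # itertuples yields rows with .path and .embeddings attributes,
    # matching what getClusters expects.
    data = list(imagesDf[["path", "embeddings"]].itertuples(index=False))
    clusterResults = getClusters(data)
    clustersOut.write_pandas(pd.DataFrame(clusterResults))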
Would you be aware of an architecture difference between the two environments that could explain this behaviour?
Best regards,