Mimi Zhang [ https://orcid.org/0000-0002-3807-297X ]
This notebook introduces an efficient clustering algorithm developed by Tobin et al. (2023), called the REM algorithm, and provides a step-by-step tutorial on applying the accompanying Python library to real datasets.
Gaussian mixture models (GMMs) are a prominent clustering method that assumes the data-generating process is a mixture distribution of a finite number of Gaussian components. GMMs are ubiquitous in applications because they are both simple and flexible, allowing clusters to vary in shape, size, and orientation.
Let $\mathbf{X}\in\mathbb{R}^{n\times p}$ denote the data matrix: $\mathbf{X}^T=[\mathbf{x}_1, \ldots, \mathbf{x}_n]$, where the superscript $T$ is the transpose operator. A GMM density has the form \begin{equation*} f(\mathbf{x})=\sum_{j=1}^{m}\pi_j \phi(\mathbf{x}; \mathbf{\mu}_j, \mathbf{\Sigma}_j), \end{equation*} with mixing proportions $\pi_j$ ($\pi_j>0$ and $\sum_{j=1}^{m}\pi_j=1$), and each Gaussian density $\phi(\cdot; \mathbf{\mu}_j, \mathbf{\Sigma}_j)$ has a mean $\mathbf{\mu}_j$ and a covariance matrix $\mathbf{\Sigma}_j \succ0$. Let $\mathbf{\pi}$ denote the vector of mixing proportions: $\mathbf{\pi}=(\pi_1, \ldots, \pi_m)^T$. The log-likelihood based on the $n$ data points is \begin{equation*} \ell(\mathbf{\pi}, \{\mathbf{\mu}_j\}_{j=1}^m, \{\mathbf{\Sigma}_j\}_{j=1}^m; \mathbf{X})=\sum_{i=1}^{n}\log(\sum_{j=1}^{m}\pi_j \phi(\mathbf{x}_i; \mathbf{\mu}_j, \mathbf{\Sigma}_j)). \end{equation*} The classical method for computing maximum-likelihood estimates for GMMs is the Expectation-Maximization (EM) algorithm. The EM algorithm for parameter estimation consists of the following steps:
(1) Initialize the parameters: $\{\pi_1, \ldots, \pi_m\}$, $\{\mathbf{\mu}_1, \ldots, \mathbf{\mu}_m\}$ and $\{\mathbf{\Sigma}_1, \ldots, \mathbf{\Sigma}_m\}$.
(2) Compute the responsibilities: for $i=1, \ldots, n$ and $j=1, \ldots, m$, \begin{equation*} r_{ij}=\frac{\pi_j\phi(\mathbf{x}_i; \mathbf{\mu}_j, \mathbf{\Sigma}_j)}{\sum_{v=1}^{m}\pi_v \phi(\mathbf{x}_i; \mathbf{\mu}_v, \mathbf{\Sigma}_v)}. \end{equation*}
(3) Update the estimates: for $j=1, \ldots, m$, \begin{align*} \pi_j=\frac{\sum_{i=1}^{n}r_{ij}}{n}, ~~~ \mathbf{\mu}_j=\frac{\sum_{i=1}^{n}r_{ij}\mathbf{x}_i}{\sum_{i=1}^{n}r_{ij}}, \mathbf{\Sigma}_j=\frac{\sum_{i=1}^{n}r_{ij}(\mathbf{x}_i-\mathbf{\mu}_j)(\mathbf{x}_i-\mathbf{\mu}_j)^T}{\sum_{i=1}^{n}r_{ij}}. \end{align*}
(4) Iterate steps 2 and 3 until convergence.
The clusters are taken to be the constituent components. In particular, after learning the model parameters, the posterior probability of each observation $\mathbf{x}_i$ belonging to each component Gaussian, i.e., $\Pr(\mathbf{x}_i\in\mathcal{C}_k|\mathbf{\pi}, \{\mathbf{\mu}_j, \mathbf{\Sigma}_j\}_{j=1}^m)$, gives a (soft) clustering for the observation.
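For readers who prefer code to notation, here is a minimal NumPy/SciPy sketch of the EM iteration above. The function name, the random initialization, and the simple log-likelihood stopping rule are illustrative choices of ours, not part of any particular library.
import numpy as np
from scipy.stats import multivariate_normal

def em_gmm(X, m, n_iter=100, tol=1e-6, seed=0):
    n, p = X.shape
    rng = np.random.default_rng(seed)
    # (1) Initialize: equal mixing proportions, means at random data points, identity covariances.
    pi = np.full(m, 1.0 / m)
    mu = X[rng.choice(n, size=m, replace=False)]
    Sigma = np.stack([np.eye(p) for _ in range(m)])
    prev_ll = -np.inf
    for _ in range(n_iter):
        # (2) E-step: responsibilities r_ij proportional to pi_j * phi(x_i; mu_j, Sigma_j).
        dens = np.column_stack([
            pi[j] * multivariate_normal.pdf(X, mean=mu[j], cov=Sigma[j])
            for j in range(m)
        ])
        r = dens / dens.sum(axis=1, keepdims=True)
        # (3) M-step: update pi_j, mu_j, Sigma_j from the responsibilities.
        nj = r.sum(axis=0)
        pi = nj / n
        mu = (r.T @ X) / nj[:, None]
        for j in range(m):
            diff = X - mu[j]
            Sigma[j] = (r[:, j, None] * diff).T @ diff / nj[j]
        # (4) Iterate until the log-likelihood stops improving.
        ll = np.log(dens.sum(axis=1)).sum()
        if ll - prev_ll < tol:
            break
        prev_ll = ll
    return pi, mu, Sigma, r
A hard clustering is obtained by assigning each observation to the component with the largest responsibility, e.g., np.argmax(r, axis=1).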
The EM algorithm for GMMs has several drawbacks (Bishop, 2006, Chapter 9): it may converge to a singularity at which the likelihood is infinite, leading to meaningless estimates, and it is sensitive to initialization because the log-likelihood function $\ell(\mathbf{\pi}, \{\mathbf{\mu}_j\}_{j=1}^m, \{\mathbf{\Sigma}_j\}_{j=1}^m; \mathbf{X})$ is not unimodal; the resulting solution is a local optimum in the neighborhood of the initial guess. Common practice includes one (or a combination) of the following strategies: using multiple random starts and keeping the estimate with the highest likelihood, and initializing with a clustering algorithm. Jin et al. (2016) proved that, with probability at least $1-e^{-\mathcal{O}(m)}$, the EM algorithm with random initialization converges to bad local maxima, whose log-likelihood can be arbitrarily worse than that of the global maximum.
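The multiple-restart strategy is readily available off the shelf; for instance, scikit-learn's GaussianMixture accepts an n_init argument and keeps the best of several EM runs. The toy data, the number of components, and the number of restarts below are arbitrary choices for illustration.
import numpy as np
from sklearn.mixture import GaussianMixture

X_toy = np.random.default_rng(0).normal(size=(500, 2))   # placeholder data matrix
# n_init=10 runs EM from 10 different (k-means based) initializations
# and keeps the solution with the highest log-likelihood.
gmm = GaussianMixture(n_components=3, n_init=10, init_params='kmeans', random_state=0).fit(X_toy)
soft = gmm.predict_proba(X_toy)   # posterior responsibilities (soft clustering)
hard = gmm.predict(X_toy)         # argmax assignment (hard clustering)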
If the Gaussian components in a GMM are well separated, the density peaks coincide with the Gaussian means; if the components overlap slightly, the density peaks include, but are not limited to, the Gaussian means. This motivates applying a kernel density estimator (e.g., with a Gaussian kernel) to approximately locate the density peaks, and then pinpointing the Gaussian means among them. However, we can do better (from an algorithmic point of view) by replacing the estimated density peaks with exemplars from the data. An exemplar here is an observed unit in the vicinity of a density peak, and thus represents the location of that peak well. (If two Gaussian components overlap substantially, their combined distribution may have only one density peak. This is not an issue for clustering, as we can simply treat the combined distribution as one cluster/mixture component.)
The REM algorithm is an agglomerative hierarchical clustering method. It begins with every exemplar representing a cluster (center), and then iteratively prunes the exemplars that do not correspond to Gaussian means. There are two main reasons for pruning the exemplars:
(1) The density-peak finding method has tuning parameters, and it is not knowable in advance whether each detected exemplar represents a unique density peak. In practice, the initial exemplar set includes all possible exemplars, so multiple exemplars may be associated with one density peak.
(2) The set of density peaks in a GMM is not in one-to-one correspondence with the set of Gaussian means: the number of peaks can be significantly larger than the number of mixture components. Therefore, even if the detected exemplars are unique density peaks, we still need to prune the exemplars to retain only those that represent Gaussian means.
The figure below shows the detected exemplars (red stars) from the density-peak finding method under different settings of the tuning parameters. Well-separated Gaussian components are indicated by different colors. The density-peak finding method detects multiple exemplars for one density peak. An important feature of the REM algorithm is that it produces a decision graph that helps the user select all possible exemplars; we will explore this feature later.
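To make the exemplar-detection step concrete, the snippet below computes, for each point, a Gaussian-kernel density estimate and the distance to its nearest neighbor of higher density; plotting one against the other is the idea behind the decision plot. This is only an illustration of the density-peak principle with an arbitrary bandwidth h and toy data, not the REMclust implementation.
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

rng = np.random.default_rng(1)
# toy data: two well-separated Gaussian blobs
X_toy = np.vstack([rng.normal(0, 1, (200, 2)), rng.normal(6, 1, (200, 2))])

D = cdist(X_toy, X_toy)                        # pairwise Euclidean distances
h = 1.0                                        # kernel bandwidth (arbitrary here)
density = np.exp(-(D / h) ** 2).sum(axis=1)    # Gaussian-kernel density estimate

# For each point, the distance to the nearest point of strictly higher density;
# the densest point is assigned the largest distance by convention.
delta = np.empty(len(X_toy))
for i in range(len(X_toy)):
    higher = density > density[i]
    delta[i] = D[i, higher].min() if higher.any() else D[i].max()

# Points with both high density and large delta are exemplar candidates,
# which is exactly what a decision plot displays.
plt.scatter(density, delta, s=10)
plt.xlabel('density')
plt.ylabel('distance to nearest denser point')
plt.show()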
The REM algorithm has two algorithmic blocks: the EM block and the pruning block. Given the updated data pool (the original data without the retained exemplars), the EM block operates as follows.
Input: The retained exemplars $\{\mathbf{e}_1, \ldots, \mathbf{e}_\kappa\}$ and the responsibilities $\{r_{ij}: i=1, \ldots, n, j=1, \ldots, \kappa\}$.
(1) Update the estimates: for $j=1, \ldots, \kappa$, \begin{align*} \pi_j=\frac{\sum_{i=1}^{n}r_{ij}}{n}, ~~~ \mathbf{\Sigma}_j=\frac{\sum_{i=1}^{n}r_{ij}(\mathbf{x}_i-\mathbf{e}_j)(\mathbf{x}_i-\mathbf{e}_j)^T}{\sum_{i=1}^{n}r_{ij}}. \end{align*}
(2) Compute the responsibilities: for $i=1, \ldots, n$ and $j=1, \ldots, \kappa$, \begin{equation*} r_{ij}=\frac{\pi_j\phi(\mathbf{x}_i; \mathbf{e}_j, \mathbf{\Sigma}_j)}{\sum_{v=1}^{\kappa}\pi_v \phi(\mathbf{x}_i; \mathbf{e}_v, \mathbf{\Sigma}_v)}. \end{equation*}
(3) Iterate steps 1 and 2 until convergence.
Output: The mixing proportions $\{\pi_1, \ldots, \pi_\kappa\}$ and the covariance matrices $\{\mathbf{\Sigma}_1, \ldots, \mathbf{\Sigma}_\kappa\}$.
We immediately notice that, in the EM block, the Gaussian means are fixed (at the given exemplars), and we only need to estimate the mixing proportions and covariance matrices. Since the exemplar set is disjoint from the data pool, the mean vectors always differ from the data points, and therefore the iteration never converges to a degenerate solution with a zero covariance matrix. The GMM density at convergence is \begin{equation*} f(\mathbf{x})=\sum_{j=1}^{\kappa}\pi_j \phi(\mathbf{x}; \mathbf{e}_j, \mathbf{\Sigma}_j). \end{equation*}
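A minimal sketch of the EM block follows, assuming the retained exemplars are stored in a kappa-by-p array E and the responsibilities r (rows summing to one, every column nonzero) are supplied as input; only the mixing proportions and covariances are updated, while the means stay fixed at the exemplars. This mirrors the update equations above but is not the REMclust source code.
import numpy as np
from scipy.stats import multivariate_normal

def em_block(X, E, r, n_iter=100, tol=1e-6):
    n, p = X.shape
    k = E.shape[0]
    prev_ll = -np.inf
    for _ in range(n_iter):
        # (1) Update the mixing proportions and covariances from the responsibilities;
        #     the means stay fixed at the exemplars.
        nj = r.sum(axis=0)
        pi = nj / n
        Sigma = np.empty((k, p, p))
        for j in range(k):
            diff = X - E[j]
            Sigma[j] = (r[:, j, None] * diff).T @ diff / nj[j]
        # (2) Recompute the responsibilities.
        dens = np.column_stack([
            pi[j] * multivariate_normal.pdf(X, mean=E[j], cov=Sigma[j])
            for j in range(k)
        ])
        r = dens / dens.sum(axis=1, keepdims=True)
        # (3) Iterate until the log-likelihood stabilizes.
        ll = np.log(dens.sum(axis=1)).sum()
        if ll - prev_ll < tol:
            break
        prev_ll = ll
    return pi, Sigma, r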
In the pruning block, we remove one exemplar from $\{\mathbf{e}_1, \ldots, \mathbf{e}_\kappa\}$. The idea is to introduce sparsity into the mixing proportion vector $\mathbf{\pi}$ such that, if $\pi_j=0$, then the exemplar $\mathbf{e}_j$ will be removed from the exemplar set (and put back into the data pool).
Input: The retained exemplars $\{\mathbf{e}_1, \ldots, \mathbf{e}_\kappa\}$, the mixing proportions $\{\pi_1, \ldots, \pi_\kappa\}$, and the covariance matrices $\{\mathbf{\Sigma}_1, \ldots, \mathbf{\Sigma}_\kappa\}$.
(1) Calculate the weight vector $\mathbf{\delta}=(\delta_1, \ldots, \delta_\kappa)^T$: \begin{equation*} \delta_v = \max\left\{\Pr\left(\pi_v\phi(X; \mathbf{e}_v, \mathbf{\Sigma}_v) < \pi_j\phi(X; \mathbf{e}_j, \mathbf{\Sigma}_j) | X \sim N(\mathbf{e}_v, \mathbf{\Sigma}_v )\right): j = 1, \ldots, \kappa, j\neq v\right\},~~~~v=1, \ldots, \kappa. \end{equation*}
(2) Find the $\theta$ value such that the solution to the problem below, i.e., the responsibility matrix $\mathbf{R}=[r_{ij}]_{n\times\kappa}$, has exactly one column of zeros. \begin{align*} \min_{\mathbf{R}} ~~ \left\{\frac{1}{2} \sum_{i=1}^n \sum_{j=1}^{\kappa} r_{ij}(\mathbf{x}_i-\mathbf{e}_j)^T\mathbf{\Sigma}_j^{-1}(\mathbf{x}_i-\mathbf{e}_j) + \frac{1}{2} \sum_{i=1}^n \sum_{j=1}^{\kappa} r_{ij}\log \left(| \mathbf{\Sigma}_j|\right)+\theta\sum_{j=1}^{\kappa} \delta_j\Big(\frac{1}{n}\sum_{i=1}^n r_{ij}\Big)\right\}, \\ \mbox{subject to}~~ r_{ij}\geq 0, ~~\sum_{j=1}^{\kappa} r_{ij}=1, ~~~i=1, \ldots, n. \end{align*}
(3) Remove the exemplar $\mathbf{e}_j$ with $\sum_{i=1}^n r_{ij}=0$ and update $\kappa=\kappa-1$.
Output: The retained exemplars $\{\mathbf{e}_1, \ldots, \mathbf{e}_\kappa\}$ and the responsibilities $\{r_{ij}: i=1, \ldots, n, j=1, \ldots, \kappa\}$.
Both the weight vector $\mathbf{\delta}$ and the penalty function are well-motivated; the original work (Tobin et al., 2023) provides detailed explanations of both. In particular, the conditional probability $\Pr\left(\pi_v\phi(X; \mathbf{e}_v, \mathbf{\Sigma}_v) < \pi_j\phi(X; \mathbf{e}_j, \mathbf{\Sigma}_j) \,|\, X \sim N(\mathbf{e}_v, \mathbf{\Sigma}_v )\right)$ is the likelihood that an instance from the $v$th mixture component is misclassified into the $j$th mixture component. Therefore, $\delta_v$ measures the degree of overlap of the $v$th mixture component with the other mixture components. The pruning block is very efficient: (1) the algorithm's complexity is linear, and (2) exemplars not representing Gaussian means are always pruned first.
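The two ingredients of the pruning block can be sketched as follows. The weight $\delta_v$ can be estimated by Monte Carlo, sampling from $N(\mathbf{e}_v, \mathbf{\Sigma}_v)$ and counting how often the $v$th weighted density is beaten by another component. Moreover, because the penalized objective in step (2) is linear in $\mathbf{R}$ (with $r_{ij}\geq 0$ and each row summing to one), it is minimized by assigning every observation to its cheapest component, so raising $\theta$ until one column of $\mathbf{R}$ empties identifies the exemplar to remove. The code below is a schematic re-implementation under those assumptions, not the REMclust source code; it presumes at least two exemplars and takes the mixing proportions from the EM block.
import numpy as np
from scipy.stats import multivariate_normal

def estimate_delta(pi, E, Sigma, n_mc=5000, seed=0):
    # Monte Carlo estimate of the overlap weights delta_v (requires k >= 2).
    rng = np.random.default_rng(seed)
    k = len(pi)
    delta = np.zeros(k)
    for v in range(k):
        Z = rng.multivariate_normal(E[v], Sigma[v], size=n_mc)   # X ~ N(e_v, Sigma_v)
        own = pi[v] * multivariate_normal.pdf(Z, mean=E[v], cov=Sigma[v])
        miss = [np.mean(own < pi[j] * multivariate_normal.pdf(Z, mean=E[j], cov=Sigma[j]))
                for j in range(k) if j != v]                     # misclassification probabilities
        delta[v] = max(miss)
    return delta

def assign(X, E, Sigma, delta, theta):
    # Row-wise minimizer of the penalized objective, which is linear in R.
    n, k = X.shape[0], E.shape[0]
    cost = np.empty((n, k))
    for j in range(k):
        diff = X - E[j]
        maha = np.einsum('ij,jk,ik->i', diff, np.linalg.inv(Sigma[j]), diff)
        cost[:, j] = 0.5 * maha + 0.5 * np.log(np.linalg.det(Sigma[j])) + theta * delta[j] / n
    R = np.zeros((n, k))
    R[np.arange(n), cost.argmin(axis=1)] = 1.0
    return R
A simple search over theta (e.g., bisection) then finds the smallest value at which some column of R sums to zero; that exemplar is the one pruned in step (3).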
The flowchart below summarizes the REM algorithm. The blue left arrow "EM" means applying the EM block algorithm, and the blue right arrow "Penalty Function" means applying the pruning block algorithm. The loop produces a sequence of nested clusterings, and we pick the optimal clustering via the ICL criterion.
We have implemented our method into the Python package REMclust available at PyPI. The two examples below were contributed by Samuel Singh.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing, metrics
from REMclust import REM
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
We apply the REM algorithm to the High Time Resolution Universe Survey (HTRU2) dataset: https://www.kaggle.com/datasets/charitarth/pulsar-dataset-htru2. Pulsars are a rare type of neutron star that produce radio emission detectable on Earth. They are of considerable scientific interest as probes of space-time, the interstellar medium, and states of matter. Machine learning tools are now being used to automatically label pulsar candidates to facilitate rapid analysis; in particular, classification systems that treat the candidate datasets as binary classification problems are widely adopted. Each data point in the HTRU2 dataset is described by 8 variables/features. The target variable is the binary class indicating whether the candidate is a pulsar.
df = pd.read_csv('Data/HTRU_2.csv')
df.columns = ['µ IRP','σ IRP', 'excess kurtosis IRP', 'skewness IRP',
'µ DM-SNR','σ DM-SNR', 'excess kurtosis DM-SNR', 'skewness DM-SNR',
'class']
X = df[df.columns[:-1]] ## drop the class column
X = np.array(X)
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
n_samples, n_features = X_scaled.shape
bndwk = int(np.floor(np.min((30, np.sqrt(n_samples)))))
Cluster = REM(data=X_scaled, covariance_type = "full", criteria = "all", bandwidth = bndwk, tol = 1e-3)
Cluster.mode_decision_plot() ## to estimate the density and distance threshold
# Setting the density threshold and distance threshold from the decision plot
Cluster.fit(density_threshold = 1.5, distance_threshold = 3.6)
yp = Cluster.get_labels(mixture_selection='aic') ## predicted labels
2 modes selected.
Match the ground-truth and predicted labels using the Fowlkes-Mallows index and the confusion matrix.
yp = np.where(np.array(yp)==0,1,0) ## corrected yp
# performance metric
fm = metrics.fowlkes_mallows_score(df['class'], yp)
print('Fowlkes-Mallows index (FMI) : %.2f' % (fm))
Fowlkes-Mallows index (FMI) : 0.85
# confusion matrix or classification rate
metrics.ConfusionMatrixDisplay.from_predictions(df['class'], yp, normalize='true')
plt.show()
# pairwise feature plots with predicted cluster labels
df_updated = df[df.columns[:-1]].copy()
df_updated['labels'] = yp
sns.pairplot(df_updated, hue="labels")
plt.show()
pca_X = PCA(n_components=2)
pca_X_scaled = pca_X.fit_transform(X_scaled)
plt.scatter(pca_X_scaled[:, 0], pca_X_scaled[:, 1], s = 5, c = yp, marker = "o", cmap='tab10')
plt.xlabel('Principal Component - 1')
plt.ylabel('Principal Component - 2')
plt.title('PCA')
plt.show()
# NMI & ARI
nmi = metrics.normalized_mutual_info_score(df['class'], yp)
ari = metrics.adjusted_rand_score(df['class'], yp)
print("Adjusted Rand Score: \t", ari)
print("Normalized Mutual Information Score: \t", nmi)
Adjusted Rand Score: 0.3878884708578086
Normalized Mutual Information Score: 0.2743628710506591
# sensitivity w.r.t. density threshold
n = 5
nmi_x = np.zeros((n))
ari_x = np.zeros((n))
dens_thres = [1.3,1.4,1.5,1.6,1.7]
for i in range(n):
    Cluster.fit(density_threshold = dens_thres[i],
                distance_threshold = 3.6)
    yp_x = Cluster.get_labels(mixture_selection='aic')
    nmi_x[i] = metrics.normalized_mutual_info_score(df['class'].astype(int), yp_x.astype(int))
    ari_x[i] = metrics.adjusted_rand_score(df['class'].astype(int), yp_x.astype(int))
plt.plot(dens_thres, nmi_x, marker = 'o', label='nmi')
plt.plot(dens_thres, ari_x, marker = 'o', label='ari')
plt.xlabel('density threshold')
plt.ylabel('nmi/ari')
plt.legend()
plt.show()
2 modes selected.
2 modes selected.
2 modes selected.
1 modes selected.
1 modes selected.
# sensitivity w.r.t. distance threshold
n = 5
nmi_x = np.zeros((n))
ari_x = np.zeros((n))
dist_thres = [3.4,3.5,3.6,3.7,3.8]
for i in range(n):
    Cluster.fit(density_threshold = 1.5,
                distance_threshold = dist_thres[i])
    yp_x = Cluster.get_labels(mixture_selection='aic')
    nmi_x[i] = metrics.normalized_mutual_info_score(df['class'].astype(int), yp_x.astype(int))
    ari_x[i] = metrics.adjusted_rand_score(df['class'].astype(int), yp_x.astype(int))
plt.plot(dist_thres, nmi_x, marker = 'o', label='nmi')
plt.plot(dist_thres, ari_x, marker = 'o', label='ari')
plt.xlabel('distance threshold')
plt.ylabel('nmi/ari')
plt.legend()
plt.show()
2 modes selected.
2 modes selected.
2 modes selected.
1 modes selected.
1 modes selected.
Our next dataset is the Palmer Archipelago (Antarctica) penguin dataset: https://www.kaggle.com/datasets/parulpandey/palmer-archipelago-antarctica-penguin-data, which covers three species of penguins: Adelie (Pygoscelis adeliae), Gentoo (Pygoscelis papua), and Chinstrap (Pygoscelis antarctica). Each data point in the penguin dataset is described by 4 variables/features. The target variable is the species class, taking one of the three aforementioned species.
df = pd.read_csv('Data/penguins_size.csv')
df['species'] = df['species'].map({'Adelie':0, 'Gentoo':1, 'Chinstrap':2}) ## label-encode the species
df['island'] = df['island'].map({'Torgersen':0, 'Biscoe':1, 'Dream':2})
df['sex'] = df['sex'].map({'MALE':0, 'FEMALE':1})
df = df.dropna() ## to discard nan value datapoints
X = df[df.columns[2:-1]] ## considering the length and depth of beak, flipper length and body mass
X = np.array(X)
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
n_samples, n_features = X_scaled.shape
bndwk = int(np.floor(np.min((30, np.square(n_samples)))))
Cluster = REM(data=X_scaled, covariance_type = "full", criteria = "all", bandwidth = bndwk, tol = 1e-3)
Cluster.mode_decision_plot()
# Setting the density threshold and distance threshold from the decision plot
Cluster.fit(density_threshold = 1.15, distance_threshold = 1.00)
yp = Cluster.get_labels(mixture_selection='aic')
3 modes selected.
The predicted clusters are labeled '0', '1', and '2'. The confusion matrix shows that predicted label '1' corresponds to ground-truth label '2', and vice versa, so the labels are swapped accordingly.
yp[yp == 2] = 3; yp[yp == 1] = 2; yp[yp == 3] = 1; ## corrected yp
# performance metric
fm = metrics.fowlkes_mallows_score(df['species'], yp)
print('Fowlkes-Mallows index (FMI) : %.2f' % (fm))
Fowlkes-Mallows index (FMI) : 0.85
# confusion matrix or classification rate
metrics.ConfusionMatrixDisplay.from_predictions(df['species'], yp, normalize='true')
plt.show()
# pairwise feature plots
df_updated = df[df.columns[2:-1]]
df_updated = df_updated.assign(labels=yp)
sns.pairplot(df_updated, hue="labels", palette='tab10')
plt.show()
pca_X = PCA(n_components=2)
pca_X_scaled = pca_X.fit_transform(X_scaled)
plt.scatter(pca_X_scaled[:, 0], pca_X_scaled[:, 1], s = 5, c = yp, marker = "o", cmap='tab10')
plt.xlabel('Principal Component - 1')
plt.ylabel('Principal Component - 2')
plt.title('PCA')
plt.show()
# NMI & ARI
nmi = metrics.normalized_mutual_info_score(df['species'].astype(int), yp.astype(int))
ari = metrics.adjusted_rand_score(df['species'].astype(int), yp.astype(int))
print("Adjusted Rand Score: \t", ari)
print("Normalized Mutual Information Score: \t", nmi)
Adjusted Rand Score: 0.7740574481288712
Normalized Mutual Information Score: 0.7785511759363766
# sensitivity w.r.t. density threshold
n = 5
nmi_x = np.zeros((n))
ari_x = np.zeros((n))
dens_thres = [1.05,1.10,1.15,1.20,1.25]
for i in range(n):
    Cluster.fit(density_threshold = dens_thres[i],
                distance_threshold = 1.0)
    yp_x = Cluster.get_labels(mixture_selection='aic')
    nmi_x[i] = metrics.normalized_mutual_info_score(df['species'].astype(int), yp_x.astype(int))
    ari_x[i] = metrics.adjusted_rand_score(df['species'].astype(int), yp_x.astype(int))
plt.plot(dens_thres, nmi_x, marker = 'o', label='nmi')
plt.plot(dens_thres, ari_x, marker = 'o', label='ari')
plt.xlabel('density threshold')
plt.ylabel('nmi/ari')
plt.legend()
plt.show()
3 modes selected.
3 modes selected.
3 modes selected.
2 modes selected.
2 modes selected.
# sensitivity w.r.t. distance threshold
n = 5
nmi_x = np.zeros((n))
ari_x = np.zeros((n))
dist_thres = [0.90,0.95,1.00,1.05,1.10]
for i in range(n):
    Cluster.fit(density_threshold = 1.0,
                distance_threshold = dist_thres[i])
    yp_x = Cluster.get_labels(mixture_selection='aic')
    nmi_x[i] = metrics.normalized_mutual_info_score(df['species'].astype(int), yp_x.astype(int))
    ari_x[i] = metrics.adjusted_rand_score(df['species'].astype(int), yp_x.astype(int))
plt.plot(dist_thres, nmi_x, marker = 'o', label='nmi')
plt.plot(dist_thres, ari_x, marker = 'o', label='ari')
plt.xlabel('distance threshold')
plt.ylabel('nmi/ari')
plt.legend()
plt.show()
3 modes selected.
3 modes selected.
3 modes selected.
2 modes selected.
2 modes selected.
The math behind the REM algorithm may not be straightforward for engineers, but the algorithm and the Python library require little manual input. Moreover, the decision graph helps the user select all possible exemplars.
(1) Tobin, J., Ho, C.P., and Zhang, M. (2023). Reinforced EM Algorithm for Clustering with Gaussian Mixture Models, In Proceedings of the 2023 SIAM International Conference on Data Mining (SDM 2023).
(2) Bishop, C. (2006). Pattern Recognition and Machine Learning. Information Science and Statistics. Springer-Verlag New York, 1st edition.
(3) Jin, C., Zhang, Y., Balakrishnan, S., Wainwright, M. J., and Jordan, M. I. (2016). Local Maxima in the Likelihood of Gaussian Mixture Models: Structural Results and Algorithmic Consequences. In Proceedings of the 30th International Conference on Neural Information Processing Systems (NIPS 2016).