DCF: An Efficient and Robust Density-Based Clustering Method

Mimi Zhang [ https://orcid.org/0000-0002-3807-297X ]

This notebook introduces the DCF algorithm for cluster analysis.

Introduction

DCF is the acronym for density-core finding, an improvement on the popular density-peak clustering (DPC) method. The DPC method detects modes as points with (1) high density and (2) large distance to points of higher density. Because it represents each cluster by a single point mode, DPC often fails to adequately represent clusters with areas of relatively uniform density, where no single point stands out on both criteria.
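As a concrete reference, here is a minimal sketch of the DPC peak-finding criterion, assuming densities have already been estimated; the function name and the brute-force pairwise distances are purely illustrative choices, not a reference implementation.

```python
import numpy as np
from scipy.spatial.distance import cdist

def dpc_criterion(X, density):
    """DPC peak-finding criterion: each point's density times the
    distance to its nearest neighbor of higher density."""
    n = X.shape[0]
    D = cdist(X, X)                       # brute-force pairwise distances, O(n^2)
    order = np.argsort(-density)          # indices from highest to lowest density
    delta = np.empty(n)
    delta[order[0]] = D[order[0]].max()   # convention for the global density peak
    for rank, i in enumerate(order[1:], start=1):
        delta[i] = D[i, order[:rank]].min()   # nearest point of higher density
    return density * delta
```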

DCF is both efficient and robust, gracefully scaling to big datasets. Both improvements are the result of directing the peak-finding technique to discover modal sets, rather than point modes.

The Algorithm

The DCF algorithm consists of the following main steps:

(1) Compute the peak-finding criterion for each data point and select the instance $\mathbf{x}$ with maximal value.

(2) The component set $\mathbf{S}_{\beta}(\mathbf{x})$ containing $\mathbf{x}$ is the first cluster core. All points in $\mathbf{S}_{\beta}(\mathbf{x})$ are labelled with $Assessed$ and excluded from further consideration.

(3) Among the points not yet assessed, the instance $\mathbf{x}$ with maximal value of the peak-finding criterion is selected.

(4) The component set $\mathbf{S}_{\beta}(\mathbf{x})$ containing $\mathbf{x}$ is found.

  • All points in $\mathbf{S}_{\beta}(\mathbf{x})$ are labelled with $Assessed$ and excluded from further consideration.
  • $\mathbf{S}_{\beta}(\mathbf{x})$ is another cluster core only if it is disjoint from all previously detected cluster cores.

(5) Repeat Steps 3 and 4 until all points have been labelled with $Assessed$.

Each cluster core represents a different cluster, and the number of detected cluster cores is the number of clusters in the data. After Step 5, there will be many data points that are labelled with $Assessed$ but do not belong to any cluster core. These non-core points are from the component sets $\mathbf{S}_{\beta}(\mathbf{x})$ in Step 4 that overlap with previously detected cluster cores.

(6) Each non-core point is assigned to the same cluster as its nearest neighbor of higher density.

DCF uses the peak-finding criterion of DPC to detect cluster cores. The peak-finding criterion is computed for each data point, and the instance with maximum value is selected as a center (Step 1). The cluster core containing this center is then found, and all instances belonging to it are removed from consideration as potential centers (Step 2). The algorithm continues detecting cluster cores until none remain in the data (Steps 3-5). The allocation procedure is unchanged from the DPC method: each non-core point is allocated to the same cluster as its nearest neighbor of higher estimated density (Step 6).
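The following is a compact, self-contained sketch of the whole procedure, assuming Euclidean data and a monotone proxy for the k-NN density estimate; the parameter defaults, the overlap test via labels, and the brute-force distance matrix are all illustrative choices, not the reference implementation.

```python
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from sklearn.neighbors import NearestNeighbors

def dcf_sketch(X, k=20, beta=0.4):
    """Illustrative sketch of DCF (Steps 1-6); not the reference implementation."""
    n, d = X.shape
    dist, idx = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)
    density = 1.0 / (dist[:, -1] ** d + 1e-12)   # monotone proxy for the k-NN density

    # Mutual k-NN graph: edge (i, j) iff i and j are among each other's k nearest neighbors.
    rows = np.repeat(np.arange(n), k)
    A = csr_matrix((np.ones(n * k), (rows, idx[:, 1:].ravel())), shape=(n, n))
    A = csr_matrix(A.minimum(A.T))

    # Peak-finding criterion of DPC (brute force; fine for small demos).
    D = np.sqrt(((X[:, None, :] - X[None, :, :]) ** 2).sum(-1))
    order = np.argsort(-density)
    delta = np.empty(n)
    delta[order[0]] = D[order[0]].max()
    for rank, i in enumerate(order[1:], start=1):
        delta[i] = D[i, order[:rank]].min()
    crit = density * delta

    labels = -np.ones(n, dtype=int)     # -1 = not (yet) in any cluster core
    assessed = np.zeros(n, dtype=bool)
    n_cores = 0
    while not assessed.all():           # Steps 1, 3, 5
        rest = np.flatnonzero(~assessed)
        x = rest[np.argmax(crit[rest])]
        S = np.flatnonzero(density >= (1 - beta) * density[x])   # level set of x
        _, comp = connected_components(A[S][:, S], directed=False)
        core = S[comp == comp[np.searchsorted(S, x)]]            # component set of x
        assessed[core] = True                                    # Steps 2, 4
        if labels[core].max() < 0:      # disjoint from all previously detected cores
            labels[core] = n_cores
            n_cores += 1

    # Step 6: assign each non-core point to the cluster of its nearest
    # neighbor of higher density, sweeping from high to low density.
    for pos, i in enumerate(order):
        if labels[i] < 0:
            higher = order[:pos]
            if higher.size:             # guard for the degenerate top-density case
                labels[i] = labels[higher[np.argmin(D[i, higher])]]
    return labels
```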

The Two Inputs

The algorithm requires two user inputs: the neighborhood parameter $k$ (for density estimation) and the fluctuation parameter $\beta$ (determining how much the density estimates can vary within a cluster core). In particular, the algorithm adopts a non-parametric density estimator: for any data point $\mathbf{x}\in\mathrm{R}^d$, its density estimate is \begin{equation*} \hat{f}(\mathbf{x})=\frac{k}{\text{sample size}\times \text{volume of the unit ball in } \mathrm{R}^d \times r_k(\mathbf{x})^d}, \end{equation*} where $r_k(\mathbf{x})$ is the distance between $\mathbf{x}$ and its $k$th nearest neighbor in $\mathcal{X}$. For a user, specifying the value of $k$ is much easier than specifying the cutoff distance in the DPC algorithm.
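The estimator translates directly into code; a small sketch (the function name is ours):

```python
import numpy as np
from math import gamma, pi
from sklearn.neighbors import NearestNeighbors

def knn_density(X, k):
    """k-NN density estimate: k / (n * v_d * r_k(x)^d)."""
    n, d = X.shape
    dist, _ = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)
    r_k = dist[:, -1]                        # distance to the k-th nearest neighbor
    v_d = pi ** (d / 2) / gamma(d / 2 + 1)   # volume of the unit ball in R^d
    return k / (n * v_d * r_k ** d)
```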

The definition of a cluster core depends on the notion of the mutual $k$-NN graph. Let $G$ denote the undirected mutual $k$-NN graph constructed from the whole dataset $\mathcal{X}$: there is an edge between two vertices in $G$ if and only if each is among the $k$ nearest neighbors of the other. We here explain how the first cluster core in Step 2 is determined. Let $\mathbf{x}^*$ be the point with the maximal value of the peak-finding criterion. The subset $S=\{\mathbf{x}\in\mathcal{X}: \hat{f}(\mathbf{x})\geq (1-\beta)\hat{f}(\mathbf{x}^*)\}$ contains all the data points whose density estimates lie in the interval $[(1-\beta)\hat{f}(\mathbf{x}^*), \hat{f}(\mathbf{x}^*)]$. The subset $S$ induces a sub-graph of $G$, and the cluster core $\mathbf{S}_{\beta}(\mathbf{x}^*)$ is simply the connected component of this sub-graph that contains the point/vertex $\mathbf{x}^*$.
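A sketch of this construction, assuming densities from the estimator above; the function and variable names are illustrative:

```python
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from sklearn.neighbors import NearestNeighbors

def cluster_core(X, density, x_star, k, beta):
    """Connected component of the (1 - beta) level set of x_star in the
    mutual k-NN graph; a sketch of S_beta(x*) as defined above."""
    n = X.shape[0]
    _, idx = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)
    rows = np.repeat(np.arange(n), k)
    A = csr_matrix((np.ones(n * k), (rows, idx[:, 1:].ravel())), shape=(n, n))
    A = csr_matrix(A.minimum(A.T))               # mutual k-NN graph G
    S = np.flatnonzero(density >= (1 - beta) * density[x_star])   # the subset S
    _, comp = connected_components(A[S][:, S], directed=False)    # sub-graph of G
    return S[comp == comp[np.searchsorted(S, x_star)]]
```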

Point Modes vs Cluster Cores

An illustrative example demonstrating the benefits of seeking cluster cores of the density. The black curve represents the underlying density, and the grey histogram represents a sample from the density. Left: DPC incorrectly selects both centers from the first cluster, as noise in the density estimate causes the peak-finding method to favor the high-density cluster. Right: cluster cores, represented by dashed lines, better represent the cluster centers.

Comparison of the DPC and DCF methods when applied to the Noisy Circles dataset. The Noisy Circles dataset contains two clusters: a high-density cluster (the inner circle) and a low-density cluster (the outer circle). The DPC method, as seen on the left of the figure, proceeds by searching for the points with maximal values of the peak-finding criterion. It erroneously selects multiple centers from the inner cluster, and the allocation mechanism then incorrectly assigns all points in the outer cluster. For this example, seven points in the inner cluster have a larger value of the peak-finding criterion than the maximum value in the outer cluster.

The DCF procedure is shown on the right of the figure. As in DPC, the instance of maximum density is selected as the first peak. However, DCF then computes the cluster core associated with this point (the highlighted larger green points) and removes all elements of the core from consideration as centers. Of the remaining points, the one with maximal value of the peak-finding criterion lies in the outer cluster. The associated cluster core is visible in yellow. As no edge in the mutual $k$-NN graph exists between this cluster core and the first cluster core, it is accepted as a valid cluster core. The termination procedure is invoked when assessing a third center: the third center is selected as before, but because the cluster core associated with this point contains all of the instances in the dataset, the algorithm terminates.
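The failure mode is easy to reproduce; below is a small self-contained check using scikit-learn's make_circles as a stand-in for the Noisy Circles data (the sample size, noise level, and $k$ are arbitrary choices):

```python
import numpy as np
from sklearn.datasets import make_circles
from sklearn.neighbors import NearestNeighbors

# Two circles with the same number of points: the inner one (radius factor 0.3)
# is necessarily denser than the outer one.
X, _ = make_circles(n_samples=1000, factor=0.3, noise=0.05, random_state=0)

k, d = 20, X.shape[1]
dist, _ = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)
density = 1.0 / dist[:, -1] ** d   # monotone proxy for the k-NN density estimate

# The highest-density points all sit near radius 0.3, i.e. on the inner circle,
# which is why plain peak-finding keeps selecting centers there.
top = np.argsort(-density)[:10]
print(np.linalg.norm(X[top], axis=1).round(2))
```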

Code Examples

The code examples below were mainly contributed by Sachit Bhardwaj.

Example 1

Example 2

Example 3

Example 4

Example 5

The seed variety dataset consists of measurements of geometrical properties of kernels belonging to three different varieties of wheat. A soft X-ray technique and the GRAINS package were used to construct all seven real-valued attributes. The target class is dropped for the demonstration of the DCF algorithm. This dataset is widely available; the copy used here was downloaded from Kaggle.
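A minimal loading sketch, assuming a local whitespace-separated copy of the file; the path and column names below are ours (mirrors on Kaggle and the UCI repository use this layout):

```python
import pandas as pd

cols = ["area", "perimeter", "compactness", "kernel_length",
        "kernel_width", "asymmetry", "groove_length", "variety"]
seeds = pd.read_csv("seeds_dataset.txt", sep=r"\s+", names=cols)  # hypothetical local path
X = seeds.drop(columns="variety").to_numpy()   # drop the target class
# labels = dcf_sketch(X, k=20, beta=0.4)       # e.g. using the sketch from above
```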

Three clusters were detected.

Example 6

The wine quality dataset contains records related to red and white variants of the Portuguese Vinho Verde wine. It contains information from 1599 red wine samples and 4898 white wine samples. Input variables in the dataset consist of the type of wine (either red or white) and metrics from objective tests (acidity levels, pH values, ABV, etc.), while the target/output variable is a numerical score based on sensory data: the median of at least 3 evaluations made by wine experts. Each expert graded the wine quality between 0 (very bad) and 10 (very excellent). This dataset was downloaded from the UCI Machine Learning Repository.

We first check for null values and replace them with zero.
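A pandas sketch of this preprocessing, assuming local copies of the two UCI files (the paths are hypothetical; the UCI files are semicolon-separated):

```python
import pandas as pd

red = pd.read_csv("winequality-red.csv", sep=";")      # hypothetical local paths
white = pd.read_csv("winequality-white.csv", sep=";")
red["type"], white["type"] = "red", "white"
wine = pd.concat([red, white], ignore_index=True)

print(wine.isnull().sum())   # check for null values
wine = wine.fillna(0)        # replace any nulls with zero
X = wine.drop(columns=["quality", "type"]).to_numpy()  # numeric inputs only
```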

Two clusters were detected.