Distance between any two points after DBSCAN
DBSCAN is a clustering algorithm that is also robust at detecting outliers. The radius $\epsilon$ is an input parameter of the algorithm, and a point is said to be an outlier if the circle of radius $\epsilon$ centered on it contains no point other than the center itself. I have detected the outliers for a dataset, but then I observed that, for every point, the distance to its nearest neighbour is less than $\epsilon$. I'm confused now: is my understanding of DBSCAN wrong, or is there a mistake in my code?
import numpy as np
import pandas as pd
# Load the raw dataset from the CSV file in the working directory into a DataFrame.
df = pd.read_csv('HomeC.csv')
# One timestamp per minute starting at 2016-01-01 00:00; pd.date_range already
# returns a DatetimeIndex, so no further conversion is needed.
time_index = pd.date_range('2016-01-01 00:00', periods=503911, freq='min')

# Time-of-day bands as (first hour, end hour, label); each band covers the
# half-open hour range [first, end), and together they cover hours 0..23.
DAY_PARTS = (
    (0, 4, 'Night'),
    (4, 9, 'Morning'),
    (9, 12, 'Late Morning'),
    (12, 15, 'afternoon'),
    (15, 18, 'late afternoon'),
    (18, 21, 'Evening'),
    (21, 24, 'Late evening'),
)

# Expand the bands into a direct hour -> label lookup.
hour_to_label = {
    hour: label
    for first, end, label in DAY_PARTS
    for hour in range(first, end)
}

# L[i] is the time-of-day label of time_index[i]. The hour is read from the
# index itself rather than parsed out of the timestamp's string form.
L = [hour_to_label[hour] for hour in time_index.hour]
# scikit-learn is its own package; it is not a submodule of NumPy.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# From here on `df` is the DataFrame's underlying NumPy array, not a DataFrame.
df = df.iloc[:, :].values

# Column 0 is overwritten with the time-of-day labels in `L`, then those
# labels are replaced by integer codes.
labelencoder_X = LabelEncoder()
df[:, 0] = L
df[:, 0] = labelencoder_X.fit_transform(df[:, 0])

# Columns 20 and 23 are cast to strings and integer-encoded, each with its own
# encoder (presumably these are the remaining categorical columns - confirm
# against the dataset schema).
labelencoder_Y = LabelEncoder()
labelencoder_Z = LabelEncoder()
for col, encoder in ((20, labelencoder_Y), (23, labelencoder_Z)):
    df[:, col] = df[:, col].astype('str')
    df[:, col] = encoder.fit_transform(df[:, col])

# Drop the first 58 rows, convert everything to float, drop the last row, and
# apply log(x + 10) element-wise (the +10 shift presumably keeps the argument
# of the log positive - confirm).
df = np.log(df[58:, :].astype('float')[:-1] + 10)
# Deal the rows of `df` round-robin into four lists: row i goes to the list
# selected by i % 4 (0 -> house1, 1 -> house2, 2 -> house3, 3 -> house4).
# Each list holds the individual row arrays, in their original order.
house1 = list(df[0::4])
house2 = list(df[1::4])
house3 = list(df[2::4])
house4 = list(df[3::4])
from sklearn.cluster import DBSCAN


def dbscan(X, eps=0.6, min_samples=200):
    """Flag the points that DBSCAN labels as noise.

    Args:
        X: Samples to cluster, as accepted by ``DBSCAN.fit`` (array-like of
            shape (n_samples, n_features)).
        eps: Neighbourhood radius passed to DBSCAN.
        min_samples: Number of points (the point itself included) that must
            lie within ``eps`` of a point for it to be a core point.

    Returns:
        Integer array with one entry per sample: 1 where DBSCAN assigned the
        noise label (-1), 0 where the sample belongs to a cluster.
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(X).labels_
    return (labels == -1).astype(int)


# The sample under study: the first 5000 rows of house 1.
X_house1 = house1[:5000]

house1 = np.array(house1)
house2 = np.array(house2)
house3 = np.array(house3)
house4 = np.array(house4)

X = X_house1
eps = 0.6

# dbscan() has to be defined before this call; the same eps is used for the
# clustering and for the distance check below.
y2 = dbscan(X_house1, eps=eps)

# Distance from every point to its nearest *other* point. One vectorised pass
# per point replaces the nested Python loop, and the number of points comes
# from the data instead of a hard-coded 5000.
points = np.asarray(X, dtype=float)
mins = []
for i in range(len(points)):
    dists = np.sqrt(np.sum(np.square(points - points[i]), axis=1))
    dists[i] = np.inf  # a point is not its own neighbour
    mins.append(dists.min())

# Number of points with no other point within eps. The counter is accumulated
# over all points (it must not be reset per point).
count = sum(1 for nearest in mins if nearest > eps)

# sum(y2) is the number of points DBSCAN marked as noise. These two numbers
# are NOT expected to match for min_samples=200: a point is noise when it has
# fewer than min_samples points within eps AND is not within eps of any core
# point, so a noise point can still have many neighbours closer than eps.
# Only count <= sum(y2) is guaranteed; equality holds for min_samples=2.
print(count, sum(y2))
Link to the dataset: https://www.kaggle.com/taranvee/smart-home-dataset-with-weather-information
Topic unsupervised-learning anomaly-detection dbscan outlier clustering
Category Data Science