# Clustering

# Introduction

Clustering is an unsupervised learning method that can be used to discover underlying patterns in the structure of the data.

For example, it can be used to group unlabelled data.

# K-Means Clustering

Requirements: numerical and normalized features

import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML
df=pd.read_csv('data_students.csv')
cols=df.columns
# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))
# replace missing values in numerical variables with the mean value
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Hours on Assignments"] = df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean())
df["Hours on Games"] = df["Hours on Games"].fillna(df["Hours on Games"].mean())
df["Exam"] = df["Exam"].fillna(df["Exam"].mean())
df["Grade"] = df["Grade"].fillna(df["Grade"].mean())
# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype,',',df[i].isnull().any())
 
# remove column ID
df = df.drop('ID', axis=1)
# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))
# Data preprocessing
print('Column Datatypes:\n',df.dtypes)
# convert all nominal variables to binary variables
df_raw=df.copy(deep=True) 
df_knn=df.copy(deep=True) 
# create new binary columns
df_dummies=pd.get_dummies(df_knn[['Degree','Nationality']])
# add them to dataframe
df_knn=df_knn.join(df_dummies)
# drop original columns
df_knn=df_knn.drop('Degree',axis=1)
df_knn=df_knn.drop('Nationality', axis=1)
display('Data Example:',HTML(df_knn.head(10).to_html()))
# Normalize all numerical features
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df_knn.select_dtypes(include=numerics).columns.tolist()
print('Selected numerical columns:\n',cols_numeric)    
# min-max normalization to scale [0, 1]
for col in cols_numeric:
    df_knn[col]=(df_knn[col]-df_knn[col].min())/(df_knn[col].max()-df_knn[col].min())
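# An equivalent alternative (a sketch, assuming scikit-learn is installed) would be
# sklearn's MinMaxScaler:
#   from sklearn.preprocessing import MinMaxScaler
#   df_knn[cols_numeric] = MinMaxScaler().fit_transform(df_knn[cols_numeric])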
# We ignore the label column
df_kmeans=df_knn.drop('GradeLetter',axis=1)
display(HTML(df_kmeans.head(10).to_html()))
# KMeans clustering 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cluster import KMeans
# API, https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
kmeans=KMeans(n_clusters=4, random_state=1,max_iter=200)
kmeans.fit(df_kmeans)
y_pred=kmeans.predict(df_kmeans)
plt.scatter(df_kmeans['Exam'],df_kmeans['Grade'],c=y_pred,cmap='viridis')
# get the cluster labels and add them back to the original data
opt=kmeans.labels_
df_knn['Cluster']=opt
display('Data:',HTML(df_knn.tail(10).to_html()))
# try different K values and find the best K for KMeans
# Assumption: a smaller SSE (within-cluster sum of squared errors) is better
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(df_kmeans)
    Sum_of_squared_distances.append(km.inertia_)
    
# Plot K against SSE and observe the curve
# In the plot, the "elbow" of the curve indicates the optimal k
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
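
Besides the elbow method, the silhouette score (higher is better) is another common way to compare different values of K. The following is a minimal sketch, assuming df_kmeans has been prepared as above; it uses sklearn.metrics.silhouette_score, which is only defined for two or more clusters.

from sklearn.metrics import silhouette_score
# silhouette score is only defined for k >= 2
for k in range(2, 15):
    km = KMeans(n_clusters=k, random_state=1)
    labels_k = km.fit_predict(df_kmeans)
    print('k =', k, 'silhouette score =', silhouette_score(df_kmeans, labels_k))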

# DBSCAN - Density-based Clustering

from sklearn.cluster import DBSCAN
# API: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
df_dbscan=df_knn.drop(['GradeLetter','Cluster'],axis=1)
display('Data:',HTML(df_dbscan.tail(10).to_html()))
# Numpy array of all the cluster labels assigned to each data point 
db_default = DBSCAN(eps = 0.2, min_samples = 3).fit(df_dbscan) 
labels = db_default.labels_ 
df_dbscan['Cluster']=labels
display('Data after clustering:',HTML(df_dbscan.tail(10).to_html()))
# Visualize the clusters
# Build the label-to-colour mapping
# Need to figure out how many clusters were produced, then assign one colour per cluster
colours = {} 
colours[0] = 'r'
colours[1] = 'g'
colours[2] = 'b'
colours[-1] = 'k'
  
# Build the colour vector for each data point 
cvec = [colours[label] for label in labels] 

# Plot Age on the X-axis and Exam on the Y-axis,
# coloured according to the colour vector defined above
plt.figure(figsize =(9, 9)) 
plt.scatter(df_dbscan['Age'], df_dbscan['Exam'], c = cvec)   

# Build empty scatters as proxy handles for the legend of the plot
r = plt.scatter([], [], color ='r') 
g = plt.scatter([], [], color ='g') 
b = plt.scatter([], [], color ='b') 
k = plt.scatter([], [], color ='k') 
plt.legend((r, g, b, k), ('Label 0', 'Label 1', 'Label 2', 'Label -1'))   
plt.show()
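
The colour mapping above assumes DBSCAN produced at most three clusters plus noise. A quick way to check how many clusters and noise points were actually found, using the labels array from above (a small sketch):

# cluster labels run from 0 to n-1; noise points are labelled -1
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print('Estimated number of clusters:', n_clusters)
print('Estimated number of noise points:', n_noise)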

# Hierarchical Clustering

from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc 
# API, https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
df_hc=df_knn.drop(['GradeLetter','Cluster'],axis=1)
display('Data:',HTML(df_hc.tail(10).to_html()))
# Plot Dendrogram
plt.figure(figsize =(8, 8)) 
plt.title('Visualising the data') 
Dendrogram = shc.dendrogram(shc.linkage(df_hc, method='single')) 
# Clustering based on the dendrogram
# choose the best K based on the elbow method introduced above
# note: in recent scikit-learn versions the 'affinity' parameter has been renamed to 'metric'
cluster = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='single')
cls=cluster.fit_predict(df_hc)
print(cluster.labels_)
# Visualizing the clustering 
plt.figure(figsize =(6, 6)) 
plt.scatter(df_hc['Age'], df_hc['Exam'],  
           c = cls, cmap ='rainbow') 
plt.show()
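
AgglomerativeClustering does not expose an inertia_ (SSE) attribute, so comparing it with KMeans by SSE (as the assignment below asks) requires computing the SSE manually. A minimal sketch, assuming df_hc and cls from above; the variable names are illustrative:

# SSE: sum of squared distances from each point to its cluster centroid
X = df_hc.to_numpy()
sse = 0.0
for c in np.unique(cls):
    members = X[cls == c]
    centroid = members.mean(axis=0)
    sse += ((members - centroid) ** 2).sum()
print('SSE of the hierarchical clustering:', sse)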

# Association Rules

Install the mlxtend library first.
Start the Anaconda prompt, then install the package with conda by running one of the following:

conda install -c conda-forge mlxtend 
conda install -c conda-forge/label/gcc7 mlxtend 
conda install -c conda-forge/label/cf201901 mlxtend 

To fix install issues on Windows, copy the following DLL files
libcrypto-1_1-x64.*
libssl-1_1-x64.*
from "your Anaconda3 folder\Library\bin" to "your Anaconda3 folder\DLLs".

# load data 
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML
df=pd.read_csv('data_students.csv')
cols=df.columns
# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))
# replace missing values in numerical variables with the mean value
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Hours on Assignments"] = df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean())
df["Hours on Games"] = df["Hours on Games"].fillna(df["Hours on Games"].mean())
df["Exam"] = df["Exam"].fillna(df["Exam"].mean())
df["Grade"] = df["Grade"].fillna(df["Grade"].mean())
# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype,',',df[i].isnull().any())
 
# remove column ID
df = df.drop('ID', axis=1)
# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))
# Data preprocessing 
print('Column Datatypes:\n',df.dtypes)
# convert all numerical variables to nominal variables (discretize each into 3 bins)
df_nb=df.copy(deep=True)
df_nb['Gender'] = df_nb['Gender'].astype(str)
df_nb['Age'] = pd.cut(df_nb['Age'],3)
df_nb['Hours on Readings'] = pd.cut(df_nb['Hours on Readings'],3)
df_nb['Hours on Assignments'] = pd.cut(df_nb['Hours on Assignments'],3)
df_nb['Hours on Games'] = pd.cut(df_nb['Hours on Games'],3)
df_nb['Hours on Internet'] = pd.cut(df_nb['Hours on Internet'],3)
df_nb['Exam'] = pd.cut(df_nb['Exam'],3)
df_nb['Grade'] = pd.cut(df_nb['Grade'],3)
display('Data Example',HTML(df_nb.head(5).to_html()))
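# Note: pd.cut labels each bin with its numeric interval by default, and these interval
# strings later become part of the item names in the rules. If more readable items are
# preferred, the bins can be named explicitly (a sketch; the bin names below are illustrative):
age_binned = pd.cut(df['Age'], 3, labels=['low', 'medium', 'high'])
print(age_binned.head())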
# Association Rule Mining 
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt  
print(df_nb.dtypes)
# convert all columns to strings
df_nb = df_nb.astype(str)
df_nb['Gender'] = 'Gender=' + df_nb['Gender'].astype(str)
df_nb['Age'] = 'Age=' + df_nb['Age'].astype(str)
df_nb['Hours on Readings'] = 'Readings=' + df_nb['Hours on Readings'].astype(str)
df_nb['Hours on Assignments'] = 'Assignments=' + df_nb['Hours on Assignments'].astype(str)
df_nb['Hours on Games'] = 'Games=' + df_nb['Hours on Games'].astype(str)
df_nb['Hours on Internet'] = 'Internet=' + df_nb['Hours on Internet'].astype(str)
df_nb['Exam'] = 'Exam=' + df_nb['Exam'].astype(str)
df_nb['Grade'] = 'Grade=' + df_nb['Grade'].astype(str)
print(df_nb.dtypes)
# convert the data frame to a list of item lists (one transaction per row)
df_arr = df_nb.stack().groupby(level=0).apply(list).tolist()
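# each row of df_nb now becomes one transaction, i.e. a list of "Column=value" items
# such as ['Gender=...', 'Age=(...)', 'Readings=(...)', ...] (values shown are placeholders)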
# Encode the lists of items into a one-hot transaction matrix
te = TransactionEncoder()
df_transactions = te.fit_transform(df_arr)
# convert the values to booleans: True and False
df_rules = pd.DataFrame(df_transactions,columns=te.columns_)
display('Data Example',HTML(df_rules.head(5).to_html()))
# API, http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
frequent_itemsets = apriori(df_rules, min_support=0.45, use_colnames=True)
# API, http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
display('Rules',HTML(rules.to_html()))
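
The rules table returned by association_rules contains support, confidence and lift columns, so rules can be filtered and ranked directly. A small sketch (the lift threshold here is an arbitrary example):

# keep rules with lift above 1 (antecedent and consequent are positively associated),
# then sort by confidence
interesting = rules[rules['lift'] > 1.0].sort_values('confidence', ascending=False)
display('Top rules', HTML(interesting.head(10).to_html()))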

# Outlier Detection

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
# API, https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor
# Data preprocessing 
# Prepare a numerical feature matrix, preferably normalized
print('Column Datatypes:\n',df.dtypes)
# convert all nominal variables to binary variables
df_raw=df.copy(deep=True) 
df_knn=df.copy(deep=True) 
# create new binary columns
df_dummies=pd.get_dummies(df_knn[['Degree','Nationality']])
# add them to dataframe
df_knn=df_knn.join(df_dummies)
# drop original columns
df_knn=df_knn.drop('Degree',axis=1)
df_knn=df_knn.drop('Nationality', axis=1)
display('Data Example:',HTML(df_knn.head(10).to_html()))
# Normalize all numerical features
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df_knn.select_dtypes(include=numerics).columns.tolist()
print('Selected numerical columns:\n',cols_numeric)    
# min-max normalization to scale [0, 1]
for col in cols_numeric:
    df_knn[col]=(df_knn[col]-df_knn[col].min())/(df_knn[col].max()-df_knn[col].min())
df_knn=df_knn.drop("GradeLetter",1)
display(HTML(df_knn.head(10).to_html()))
# API, https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor
# plot data points
plt.scatter(df_knn["Age"], df_knn["Exam"], color = "b")
plt.grid()
plt.show()
# model specification
model1 = LocalOutlierFactor(n_neighbors = 3, metric = "euclidean")
# model fitting
y_pred = model1.fit_predict(df_knn)
# find the indices of the outliers
outlier_index = np.where(y_pred == -1) # points labelled -1 are outliers 
print("outlier indices: ", outlier_index)
# filter outlier values
outlier_values = df_knn.iloc[outlier_index]
# plot all (normalized) data points
plt.scatter(df_knn["Age"], df_knn["Exam"], color = "b")
# plot the outlier values in red
plt.scatter(outlier_values["Age"], outlier_values["Exam"], color = "r")
plt.grid()
plt.show()
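
LocalOutlierFactor also exposes its scores through the negative_outlier_factor_ attribute (the more negative, the more anomalous). As a rough sketch of the idea in the assignment below, the flagged outliers can be dropped before re-running KMeans to compare the SSE with and without them; the choice of K=4 simply mirrors the earlier example:

from sklearn.cluster import KMeans
# LOF scores of the flagged outliers (the more negative, the more anomalous)
print(model1.negative_outlier_factor_[outlier_index])
# drop the flagged outliers, then re-run KMeans and compare SSE
df_clean = df_knn[y_pred != -1]
km_all = KMeans(n_clusters=4, random_state=1).fit(df_knn)
km_clean = KMeans(n_clusters=4, random_state=1).fit(df_clean)
print('SSE with outliers:', km_all.inertia_)
print('SSE without outliers:', km_clean.inertia_)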

# Assignment

# Clustering

  • Ignore the column related to the loan term
  • Perform KMeans, DBSCAN and hierarchical clustering
  • Determine the best K for KMeans and use the same K in hierarchical clustering
  • Evaluate the training process of these models by comparing SSE
  • Evaluate the outputs by fusing them into your previous classification task

# Association Rules

  • Produce the rules by trying different confidence and support values
  • Pick out the top interesting/useful rules and explain them, e.g., why they are valuable or interesting

# Outlier Detection

  • Identify outliers by using LOF
  • Re-run KMeans clustering to see whether the SSE can be reduced