# Introduction

Classification is a form of supervised learning.

It is used to predict the value of a nominal variable, which is also called the 'label'.

The variables used to make predictions are called features.
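As a minimal illustration of features versus a label, here is a hypothetical toy table (not the course dataset):

import pandas as pd
# hypothetical toy data: two features and one nominal label
toy = pd.DataFrame({
    'Hours on Readings': [2.0, 0.5, 3.5],  # feature
    'Exam': [85, 40, 92],                  # feature
    'GradeLetter': ['B', 'F', 'A']         # label (the nominal variable to predict)
})
print(toy)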

import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML
df=pd.read_csv('data_students.csv')
cols=df.columns
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))
# replace missing values in numerical variables with the column mean
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Hours on Assignments"] = df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean())
df["Hours on Games"] = df["Hours on Games"].fillna(df["Hours on Games"].mean())
df["Exam"] = df["Exam"].fillna(df["Exam"].mean())
df["Grade"] = df["Grade"].fillna(df["Grade"].mean())
# check again whether there are any missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype,',',df[i].isnull().any())
 
# remove the ID column, since an identifier has no predictive value
df = df.drop('ID', axis=1)
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))

# KNN Classifier

Requirements: 1) numerical features; 2) normalized features

Parameters: the distance measure and the value of K (see the sketch below)
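For instance, the distance measure is set through the metric parameter of KNeighborsClassifier; a minimal sketch on hypothetical toy arrays (euclidean and manhattan are both standard scikit-learn options):

from sklearn.neighbors import KNeighborsClassifier
import numpy as np
x_demo = np.array([[0.1, 0.2], [0.9, 0.8], [0.2, 0.1], [0.8, 0.9]])  # hypothetical normalized features
y_demo = np.array([0, 1, 0, 1])                                      # hypothetical labels
for metric in ['euclidean', 'manhattan']:
    knn_demo = KNeighborsClassifier(n_neighbors=3, metric=metric)
    knn_demo.fit(x_demo, y_demo)
    print(metric, knn_demo.predict([[0.15, 0.15]]))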

# Data Preprocessing

print('Column Datatypes:\n',df.dtypes)
# convert all nominal variables to binary (dummy) variables
df_raw=df.copy(deep=True) 
df_knn=df.copy(deep=True) 
# create new binary columns
# dtype=int keeps the dummy columns numeric for the normalization step below
df_dummies=pd.get_dummies(df_knn[['Degree','Nationality']], dtype=int)
# add them to the dataframe
df_knn=df_knn.join(df_dummies)
# drop the original columns
df_knn=df_knn.drop('Degree',axis=1)
df_knn=df_knn.drop('Nationality', axis=1)
display('Data Example:',HTML(df_knn.head(10).to_html()))
# drop one redundant binary column per variable, since N categories only need N-1 binary columns
# (the column names 'Degree_ BS' and 'Nationality_ China' contain a space because the raw values do)
print(df_knn.columns)
df_knn=df_knn.drop('Degree_ BS', axis=1)
df_knn=df_knn.drop('Nationality_ China', axis=1)
display('Data Example:',HTML(df_knn.head(10).to_html()))
# normalize all numerical features
# min-max normalization scales each column to [0, 1]
for col in df_knn.columns:
    if col != 'GradeLetter':
        # exclude GradeLetter, since it is the label in our data
        df_knn[col]=(df_knn[col]-df_knn[col].min())/(df_knn[col].max()-df_knn[col].min())
display(HTML(df_knn.head(10).to_html()))
# Build KNN models and evaluate them
# Note: for demo and teaching purposes, we present evaluations based on both hold-out and N-fold cross-validation
# By hold-out evaluation
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
# preprocess the label: encode the nominal label as integers
from sklearn import preprocessing
y = df_knn['GradeLetter'] # define the label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y) # encode nominal labels as integers
print(y_encoded)
df_knn['GradeLetter'] = y_encoded
x = df_knn.drop('GradeLetter', axis=1)
y = df_knn['GradeLetter'] 
display(HTML(df_knn.head(10).to_html()))
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2)
# Visualize the training set
plt.figure(1)
plt.scatter(x_train['Grade'], x_train['Exam'], c=y_train, alpha = 0.8)
plt.xlabel('Grade')
plt.ylabel('Exam')
plt.title('Visualization of Training Set')
plt.show()
plt.close()
# build and evaluate models
from sklearn import neighbors
from sklearn.metrics import accuracy_score
# API, https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# API for KNeighborsClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
for k in range(1, 24, 2): 
    clf=neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('K =', k, ', Accuracy: ', accuracy_score(y_test, y_pred), ', Precision: ', precision_score(y_test, y_pred, average='micro'),
         ', Recall: ', recall_score(y_test, y_pred, average='micro'))
    # note: there is also the option 'macro', which calculates the metric for each label and then returns the unweighted average
    # (with single-label data, micro-averaged precision and recall both equal accuracy)
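# a minimal sketch of macro-averaged metrics, computed on the last model fitted in the loop above:
print('Macro Precision: ', precision_score(y_test, y_pred, average='macro'),
      ', Macro Recall: ', recall_score(y_test, y_pred, average='macro'))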
    
# Visualize the best model on the test set
clf=neighbors.KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
plt.figure(2)
plt.scatter(x_test['Grade'], x_test['Exam'], c=y_pred, alpha = 0.8)
plt.xlabel('Grade')
plt.ylabel('Exam')
plt.title('Visualization of Testing Set')
plt.show()
plt.close()
# By N-fold cross-validation
from sklearn.model_selection import cross_val_score
for k in range(1, 24, 2): 
    clf=neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
    acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
    print('K =', k, ', Accuracy: ',acc)

# Naive Bayes Classifier

Requirements: 1) nominal features; 2) the assumption of conditional independence among features given the class
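For strictly nominal features, scikit-learn also provides CategoricalNB; a minimal sketch on hypothetical integer-coded categories (the code below instead applies GaussianNB to one-hot encoded data):

from sklearn.naive_bayes import CategoricalNB
import numpy as np
# hypothetical data: each column is a categorical feature encoded as integers
x_demo = np.array([[0, 1], [1, 0], [0, 0], [1, 1]])
y_demo = np.array([0, 1, 0, 1])
nb_demo = CategoricalNB()
nb_demo.fit(x_demo, y_demo)
print(nb_demo.predict([[0, 1]]))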

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
# Pre-processing 
print('Column data types:\n',df_raw.dtypes)
df_nb=df_raw.copy(deep=True)
# convert numerical variables to categorical data, e.g., Age
df_nb['Gender'] = df_nb['Gender'].astype(str) # Gender is numerically coded but nominal, so cast it to string
df_nb['Age'] = pd.cut(df_nb['Age'],3)
df_nb['Hours on Readings'] = pd.cut(df_nb['Hours on Readings'],3)
df_nb['Hours on Assignments'] = pd.cut(df_nb['Hours on Assignments'],3)
df_nb['Hours on Games'] = pd.cut(df_nb['Hours on Games'],3)
df_nb['Hours on Internet'] = pd.cut(df_nb['Hours on Internet'],3)
df_nb['Exam'] = pd.cut(df_nb['Exam'],3)
df_nb['Grade'] = pd.cut(df_nb['Grade'],3)
display('Data Example',HTML(df_nb.head(5).to_html()))
# by hold-out evaluation 
y=df_nb['GradeLetter']
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y) # encode nominal labels to integers 
# transform categorical data to numerical data, i.e., one-hot encoding
print(df_nb.dtypes)
df_nb=pd.get_dummies(df_nb.drop('GradeLetter',axis=1))
df_nb['GradeLetter']=y_encoded
display(HTML(df_nb.head(5).to_html()))
print('starting model build and evaluations...')
# API for GaussianNB
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB
x_train, x_test, y_train, y_test = train_test_split(df_nb.drop('GradeLetter', axis=1), y_encoded, test_size=0.2) # drop the label column so it does not leak into the features
clf = GaussianNB()
clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
# in the following examples, we report accuracy only
print("Accuracy by Hold-out Eval:",accuracy_score(y_test,y_pred))
# by N-fold evaluation 
y=df_nb['GradeLetter']
x=df_nb.drop('GradeLetter',axis=1)
clf = GaussianNB()
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation:",acc)

# Decision Trees and Random Forest

Preprocessing: 1) encode labels; 2) convert numerical data to categorical data, then encode it

Tip: use the same preprocessing as in the Naive Bayes section
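Once a tree is fitted, its learned rules can be inspected; a minimal sketch of scikit-learn's export_text on hypothetical toy data:

from sklearn.tree import DecisionTreeClassifier, export_text
import numpy as np
x_demo = np.array([[0, 1], [1, 0], [0, 0], [1, 1]])  # hypothetical binary features
y_demo = np.array([0, 1, 0, 1])                      # hypothetical labels
tree_demo = DecisionTreeClassifier(criterion='entropy').fit(x_demo, y_demo)
print(export_text(tree_demo, feature_names=['f0', 'f1']))  # prints the learned if/else split rules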

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
# API for DecisionTreeClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html?highlight=decisiontreeclassifier#sklearn.tree.DecisionTreeClassifier
# by hold-out evaluation
x_train, x_test, y_train, y_test = train_test_split(df_nb.drop('GradeLetter', axis=1), y_encoded, test_size=0.2) # drop the label column so it does not leak into the features
clf=DecisionTreeClassifier(criterion='entropy', max_depth=10) # note: there are many other parameters in the API
clf=clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
acc=accuracy_score(y_pred, y_test)
print('Tree Accuracy by hold-out evaluation: ',acc)
# by N-fold cross validation
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Tree Accuracy by N-fold Cross Validation:",acc)
# Example: bagging decision trees approximates a random forest
# (a true random forest additionally samples a random subset of features at each split)
tree = DecisionTreeClassifier()
# Note: you can evaluate the single tree alone or the bagged ensemble
# API, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html?highlight=baggingclassifier
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8, random_state=1)
acc=cross_val_score(bag, x, y, cv=5, scoring='accuracy').mean()
print("RandomForest Accuracy by N-fold Cross Validation:",acc)

# SVM

Preprocessing: same requirements as KNN; normalization is not strictly necessary

from sklearn.svm import SVC
# API for SVC
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html?highlight=svc#sklearn.svm.SVC
# by hold-out evaluation
x_train, x_test, y_train, y_test = train_test_split(df_knn.drop('GradeLetter', axis=1), y_encoded, test_size=0.2) # drop the label column so it does not leak into the features
clf=SVC(kernel='linear', C=1E10) # a large C -> hard margin; a small C -> soft margin
clf=clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
acc=accuracy_score(y_pred, y_test)
print('Accuracy by hold-out evaluation: ',acc)
x=df_knn.drop('GradeLetter',axis=1)
y=df_knn['GradeLetter']
# by N-fold cross validation
clf=SVC(kernel='poly', C=1E10)
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation (poly kernel):",acc)
clf=SVC(kernel='rbf', C=1E10)
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation (rbf kernel):",acc)

# Logistic Regression

Preprocessing: same requirements as KNN; normalization is not strictly necessary

API: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

import pandas as pd
import numpy as np
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# by hold-out evaluation
x_train, x_test, y_train, y_test = train_test_split(df_knn.drop('GradeLetter', axis=1), y_encoded, test_size=0.2) # drop the label column so it does not leak into the features
clf=LogisticRegression(penalty='l2',solver='lbfgs')
clf=clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
acc=accuracy_score(y_pred, y_test)
print('Accuracy by hold-out evaluation: ',acc)
x=df_knn.drop('GradeLetter',axis=1)
y=df_knn['GradeLetter']
# by N-fold cross validation
clf=LogisticRegression()
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation:",acc)

# Neural Networks

Preprocessing: same requirements as KNN; normalization is not strictly necessary

API: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

from sklearn.neural_network import MLPClassifier
# by hold-out evaluation
x_train, x_test, y_train, y_test = train_test_split(df_knn.drop('GradeLetter', axis=1), y_encoded, test_size=0.2) # drop the label column so it does not leak into the features
clf=MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(200,), random_state=1)
clf=clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
acc=accuracy_score(y_pred, y_test)
print('Accuracy by hold-out evaluation: ',acc)
x=df_knn.drop('GradeLetter',axis=1)
y=df_knn['GradeLetter']
# by N-fold cross validation
clf=MLPClassifier(solver='lbfgs', alpha=1e-4,hidden_layer_sizes=(100,8), random_state=1)
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation:",acc)

# Imbalance Solutions

Note that imbalance solutions can only be applied to the training set

API: https://imbalanced-learn.org/stable/

import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML
from collections import Counter
df=pd.read_csv('data_students.csv')
cols=df.columns
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))
# check the degree of imbalance in the labels
cf=df['GradeLetter'].value_counts()
crf=df['GradeLetter'].value_counts()/df.shape[0]
print("\nClass frequency:\n", cf, "\n\nClass relative frequency:\n", crf)
# get features and labels
x=df.drop('GradeLetter',axis=1)
y=df['GradeLetter']
# Install the imblearn library on Anaconda
# https://anaconda.org/conda-forge/imbalanced-learn
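# e.g., conda install -c conda-forge imbalanced-learn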
from imblearn.over_sampling import RandomOverSampler
# http://glemaitre.github.io/imbalanced-learn/generated/imblearn.over_sampling.RandomOverSampler.html
from imblearn.over_sampling import SMOTE
# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
from imblearn.under_sampling import RandomUnderSampler
# http://glemaitre.github.io/imbalanced-learn/generated/imblearn.under_sampling.RandomUnderSampler.html
ros = RandomOverSampler(random_state=10) # fit_resample below both fits and resamples, so a separate fit call is unnecessary
print('Original dataset shape {}'.format(Counter(y)))
x_resampled, y_resampled = ros.fit_resample(x, y)
print('After oversampling dataset shape {}'.format(Counter(y_resampled)))
print('Original dataset shape {}'.format(Counter(y)))
rus = RandomUnderSampler(random_state=30)
x_resampled, y_resampled = rus.fit_resample(x, y)
print('After undersampling dataset shape {}'.format(Counter(y_resampled)))
# get features and labels; SMOTE can only be applied to numerical features
x=df_knn.drop('GradeLetter',axis=1)
y=df_knn['GradeLetter']
sm = SMOTE(k_neighbors=2) # SMOTE synthesizes new minority samples by interpolating between nearest neighbors
print('Original dataset shape {}'.format(Counter(y)))
x_resampled, y_resampled = sm.fit_resample(x, y)
print('After oversampling by SMOTE dataset shape {}'.format(Counter(y_resampled)))
# Note that imbalance solutions are only applied to the training set.
# For N-fold evaluation, you must first split the data into train-test folds, then apply the solution within each training fold
from sklearn.model_selection import KFold
# N-fold data split
# API: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# Assume the last column is the label and the other columns are features
X = df_knn.loc[:, df_knn.columns!='GradeLetter']
y = df_knn.loc[:,'GradeLetter']
print(X.columns)
print(type(X))
print(type(y))
kf = KFold(n_splits=5, shuffle=True)
data_5folds = []
for train_index, test_index in kf.split(X,y):
    print("TRAIN:", train_index, "TEST:", test_index)
    # get the actual data by index
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # save the data for this fold
    fold = [x_train, x_test, y_train, y_test]
    # add each fold's data to the list of 5 folds
    data_5folds.append(fold)
for k in range(1, 24, 2): 
    acc_5folds = []
    for x_train, x_test, y_train, y_test in data_5folds:
        # oversample the training fold only; the test fold stays untouched
        ros = RandomOverSampler(random_state=10)
        x_resampled, y_resampled = ros.fit_resample(x_train, y_train)
        clf=neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
        clf.fit(x_resampled, y_resampled)
        y_pred = clf.predict(x_test)
        acc = accuracy_score(y_test, y_pred)
        acc_5folds.append(acc)
    print('k = ',k,'Accuracy on 5-folds: ', np.mean(acc_5folds))