# Data Science Libraries in Python

# NumPy
- introduces objects for multidimensional arrays, vectors and matrices, as well as functions that make it easy to perform advanced mathematical and statistical operations on those objects
- provides vectorization of mathematical operations on arrays and matrices, which significantly improves performance
- many other Python libraries are built on NumPy

# Pandas
- adds data structures (data frames) and tools designed to work with table-like data
- provides tools for data manipulation: reshaping, merging, sorting, slicing, aggregation, etc.
- allows handling of missing data

# SciPy
- collection of algorithms for linear algebra, differential equations, numerical integration, optimization, statistics and more
- part of the SciPy Stack
- built on NumPy
- SciPy and NumPy are usually used for matrix-based operations, such as matrix factorization

# SciKit-Learn
- provides machine learning algorithms: classification, regression, clustering, model validation, etc.
- built on NumPy, SciPy and matplotlib
# Visualization libraries

# matplotlib
- Python 2D plotting library which produces publication-quality figures in a variety of hardcopy formats
- a set of functionalities similar to those of MATLAB
- line plots, scatter plots, bar charts, histograms, pie charts, etc.
- relatively low-level; some effort is needed to create advanced visualizations

# Seaborn
- based on matplotlib
- provides a high-level interface for drawing attractive statistical graphics
- similar (in style) to the popular ggplot2 library in R
VIDEO 
# Pandas

# Introduction
Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a DataFrame.
DataFrames are essentially multidimensional arrays with attached row and column labels, and often with heterogeneous types and/or missing data.
# Basic data structure
# Series
# A Series is a one-dimensional array of indexed data. The index -> value
# mapping is similar to key -> value in a Python 3 dict. A Series can be
# created from a dict or from an array/list.
import numpy as np
import pandas as pd

print('Data Structure: Series')

# Series built from a list: pandas assigns the default integer index.
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)
print(data.values)
print(data.index)
print(data[1:4])

# Same values with explicit string labels as the index.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data)
print(data.values)
print(data.index)
print(data[1:4])
print(data['c'])

# The index does not have to be contiguous or sorted.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
print(data)
print(data.values)
print(data.index)

# Series built from a dict: the keys become the index.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
data = pd.Series(population_dict)
print(data)
print(data.values)
print(data.index)

# Arithmetic aligns on index labels: labels present in only one operand
# give NaN with `+`, while `add(..., fill_value=0)` treats the missing
# side as 0.
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A)
print(B)
print(A + B)
C = A.add(B, fill_value=0)
print(C)
# DataFrame
# A DataFrame is a generalized array or a specialization of dict. It can be
# viewed as a table whose columns may store different data types.
print('\nData Structure: DataFrame')

population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)

area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)

direction_dict = {
    'California': 'West',
    'Texas': 'Middle',
    'New York': 'East',
    'Florida': 'East',
    'Illinois': 'Middle',
}
direction = pd.Series(direction_dict)

# DataFrame from a dict of Series: dict keys become the column names.
states = pd.DataFrame(
    {'population': population, 'area': area, 'direction': direction}
)
print(states)
print(states.index)
print(states['area'])
print(states[1:3])

# DataFrame from a single Series.
df = pd.DataFrame(population, columns=['population'])
print(df)

# DataFrame from a dict of Series.
df = pd.DataFrame(
    {'population': population, 'area': area, 'direction': direction}
)
print(df)

# DataFrame from a two-dimensional NumPy array with explicit labels.
df = pd.DataFrame(
    np.random.rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c']
)
print(df)

# DataFrame from a list of dicts: keys missing from a record become NaN.
df = pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])
print(df)
# Index
# Index is the object associated with Series and DataFrame.
# It can be viewed as an immutable array (i.e., it cannot be modified) or as
# an ordered set.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

# Use the explicit set methods. In pandas 2.0+ the operators `&`, `|` and `^`
# on Index objects act element-wise (logical/bitwise) instead of as set
# operations, so they no longer give intersection/union/difference.
ind = indA.intersection(indB)  # elements present in both
print(ind)
ind = indA.union(indB)  # elements present in either
print(ind)
ind = indA.symmetric_difference(indB)  # elements present in exactly one
print(ind)
# Slicing data in a DataFrame
# - loc and iloc slice rows by default.
# - To slice columns, use e.g. df.loc[:, [1, 2, 3]].
# - Index -1 can be used in iloc, but not in loc.
# - iloc accepts only index numbers (positions); loc is label-based, so it
#   accepts labels, and index numbers only when they are the actual labels.
# - ix is not recommended: it was deprecated in pandas 0.20 and has since
#   been removed.

# Column selection by name (single column, then a list of columns).
print(states['area'])
print(states[['area', 'direction']])

# Row slicing with plain []: by position, then by label (the label slice
# includes its end point).
print(states[1:3])
print(states['Florida':'Illinois'])

# Rows and columns together: positions with iloc, labels with loc.
print(states.iloc[1:3, 1:3])
print(states.loc[['Texas', 'New York'], ['area', 'direction']])
# NaN, None and missing values in Pandas
# - None is a general (Python object) marker for missing data.
# - NaN can be interpreted as missing numerical data of float type.

# Both None and np.nan are reported as missing in a numeric Series.
s = pd.Series([1, np.nan, 2, None])
print(s)
print(s.isnull())
print(s.dropna())

# dropna() removes rows containing a missing value by default;
# axis='columns' removes the columns that contain one instead.
df = pd.DataFrame([[1, np.nan, 2], [2, 3, 5], [np.nan, 4, 6]])
print(df)
print(df.dropna())
print(df.dropna(axis='columns'))

# fillna() replaces missing entries with the given value.
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
print(data)
print(data.fillna(0))
# File operations by Pandas
import pandas as pd

# NOTE(review): this data file is spelled with different capitalization
# elsewhere in this document ('data_students.csv', 'Data_Students.csv');
# confirm the real file name, since it matters on case-sensitive systems.

# read_csv and read_table (with an explicit ',' separator) load the same file.
df1 = pd.read_csv('Data_students.csv')
print(type(df1))
df2 = pd.read_table('Data_students.csv', sep=',')
print(type(df2))

# Peek at the first and last rows.
print(df1.head(2))
print(df1.tail(3))

# Column labels, as an Index and as a plain list.
print(df1.columns)
print(type(df1.columns))
col_list = df1.columns.tolist()
print(col_list)
print(type(col_list))

print('\nselect data by rows')
print(df1[1:3])
print(df1.columns)

print('\nselect data by columns')
cols = ['Age', 'Gender', 'Grade']
print(df1[cols])
print(df1['Age'].mean())
# Data Preprocessing

# Introduction
Data preprocessing may include the following operations:
- loading files
- dealing with missing values
- slicing data
- data normalization
- data smoothing
- data transformation, numerical to categorical
- data transformation, categorical to numerical
- feature selection
- feature reduction
- some special preprocessing, such as the operations in text mining, e.g., stopword removal, tokenization, TF-IDF weighting

The following operations use Data_Students.csv as the data set.
# Deal with missing values
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import seaborn as sns
from IPython.display import display, HTML

# NOTE(review): this data file is spelled with different capitalization
# elsewhere in this document ('Data_students.csv', 'Data_Students.csv');
# confirm the real file name, since it matters on case-sensitive systems.
df = pd.read_csv('data_students.csv')
cols = df.columns
print(df.shape)
print(df.dtypes)

# Per-column report: name, dtype and whether any value is missing.
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

display(HTML(df.head(10).to_html()))

# Pairwise scatter plots, coloured by the 'GradeLetter' column.
sns.set()
sns.pairplot(df, hue='GradeLetter', height=2)
# Replace missing numeric values with the mean of their column.
mean_age = df['Age'].mean(skipna=True)
mean_hr_assignment = df['Hours on Assignments'].mean(skipna=True)
mean_hr_game = df['Hours on Games'].mean(skipna=True)
mean_exam = df['Exam'].mean(skipna=True)
mean_grade = df['Grade'].mean(skipna=True)

# Assign the filled column back instead of calling
# `df[col].fillna(..., inplace=True)`: the chained in-place form works on an
# intermediate object, is deprecated in recent pandas, and leaves `df`
# unchanged when copy-on-write is enabled.
df['Age'] = df['Age'].fillna(mean_age)
df['Hours on Assignments'] = df['Hours on Assignments'].fillna(mean_hr_assignment)
df['Hours on Games'] = df['Hours on Games'].fillna(mean_hr_game)
df['Exam'] = df['Exam'].fillna(mean_exam)
df['Grade'] = df['Grade'].fillna(mean_grade)

# Re-run the per-column report to confirm what is still missing.
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

display(HTML(df.head(10).to_html()))
# Normalization
from sklearn.preprocessing import MinMaxScaler

# Collect the numeric columns (by dtype name) and their positions.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df.select_dtypes(include=numerics).columns.tolist()
cols_numeric_index = [df.columns.get_loc(col) for col in cols_numeric]
print('Numerical column names:\n', cols_numeric)
print('Numerical column indeices:\n', cols_numeric_index)
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

# Keep a copy for the manual normalization below.
df_norm = df.copy(deep=True)

# Option 1: min-max scaling with scikit-learn, written back into `df`.
scaler = MinMaxScaler()
df[cols_numeric] = scaler.fit_transform(df[cols_numeric])
display(HTML(df.head(10).to_html()))

# Option 2: the same min-max formula written out by hand. It reads from `df`,
# which has already been scaled by the step above.
for col in cols_numeric:
    df_norm[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

# Drop the 'ID' column. The keyword form is required: the positional `axis`
# argument of DataFrame.drop was removed in pandas 2.0.
df_norm = df_norm.drop(columns='ID')
df_norm.head(10)
# Data transformation: categorical to numerical (one-hot encoding).
df_transform = df_norm.copy(deep=True)
display(HTML(df_transform.head(5).to_html()))
print(df_transform['Degree'].dtype)
print(df_transform['Nationality'].dtype)

# One indicator column per distinct value of each categorical column.
df_dummies_degree = pd.get_dummies(df_transform['Degree'])
print(df_dummies_degree.head(5))
df_dummies_nation = pd.get_dummies(df_transform['Nationality'])

# Attach the indicator columns, then remove the original categorical columns.
# `columns=` is used because the positional `axis` argument of DataFrame.drop
# was removed in pandas 2.0.
df_transform = df_transform.join(df_dummies_degree)
df_transform = df_transform.join(df_dummies_nation)
df_transform = df_transform.drop(columns='Degree')
df_transform = df_transform.drop(columns='Nationality')
display(HTML(df_transform.head(5).to_html()))

# Drop one indicator per category group (presumably to avoid a redundant
# column). NOTE(review): the labels ' PHD' and ' China' carry a leading
# space; this presumably matches the raw CSV values -- verify against the data.
df_transform = df_transform.drop(columns=' PHD')
df_transform = df_transform.drop(columns=' China')
display(HTML(df_transform.head(5).to_html()))
# Feature selection
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier

display(HTML(df_transform.head(10).to_html()))

# Features (x) and target label (y). `columns=` is used because the
# positional `axis` argument of DataFrame.drop was removed in pandas 2.0.
x = df_transform.drop(columns='GradeLetter')
y = df_transform['GradeLetter']

# Filter model: correlation heat map, then keep the features whose absolute
# correlation with 'Grade' exceeds 0.5.
# numeric_only=True skips non-numeric columns; pandas 2.0+ no longer drops
# them silently and raises instead.
plt.figure(figsize=(12, 10))
cor = df_transform.corr(numeric_only=True)
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

cor_target = abs(cor["Grade"])
relevant_features = cor_target[cor_target > 0.5]
print('\nSelected features by Filter model:\n', relevant_features)

# Wrapper model: rank features by the importances of a tree ensemble fitted
# on the classification target.
display(HTML(x.head(10).to_html()))
model = ExtraTreesClassifier()
model.fit(x, y)

values = model.feature_importances_.tolist()
keys = x.columns.tolist()
d = dict(zip(keys, values))
# Most important feature first.
s = sorted(d.items(), key=lambda item: item[1], reverse=True)
print('\nSelected features by Wrapper model (classification):\n')
for k, v in s:
    print(k, '\t', v)
# Feature reduction
from sklearn.decomposition import PCA

display(HTML(x.head(10).to_html()))

# One-hot encode 'Nationality' only if it is still present. The
# transformation step earlier in this document already encodes and drops
# that column, so an unconditional lookup raises KeyError when the document
# is run top to bottom.
if 'Nationality' in df_transform.columns:
    df_dummies_nationality = pd.get_dummies(df_transform['Nationality'])
    print(df_dummies_nationality.head(5))
    df_transform = df_transform.join(df_dummies_nationality)
    # `columns=` is used because the positional `axis` argument of
    # DataFrame.drop was removed in pandas 2.0.
    df_transform = df_transform.drop(columns='Nationality')
    df_transform = df_transform.drop(columns=' China')

x = df_transform.drop(columns='GradeLetter')
y = df_transform['GradeLetter']
display(HTML(x.head(10).to_html()))

# Fit PCA with 10 components and report what each one explains.
pca = PCA(n_components=10)
fit = pca.fit(x)
print('Explained variance: ', fit.explained_variance_ratio_)
print('\nPCAs:\n', fit.components_)

# Project the data and keep the first three principal components.
PCAs = pca.fit_transform(x)
PCAs_selected = PCAs[:, :3]
df_PCAs = pd.DataFrame(data=PCAs_selected, columns=['PC1', 'PC2', 'PC3'])
# NOTE(review): 'GraderLetter' looks like a typo for 'GradeLetter'. It is
# kept because it becomes a column name in the exported CSV; confirm before
# renaming.
df_PCAs['GraderLetter'] = y
display(HTML(df_PCAs.head(10).to_html()))
df_PCAs.to_csv('Data_Students_PCA.csv', sep=',')
# Data Splits: Examples
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns

print(df.columns)

# Features are every column except the label column 'GradeLetter'.
X = df.loc[:, df.columns != 'GradeLetter']
y = df.loc[:, 'GradeLetter']
print(X.columns)
print(type(X))
print(type(y))

# Hold-out split: 80% train / 20% test.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 5-fold split: collect [x_train, x_test, y_train, y_test] for every fold.
kf = KFold(n_splits=5, shuffle=True)
data_5folds = []
for train_index, test_index in kf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)

    # kf.split yields positional indices, so both X and y are indexed with
    # iloc. Plain y[train_index] is label-based and only matches when the
    # index is the default 0..n-1 range.
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    fold = [x_train, x_test, y_train, y_test]
    data_5folds.append(fold)