In [1]:
## Height Weight data
# Grab data from https://www.kaggle.com/mustafaali96/weight-height
# https://opendata.stackexchange.com/questions/7793/age-weight-and-height-dataset
# https://www.reddit.com/r/datasets/comments/1dii3t/data_sets_for_heights_and_weights/

# https://vincentarelbundock.github.io/Rdatasets/datasets.html

# a related tutorial
# http://goelhardik.github.io/2016/10/04/fishers-lda/
In [2]:
##
using Plots
In [3]:
##
# https://vincentarelbundock.github.io/Rdatasets/csv/carData/Davis.csv
using CSV
df = CSV.read(download("https://vincentarelbundock.github.io/Rdatasets/csv/carData/Davis.csv"))
X = Float64.([df[:weight] df[:height]])
┌ Warning: `getindex(df::DataFrame, col_ind::ColumnIndex)` is deprecated, use `df[!, col_ind]` instead.
│   caller = top-level scope at In[3]:5
└ @ Core In[3]:5
┌ Warning: `getindex(df::DataFrame, col_ind::ColumnIndex)` is deprecated, use `df[!, col_ind]` instead.
│   caller = top-level scope at In[3]:5
└ @ Core In[3]:5
Out[3]:
200×2 Array{Float64,2}:
  77.0  182.0
  58.0  161.0
  53.0  161.0
  68.0  177.0
  59.0  157.0
  76.0  170.0
  76.0  167.0
  69.0  186.0
  71.0  178.0
  65.0  171.0
  70.0  175.0
 166.0   57.0
  51.0  161.0
   ⋮         
  76.0  183.0
  50.0  158.0
  88.0  185.0
  89.0  173.0
  59.0  164.0
  51.0  156.0
  62.0  164.0
  74.0  175.0
  83.0  180.0
  81.0  175.0
  90.0  181.0
  79.0  177.0
In [4]:
##
g = df[:sex]
┌ Warning: `getindex(df::DataFrame, col_ind::ColumnIndex)` is deprecated, use `df[!, col_ind]` instead.
│   caller = top-level scope at In[4]:1
└ @ Core In[4]:1
Out[4]:
200-element CSV.Column{String,PooledString}:
 "M"
 "F"
 "F"
 "M"
 "F"
 "M"
 "M"
 "M"
 "M"
 "M"
 "M"
 "F"
 "F"
 ⋮  
 "M"
 "F"
 "M"
 "M"
 "F"
 "F"
 "F"
 "M"
 "M"
 "M"
 "M"
 "M"
In [5]:
##
scatter(X[:,1],X[:,2])
xlabel!("Weight")
ylabel!("Height")
gui()
In [6]:
## Filter the data
goodpts = X[:,1] .<= 115
Xf = X[goodpts,:]
gf = g[goodpts]
Out[6]:
198-element Array{String,1}:
 "M"
 "F"
 "F"
 "M"
 "F"
 "M"
 "M"
 "M"
 "M"
 "M"
 "M"
 "F"
 "F"
 ⋮  
 "M"
 "F"
 "M"
 "M"
 "F"
 "F"
 "F"
 "M"
 "M"
 "M"
 "M"
 "M"
In [7]:
##
scatter(Xf[:,1],Xf[:,2])
xlabel!("Weight")
ylabel!("Height")
gui()
In [8]:
## Partition by classes
s1 = gf .== "F"
s2 = gf .== "M"
X1 = Xf[s1,:]
X2 = Xf[s2,:]
scatter(X1[:,1],X1[:,2])
scatter!(X2[:,1],X2[:,2])
gui()
In [9]:
##
using Statistics
m = mean(Xf,dims=1)
m1 = mean(X1,dims=1)
m2 = mean(X2,dims=1)
n1 = size(X1,1)
n2 = size(X2,1)

B = (m1-m)'*n1*(m1-m) + (m2-m)'*n2*(m2-m)
W = (X1.-m1)'*(X1.-m1) + (X2.-m2)'*(X2.-m2)
Out[9]:
2×2 Array{Float64,2}:
 15643.6   5719.83
  5719.83  7158.18
In [10]:
## The matrices B and W add up to the total covariance
using LinearAlgebra
C = (Xf.-m)'*(Xf.-m)
norm(B+W - C)
Out[10]:
3.279229782005122e-11
In [11]:
##
lams,V = eigen(B,W)
Out[11]:
GeneralizedEigen{Float64,Float64,Array{Float64,2},Array{Float64,1}}
eigenvalues:
2-element Array{Float64,1}:
 2.220446049250313e-16
 1.4771693189891801   
eigenvectors:
2×2 Array{Float64,2}:
  0.00857482  -0.0040965 
 -0.0119468   -0.00739159
In [12]:
##
v = V[:,2]
p1 = histogram((X1)*v,nbins=10)
histogram!((X2)*v,nbins=10)
gui()
In [13]:
## Compare to PCA
lams,V = eigen(C)
v2 = V[:,2]
p2 = histogram(X1*v2,nbins=10)
histogram!(X2*v2,nbins=10)
plot(p1,p2)
gui()
In [14]:
##
X1c = X1.-m
X2c = X2.-m
scatter(X1c[:,1],X1c[:,2])
scatter!(X2c[:,1],X2c[:,2])
Plots.abline!(v[2]/v[1],0,label="LDA")
Plots.abline!(v2[2]/v2[1],0,label="PCA")
gui()
In [15]:
##
X1c = X1.-m
X2c = X2.-m
scatter(X1c[:,1],X1c[:,2])
scatter!(X2c[:,1],X2c[:,2])
Plots.abline!(v2[2]/v2[1],0)
gui()