using Plots, LinearAlgebra, Random, StatsBase, SparseArrays, CSV, DataFrames
We will rank sports teams using a variant of least squares, minimized with stochastic gradient descent.
## Read the data
# DataFrame is the sink type passed to CSV.read below.
using DataFrames
# Read the game records into `content`; the path is relative to the current
# working directory.
content = CSV.read("ncaa-2018.csv", DataFrame)
# This file is also available on the website, in the same directory.
Row | Date | Team | Team Location | Team Score | Opponent | Opponent Score | Opponent Location | Neutral Site | Team Result | Team Margin | Team Differential | Opponent Differential | Game Type |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
 | String15 | String | String7 | Int64 | String | Int64 | String7? | String15? | String7 | Int64 | Float64 | Float64 | String15 |
1 | 11/10/17 | Eastern Kentucky | Away | 73 | Rice | 72 | Home | missing | Win | 1 | -4.55 | -7.52 | Division 1 |
2 | 11/10/17 | Rice | Home | 72 | Eastern Kentucky | 73 | Away | missing | Loss | -1 | -7.52 | -4.55 | Division 1 |
3 | 11/10/17 | Southern Illinois-Edwardsville | Away | 74 | Purdue | 105 | Home | missing | Loss | -31 | -8.52 | 15.5 | Division 1 |
4 | 11/10/17 | Oral Roberts | Home | 86 | Avila | 72 | Away | missing | Win | 14 | -5.39 | 0.0 | Non Division 1 |
5 | 11/10/17 | Purdue | Home | 105 | Southern Illinois-Edwardsville | 74 | Away | missing | Win | 31 | 15.5 | -8.52 | Division 1 |
6 | 11/10/17 | Seattle | Away | 46 | Saint Louis | 62 | Home | missing | Loss | -16 | 3.73 | 0.39 | Division 1 |
7 | 11/10/17 | Florida Atlantic | Away | 59 | South Florida | 60 | Home | missing | Loss | -1 | -0.55 | -7.31 | Division 1 |
8 | 11/10/17 | Saint Louis | Home | 62 | Seattle | 46 | Away | missing | Win | 16 | 0.39 | 3.73 | Division 1 |
9 | 11/10/17 | Hampton | Away | 75 | Rider | 90 | Home | missing | Loss | -15 | 4.12 | 4.13 | Division 1 |
10 | 11/10/17 | Rider | Home | 90 | Hampton | 75 | Away | missing | Win | 15 | 4.13 | 4.12 | Division 1 |
11 | 11/10/17 | Greensboro | Away | 80 | North Carolina A&T | 104 | Home | missing | Loss | -24 | 0.0 | 0.91 | Non Division 1 |
12 | 11/10/17 | North Carolina A&T | Home | 104 | Greensboro | 80 | Away | missing | Win | 24 | 0.91 | 0.0 | Non Division 1 |
13 | 11/10/17 | Murray State | Home | 118 | Brescia | 61 | Away | missing | Win | 57 | 13.39 | 0.0 | Non Division 1 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
11813 | 3/10/18 | Pennsylvania | Neutral | 80 | Yale | 57 | missing | Neutral Site | Win | 23 | 7.75 | 0.77 | Division 1 |
11814 | 3/10/18 | Georgia State | Neutral | 73 | Georgia Southern | 67 | missing | Neutral Site | Win | 6 | 8.03 | 5.73 | Division 1 |
11815 | 3/11/18 | Georgia State | Neutral | 74 | Texas-Arlington | 61 | missing | Neutral Site | Win | 13 | 8.03 | 4.12 | Division 1 |
11816 | 3/11/18 | Tennessee | Neutral | 72 | Kentucky | 77 | missing | Neutral Site | Loss | -5 | 7.82 | 6.0 | Division 1 |
11817 | 3/11/18 | Texas-Arlington | Neutral | 61 | Georgia State | 74 | missing | Neutral Site | Loss | -13 | 4.12 | 8.03 | Division 1 |
11818 | 3/11/18 | Houston | Neutral | 55 | Cincinnati | 56 | missing | Neutral Site | Loss | -1 | 12.26 | 17.06 | Division 1 |
11819 | 3/11/18 | Cincinnati | Neutral | 56 | Houston | 55 | missing | Neutral Site | Win | 1 | 17.06 | 12.26 | Division 1 |
11820 | 3/11/18 | Pennsylvania | Neutral | 68 | Harvard | 65 | missing | Neutral Site | Win | 3 | 7.75 | 1.94 | Division 1 |
11821 | 3/11/18 | Harvard | Neutral | 65 | Pennsylvania | 68 | missing | Neutral Site | Loss | -3 | 1.94 | 7.75 | Division 1 |
11822 | 3/11/18 | Davidson | Neutral | 58 | Rhode Island | 57 | missing | Neutral Site | Win | 1 | 8.88 | 8.28 | Division 1 |
11823 | 3/11/18 | Kentucky | Neutral | 77 | Tennessee | 72 | missing | Neutral Site | Win | 5 | 6.0 | 7.82 | Division 1 |
11824 | 3/11/18 | Rhode Island | Neutral | 57 | Davidson | 58 | missing | Neutral Site | Loss | -1 | 8.28 | 8.88 | Division 1 |
## Convert teams into numbers and also build our game table
# Assign every team name an integer id (in order of first appearance) and
# build an integer table with one row per record of `content`:
#   column 1 = team id, column 2 = opponent id,
#   column 3 = team score, column 4 = opponent score.
m = size(content, 1)
teams_dict = Dict{String,Int}()  # team name => integer id
teams = Vector{String}()         # integer id => team name
index = 1                        # next unassigned team id
data = Array{Int64}(undef, m, 4)
for i = 1:m
    global index  # the loop body reassigns the top-level counter
    # DataFrames rejects the old `df[:col]` form with an ArgumentError, so
    # index by (row, column) instead.
    team = content[i, :Team]
    opponent = content[i, :Opponent]
    teamscore = content[i, Symbol("Team Score")]
    oppscore = content[i, Symbol("Opponent Score")]
    if !haskey(teams_dict, team)
        push!(teams, team)
        teams_dict[team] = index
        index = index + 1
    end
    if !haskey(teams_dict, opponent)
        push!(teams, opponent)
        teams_dict[opponent] = index
        index = index + 1
    end
    data[i, 1] = teams_dict[team]
    data[i, 2] = teams_dict[opponent]
    data[i, 3] = teamscore
    data[i, 4] = oppscore
end
ArgumentError: syntax df[column] is not supported use df[!, column] instead Stacktrace: [1] getindex(#unused#::DataFrame, #unused#::Symbol) @ DataFrames ~/.julia/packages/DataFrames/dgZn3/src/abstractdataframe/abstractdataframe.jl:2595 [2] top-level scope @ ./In[9]:9
"""
    SGD(fs, x)

State for stochastic gradient descent over a collection of loss terms.

# Fields
- `fs`: the loss terms; each is called as `f(x)` and its second output is
  used as the gradient.
- `x`: the current point.
"""
mutable struct SGD
    fs::Vector{Function}  # the terms we randomly sample from
    x::Vector             # current point
end
"""
    gradient(M::SGD)

Return the full gradient at `M.x`: the sum of the per-term gradients over
every entry of `M.fs`.
"""
function gradient(M::SGD)
    # Start from a zero vector with the same shape and element type as M.x.
    total = fill!(similar(M.x), zero(eltype(M.x)))
    for k in eachindex(M.fs)
        total .+= gradient(M, k)  # gradient contributed by term k
    end
    return total
end
"""
    gradient(M::SGD, i::Int)

Evaluate the `i`-th term of `M.fs` at `M.x` and return its second output.
"""
function gradient(M::SGD, i::Int)
    term = M.fs[i]
    outputs = term(M.x)
    return outputs[2]
end
## Setup least squares
"""
    sports_loss(x, i, j, s)

Squared-error loss for a single game, `0.5 * (x[i] - x[j] - s)^2`.

# Arguments
- `x`: vector indexed by `i` and `j`.
- `i`, `j`: indices into `x`.
- `s`: target value for the difference `x[i] - x[j]`.

# Returns
A tuple `(f, g)`: the loss value `f` and its gradient `g` with respect to
`x`, as a dense vector of length `length(x)`.
"""
function sports_loss(x, i, j, s)
    d = x[i] - x[j] - s
    f = 0.5 * d^2
    g = zeros(length(x))
    # Accumulate instead of assigning: when i == j the two contributions
    # cancel, giving the correct zero gradient (the loss is then constant
    # in x).
    g[i] += d
    g[j] -= d
    return f, g
end
##
# One loss closure per game: row k of `data` supplies the two team ids and
# the score difference (column 3 minus column 4). Start from the zero vector.
opt = SGD(
    [
        x -> sports_loss(x, data[k, 1], data[k, 2], data[k, 3] - data[k, 4])
        for k in 1:size(data, 1)
    ],
    zeros(length(teams_dict)),
)
##
g = gradient(opt)
# To inspect a single term: opt.fs[1](opt.x)
UndefVarError: data not defined Stacktrace: [1] top-level scope @ In[4]:26
## Validate the gradient
# Build the least-squares model explicitly and compare its gradient at the
# zero vector with the one accumulated from the individual loss terms.
m = size(data, 1)
p = Float64[data[i, 3] - data[i, 4] for i in 1:m]  # score difference per row
B = zeros(m, length(teams))
for i in 1:m
    B[i, data[i, 1]] = 1    # team column
    B[i, data[i, 2]] = -1   # opponent column
end
gtest = B' * (-p)
norm(g - gtest)
UndefVarError: data not defined Stacktrace: [1] top-level scope @ In[5]:2
"""
    sgd_step!(m::SGD, alpha)

Take one stochastic gradient step in place: draw one entry of `m.fs` at
random, evaluate it at `m.x`, and subtract `alpha` times its second output
from `m.x`. Returns the updated `m.x`.
"""
function sgd_step!(m::SGD, alpha)
    sampled = rand(m.fs)       # pick a random term
    outputs = sampled(m.x)
    m.x .-= alpha * outputs[2]
    return m.x
end
# Run 100 rounds of 100 SGD steps each, recording the norm of the full
# gradient after every round.
norms = Float64[]
for _ in 1:100
    for _ in 1:100
        sgd_step!(opt, 0.001)  # open question: how should alpha be set?
    end
    push!(norms, norm(gradient(opt)))
end
UndefVarError: opt not defined Stacktrace: [1] top-level scope @ ./In[6]:8
# Recorded gradient norms on a log-scaled y axis.
plot(norms; yscale=:log10, ylabel="norm(g)", label="")
┌ Warning: No strict ticks found └ @ PlotUtils ~/.julia/packages/PlotUtils/bZEEj/src/ticks.jl:191 ┌ Warning: No strict ticks found └ @ PlotUtils ~/.julia/packages/PlotUtils/bZEEj/src/ticks.jl:191