In [1]:
using Plots, LinearAlgebra, Random, StatsBase, SparseArrays, CSV, DataFrames, Tables
In [2]:
# Load the raw NCAA 2023 game file (no header row) and convert the DataFrame
# into a vector of per-game NamedTuples for row-wise processing.
content = rowtable(CSV.read("ncaa-2023.csv", DataFrame, header=false))
Out[2]:
5610-element Vector{@NamedTuple{Column1::String, Column2::String15, Column3::String, Column4::Float64, Column5::Float64, Column6::Int64, Column7::Int64, Column8::Int64, Column9::Int64, Column10::Float64, Column11::Float64}}:
 (Column1 = "Mississippi Valley St.Baylor11-7", Column2 = "11/7/22", Column3 = "Baylor -37.0, 95-58 (100%)", Column4 = 58.08219264, Column5 = 95.10164256, Column6 = 53, Column7 = 117, Column8 = 355, Column9 = 2, Column10 = 72.61017799, Column11 = 29.83936953)
 (Column1 = "BresciaMiddle Tennessee11-7", Column2 = "11/7/22", Column3 = "Middle Tennessee (100%)", Column4 = 51.57063963, Column5 = 92.52730733, Column6 = 52, Column7 = 79, Column8 = 0, Column9 = 100, Column10 = 70.6356741, Column11 = 0.55918372)
 (Column1 = "PacificStanford11-7", Column2 = "11/7/22", Column3 = "Stanford -19.1, 82-62 (96%)", Column4 = 62.4924586, Column5 = 81.59700436, Column6 = 78, Column7 = 88, Column8 = 254, Column9 = 37, Column10 = 70.3686906, Column11 = 35.83878057)
 (Column1 = "Valley ForgeJames Madison11-7", Column2 = "11/7/22", Column3 = "James Madison (100%)", Column4 = 56.12414098, Column5 = 95.1918094, Column6 = 38, Column7 = 123, Column8 = 0, Column9 = 157, Column10 = 72.10854632, Column11 = -2.130669557)
 (Column1 = "JWU (Charlotte)UNC Greensboro11-7", Column2 = "11/7/22", Column3 = "UNC Greensboro (100%)", Column4 = 50.26946273, Column5 = 89.43981154, Column6 = 60, Column7 = 93, Column8 = 0, Column9 = 108, Column10 = 69.30809531, Column11 = -0.968387195)
 (Column1 = "HarvardMorehouse11-7", Column2 = "11/7/22", Column3 = "Harvard (100%)", Column4 = 82.66080133, Column5 = 54.41964239, Column6 = 68, Column7 = 63, Column8 = 164, Column9 = 0, Column10 = 70.69162206, Column11 = -3.1999939)
 (Column1 = "DefianceOakland11-7", Column2 = "11/7/22", Column3 = "Oakland (100%)", Column4 = 56.31633105, Column5 = 90.91803277, Column6 = 27, Column7 = 92, Column8 = 0, Column9 = 213, Column10 = 71.1807591, Column11 = -6.139759419)
 (Column1 = "Coppin St.Charlotte11-7", Column2 = "11/7/22", Column3 = "Charlotte -10.9, 83-72 (84%)", Column4 = 71.73747765, Column5 = 82.68669043, Column6 = 59, Column7 = 82, Column8 = 304, Column9 = 180, Column10 = 73.97837139, Column11 = 33.15820053)
 (Column1 = "La SalleVillanova11-7", Column2 = "11/7/22", Column3 = "Villanova -15.2, 78-63 (92%)", Column4 = 62.90763241, Column5 = 78.1192195, Column6 = 68, Column7 = 81, Column8 = 156, Column9 = 17, Column10 = 69.98142267, Column11 = 44.63969223)
 (Column1 = "Fort WayneMichigan11-7", Column2 = "11/7/22", Column3 = "Michigan -13.0, 81-68 (88%)", Column4 = 67.65212191, Column5 = 80.61596127, Column6 = 56, Column7 = 75, Column8 = 147, Column9 = 38, Column10 = 70.80633829, Column11 = 47.29321085)
 (Column1 = "FisherUMass Lowell11-7", Column2 = "11/7/22", Column3 = "UMass Lowell (100%)", Column4 = 55.4776031, Column5 = 88.7214148, Column6 = 43, Column7 = 108, Column8 = 0, Column9 = 228, Column10 = 71.04435062, Column11 = -6.841735024)
 (Column1 = "New OrleansButler11-7", Column2 = "11/7/22", Column3 = "Butler -17.5, 84-66 (94%)", Column4 = 66.37333751, Column5 = 83.91084458, Column6 = 53, Column7 = 89, Column8 = 321, Column9 = 110, Column10 = 74.00732387, Column11 = 28.93818721)
 (Column1 = "Lebanon ValleyBucknell11-7", Column2 = "11/7/22", Column3 = "Bucknell (100%)", Column4 = 58.35162454, Column5 = 90.7786648, Column6 = 52, Column7 = 113, Column8 = 0, Column9 = 267, Column10 = 71.31113414, Column11 = -8.568614778)
 ⋮
 (Column1 = "EvansvilleIllinois St.2-26", Column2 = "2/26/23", Column3 = "Illinois St. -9.1, 73-64 (82%)", Column4 = 64.11506566404985, Column5 = 73.22360392063257, Column6 = 53, Column7 = 72, Column8 = 348, Column9 = 287, Column10 = 67.70143746278178, Column11 = 21.597462385507335)
 (Column1 = "WisconsinMichigan2-26", Column2 = "2/26/23", Column3 = "Michigan -4.9, 66-61 (71%)", Column4 = 61.457675924637904, Column5 = 66.35741568390314, Column6 = 79, Column7 = 87, Column8 = 68, Column9 = 47, Column10 = 63.45026112919709, Column11 = 60.362252415967916)
 (Column1 = "Wichita St.Tulane2-26", Column2 = "2/26/23", Column3 = "Tulane -5.8, 78-72 (71%)", Column4 = 72.4234967579466, Column5 = 78.19234548772916, Column6 = 83, Column7 = 76, Column8 = 95, Column9 = 81, Column10 = 71.85067938277956, Column11 = 61.39030324895765)
 (Column1 = "UCLAColorado2-26", Column2 = "2/26/23", Column3 = "UCLA -6.7, 70-63 (76%)", Column4 = 70.12275807975539, Column5 = 63.3954204019498, Column6 = 60, Column7 = 56, Column8 = 3, Column9 = 80, Column10 = 69.07916008700157, Column11 = 67.59068192427755)
 (Column1 = "DrakeBradley2-26", Column2 = "2/26/23", Column3 = "Bradley -2.2, 67-65 (59%)", Column4 = 64.91285968915885, Column5 = 67.07176003739494, Column6 = 61, Column7 = 73, Column8 = 54, Column9 = 72, Column10 = 65.04641652008074, Column11 = 68.3591369545626)
 (Column1 = "UCFTulsa2-26", Column2 = "2/26/23", Column3 = "UCF -9.7, 74-64 (83%)", Column4 = 74.0566060873923, Column5 = 64.34699603415602, Column6 = 68, Column7 = 49, Column8 = 66, Column9 = 287, Column10 = 65.74941442865234, Column11 = 38.310522657010495)
 (Column1 = "ValparaisoMurray St.2-26", Column2 = "2/26/23", Column3 = "Murray St. -6.4, 76-70 (73%)", Column4 = 70.0319843529249, Column5 = 76.4250223649765, Column6 = 76, Column7 = 77, Column8 = 273, Column9 = 223, Column10 = 67.29922491001606, Column11 = 36.508178442747095)
 (Column1 = "RutgersPenn St.2-26", Column2 = "2/26/23", Column3 = "Penn St. -1.7, 67-65 (57%)", Column4 = 65.49269945316647, Column5 = 67.16562140473204, Column6 = 59, Column7 = 56, Column8 = 30, Column9 = 51, Column10 = 63.96814012860589, Column11 = 73.0180743188931)
 (Column1 = "WashingtonStanford2-26", Column2 = "2/26/23", Column3 = "Stanford -4.6, 73-69 (68%)", Column4 = 68.8418690708591, Column5 = 73.40993207092959, Column6 = 69, Column7 = 81, Column8 = 108, Column9 = 95, Column10 = 67.54958884796666, Column11 = 58.28396042896583)
 (Column1 = "Cal BaptistStephen F. Austin2-26", Column2 = "2/26/23", Column3 = "Stephen F. Austin -4.8, 70-65 (69%)", Column4 = 65.32491295566658, Column5 = 70.08624698312488, Column6 = 58, Column7 = 80, Column8 = 139, Column9 = 124, Column10 = 65.70546229716118, Column11 = 50.783821804477824)
 (Column1 = "WashingtonStanford2-26", Column2 = "2/26/23", Column3 = "Stanford -4.6, 73-69 (68%)", Column4 = 68.8418690708591, Column5 = 73.40993207092959, Column6 = 69, Column7 = 81, Column8 = 108, Column9 = 95, Column10 = 67.54958884796666, Column11 = 58.28396042896583)
 (Column1 = "Cal BaptistStephen F. Austin2-26", Column2 = "2/26/23", Column3 = "Stephen F. Austin -4.8, 70-65 (69%)", Column4 = 65.32491295566658, Column5 = 70.08624698312488, Column6 = 58, Column7 = 80, Column8 = 139, Column9 = 124, Column10 = 65.70546229716118, Column11 = 50.783821804477824)
In [3]:
"""
    parse_game(gameid, predict) -> (team1, team2)

Split a concatenated game identifier into its two team names.

Games are stored in a weird concatenated format, e.g.

    gameid  = "Coppin St.Charlotte11-7"
    predict = "Charlotte -10.9, 83-72 (84%)"

The leading token of `predict` is one team's name (the predicted winner);
we use it to split `gameid` — after stripping the trailing date digits —
into the two teams. Returns `(team1, team2)` where `team1` came from the
prediction string.

Throws an informative error when either string fails to parse (the
original code printed the offending string and then crashed with a
confusing `MethodError` on `nothing`).
"""
function parse_game(gameid, predict)
    # First capture group: the team name token at the front of the prediction.
    m = match(r"([\w.' ]+) ([ -1234567890,.\(\)%]+)", predict)
    m === nothing && error("could not parse prediction string: $predict")
    team1 = m.captures[1]
    # Strip the trailing date digits (e.g. "11-7") from the game id.
    m2 = match(r"^([a-zA-Z\(\)&'-. ]+)[\d-]+$", gameid)
    m2 === nothing && error("could not parse game id: $gameid")
    gameid = m2.captures[1]
    # Whatever remains after removing team1 from the id is the opponent.
    team2 = filter!(x -> length(x) > 0, split(gameid, team1))[1]
    # NOTE(review): team2 may still contain stray symbols for unusual
    # team names — acceptable for this dataset.
    return team1, team2
end
# Parse every raw row into a tidy record.  Column1 is the concatenated game
# id, Column3 the prediction string; Column6/Column7 are the two final
# scores (the id-leading team's score first, the predicted team's second).
gamedata = map(content) do row
    fav, other = parse_game(row.Column1, row.Column3)
    return (team=fav, opponent=other, date=row.Column2,
            teamscore=row.Column7, opponentscore=row.Column6)
end |> DataFrame
Out[3]:
5610×5 DataFrame
5585 rows omitted
Rowteamopponentdateteamscoreopponentscore
SubStrin…SubStrin…String15Int64Int64
1BaylorMississippi Valley St.11/7/2211753
2Middle TennesseeBrescia11/7/227952
3StanfordPacific11/7/228878
4James MadisonValley Forge11/7/2212338
5UNC GreensboroJWU (Charlotte)11/7/229360
6HarvardMorehouse11/7/226368
7OaklandDefiance11/7/229227
8CharlotteCoppin St.11/7/228259
9VillanovaLa Salle11/7/228168
10MichiganFort Wayne11/7/227556
11UMass LowellFisher11/7/2210843
12ButlerNew Orleans11/7/228953
13BucknellLebanon Valley11/7/2211352
⋮⋮⋮⋮⋮⋮
5599Illinois St.Evansville2/26/237253
5600MichiganWisconsin2/26/238779
5601TulaneWichita St.2/26/237683
5602UCLAColorado2/26/235660
5603BradleyDrake2/26/237361
5604UCFTulsa2/26/234968
5605Murray St.Valparaiso2/26/237776
5606Penn St.Rutgers2/26/235659
5607StanfordWashington2/26/238169
5608Stephen F. AustinCal Baptist2/26/238058
5609StanfordWashington2/26/238169
5610Stephen F. AustinCal Baptist2/26/238058

Least-squares fitting with Stochastic Gradient Descent

We will rank sports teams using a variant of least squares.

In [4]:
## Convert teams into numbers and also build our game table
# data[i, :] = (team id, opponent id, team score, opponent score)
m = size(content,1)
teams_dict = Dict{String,Int}()       # team name -> integer id
teams = Vector{String}()              # id -> team name (inverse mapping)
index = 1                             # next unused id
data = Array{Int64}(undef, m, 4)
for i = 1:m
    team = gamedata[i, :team]
    opponent = gamedata[i, :opponent]
    tscore = gamedata[i, :teamscore]
    oscore = gamedata[i, :opponentscore]
    # Assign a fresh id to any team we have not seen before.
    for name in (team, opponent)
        if !haskey(teams_dict, name)
            push!(teams, name)
            teams_dict[name] = index
            global index += 1
        end
    end
    data[i, 1] = teams_dict[team]
    data[i, 2] = teams_dict[opponent]
    data[i, 3] = tscore
    data[i, 4] = oscore
end
In [5]:
index # next unused team id, i.e. number of distinct teams + 1
Out[5]:
791
In [6]:
# stochastic gradient descent works on a collection
# of functions, each of which provides their own gradient
# Stochastic gradient descent state: a collection of loss terms plus the
# current iterate.  Each f in `fs` maps a point x to (fval, gradient, Hessian).
mutable struct SGD
    fs::Vector{Function} # we will randomly sample data
    x::Vector{Float64}   # current point; concrete eltype avoids the type
                         # instability of the original abstract `Vector` field
end
# Full-batch gradient: the average of the gradients of every loss term,
# evaluated at the current point M.x.
function gradient(M::SGD) # return the Gradient
    N = length(M.fs)
    acc = similar(M.x)
    fill!(acc, zero(eltype(M.x)))
    for k in eachindex(M.fs)
        # second output of each loss term is its gradient
        acc .+= (1.0/N) * gradient(M, k)
    end
    return acc
end
# Full-batch Hessian: the mean of the per-term Hessians at the current point.
function Hessian(M::SGD)
    return mean([Hessian(M, k) for k in eachindex(M.fs)])
end
# Gradient of the i-th loss term at the current point.
function gradient(M::SGD, i::Int)
    fout = M.fs[i](M.x) # (fval, gradient, Hessian)
    return fout[2]
end
# Hessian of the i-th loss term at the current point.
function Hessian(M::SGD, i::Int)
    fout = M.fs[i](M.x) # (fval, gradient, Hessian)
    return fout[3]
end
# Full objective value: the average of all individual loss values at M.x.
function fval(M::SGD)
    total = 0.0
    N = length(M.fs)
    for k in eachindex(M.fs)
        total += (1.0/N) * M.fs[k](M.x)[1] # first output is the loss value
    end
    return total
end
## Setup least squares
"""
    sports_loss(x, i, j, s) -> (f, g, H)

Half squared-error loss `0.5*(x[i] - x[j] - s)^2` for one game between
teams `i` and `j` with observed score margin `s`, together with its dense
gradient `g` and sparse Hessian `H`, both taken with respect to the full
rating vector `x`.
"""
function sports_loss(x, i, j, s)
    n = length(x)
    resid = x[i] - x[j] - s
    f = 0.5 * resid^2
    grad = zeros(n)
    grad[i] = resid
    grad[j] = -resid
    # Hessian has the 2x2 pattern [1 -1; -1 1] on the (i,j) coordinates.
    hess = sparse([i, i, j, j], [i, j, i, j], [1.0, -1.0, -1.0, 1.0], n, n)
    return f, grad, hess
end
## Set up the optimization problem: one least-squares loss term per game,
## with team ratings starting at zero.
# Note: `map` over a range already returns a Vector, so the original
# `collect(map(...))` wrapper was redundant and has been removed.
opt = SGD(
    map(game_i -> # game i maps to loss function i
        (x -> sports_loss(x, data[game_i,1], data[game_i,2],
                data[game_i,3] - data[game_i,4])),
        1:size(data,1)),
    zeros(length(teams_dict))
)
##
g = gradient(opt) # full-batch gradient at the zero starting point
Out[6]:
790-element Vector{Float64}:
 -0.05525846702317291
  0.06809269162210337
 -0.026024955436720138
  0.026203208556149733
 -0.02584670231729055
  0.010695187165775406
 -0.040641711229946524
  0.015151515151515152
 -0.012299465240641709
  0.01158645276292335
  0.00802139037433155
  0.0033868092691622105
  0.007130124777183602
  ⋮
  0.0042780748663101605
 -0.00196078431372549
  0.0010695187165775401
  0.00089126559714795
  0.00409982174688057
  0.006060606060606061
  0.00196078431372549
  0.008556149732620321
  0.00035650623885918
 -0.00071301247771836
  0.01301247771836007
  0.00017825311942959
In [7]:
## Validate the gradient against an explicit incidence-matrix construction:
## B has +1 in the team column and -1 in the opponent column for each game,
## and p holds the score margins; the least-squares gradient at x = 0 is
## then (1/m) * B' * (-p).
m = size(data,1)
B = zeros(m, length(teams))
p = zeros(m)
for game = 1:m
    B[game, data[game,1]] = 1
    B[game, data[game,2]] = -1
    p[game] = data[game,3] - data[game,4]
end
gtest = 1/size(B,1)*B'*(-p)
norm(g-gtest) # should be ~machine epsilon
Out[7]:
9.711277220295703e-17
In [8]:
## Validate the Hessian
# The averaged per-game Hessian should equal B'B/m from the incidence matrix above.
norm(B'*B/size(B,1) - Hessian(opt))
Out[8]:
0.0
In [9]:
"""
    gradient_descent_step!(m::SGD, alpha=0.01)

Take one full-batch gradient-descent step with step size `alpha`, updating
`m.x` in place, and return the gradient that was used.

Fix: the original body computed the gradient but never applied it, despite
the mutating `!` in the name.  The added `alpha` parameter has a default,
so existing call sites remain valid.
"""
function gradient_descent_step!(m::SGD, alpha=0.01)
    g = gradient(m)
    m.x .-= alpha * g
    return g
end
Out[9]:
gradient_descent_step! (generic function with 1 method)
In [10]:
"""
    sgd_step!(m::SGD, alpha)

Take one stochastic gradient step: sample a single loss term uniformly at
random and move `m.x` against its gradient with step size `alpha`.
"""
function sgd_step!(m::SGD, alpha)
    term = rand(m.fs) # uniformly sampled loss term
    m.x .-= alpha * term(m.x)[2] # second output is the gradient
end

"""
    sgd_step!(m::SGD, S::Integer, alpha)

Minibatch variant: average the gradients of `S` loss terms sampled with
replacement, then apply a single step of size `alpha`.
"""
function sgd_step!(m::SGD, S::Integer, alpha)
    term = rand(m.fs)
    avg = alpha * term(m.x)[2] / S # running minibatch average, pre-scaled
    for _ in 2:S
        term = rand(m.fs)
        avg .+= alpha * term(m.x)[2] / S
    end
    m.x .-= avg
end

# Run plain SGD with a small fixed step size, recording the full-batch
# gradient norm and objective value every 100 steps.
norms = Float64[]
fvals = Float64[]
for outer_it = 1:100
    for _ = 1:100 # take 100 SGD steps between diagnostics
        sgd_step!(opt, 0.001) # step size chosen by hand — how to set alpha?
    end
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
In [11]:
# Convergence diagnostics: gradient norm (top) and objective value (bottom).
subplot_norm = plot(norms, label="", ylabel="norm(g)")
subplot_fval = plot(fvals, label="", ylabel="fval")
plot(subplot_norm, subplot_fval)
Out[11]:
In [12]:
# Repeat the SGD experiment with a 10x larger step size, restarting the
# iterate from zero.
norms = Float64[]
fvals = Float64[]
opt.x = zeros(length(opt.x))
for outer_it = 1:100
    for _ = 1:100 # diagnostics every 100 steps
        sgd_step!(opt, 0.01) # larger hand-tuned step size
    end
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
In [13]:
# Convergence diagnostics for the larger step size run.
subplot_norm = plot(norms, label="", ylabel="norm(g)")
subplot_fval = plot(fvals, label="", ylabel="fval")
plot(subplot_norm, subplot_fval)
Out[13]:
In [14]:
# Minibatch SGD: average 100 sampled gradients per step with a larger step
# size, again restarting from zero.
norms = Float64[]
fvals = Float64[]
opt.x = zeros(length(opt.x))
for outer_it = 1:100
    sgd_step!(opt, 100, 0.1) # batch of 100, alpha = 0.1 — how to set alpha?
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
In [15]:
# Convergence diagnostics for the minibatch run.
subplot_norm = plot(norms, label="", ylabel="norm(g)")
subplot_fval = plot(fvals, label="", ylabel="fval")
plot(subplot_norm, subplot_fval)
Out[15]:
In [16]:
# Full-batch gradient descent baseline for comparison against SGD.
norms = Float64[]
fvals = Float64[]
opt.x = zeros(length(opt.x))
for outer_it = 1:100
    g = gradient(opt)       # full-batch gradient
    opt.x .-= 0.01*g        # fixed step size
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
subplot_norm = plot(norms, label="", ylabel="norm(g)")
subplot_fval = plot(fvals, label="", ylabel="fval")
plot(subplot_norm, subplot_fval)
Out[16]:
In [ ]:
"""
    sn_step!(m::SGD, S::Integer, T::Integer)

Subsampled Newton step: estimate the Hessian from `T` and the gradient from
`S` loss terms sampled with replacement, then solve the damped Newton system
and update `m.x` in place.
"""
function sn_step!(m::SGD, S::Integer, T::Integer)
    nterms = length(m.fs)
    hess_sample = rand(1:nterms, T)
    sH = mean(i -> m.fs[i](m.x)[3], hess_sample)
    grad_sample = rand(1:nterms, S)
    sg = mean(i -> m.fs[i](m.x)[2], grad_sample)

    # Heuristic damping so the subsampled Hessian model is
    # well-defined in all coordinates.
    gamma = max(1/4 - T/(2*nterms*log(nterms)), 0)
    step = (sH + gamma*I) \ (-sg)

    m.x .+= step
end
##
# Subsampled Newton run: batches of 100 for both the gradient and Hessian
# estimates, starting the iterate from zero.
norms = Float64[]
fvals = Float64[]
opt.x = zeros(length(opt.x))
for outer_it = 1:100
    sn_step!(opt, 100, 100)
    push!(norms, norm(gradient(opt)))
    push!(fvals, fval(opt))
end
subplot_norm = plot(norms, label="", ylabel="norm(g)")
subplot_fval = plot(fvals, label="", ylabel="fval")
plot(subplot_norm, subplot_fval)
In [ ]:

In [ ]: