ModelSanitizer exports the sanitize! function and the Model, Data, and
ForceSanitize structs.
If your model is stored in m and your data are stored in x1,
x2, x3, etc. then you can sanitize your model with:
sanitize!(Model(m), Data(x1), Data(x2), Data(x3), ...)This will recursively search inside the model m for anything that resembles
your data and will delete the data that it finds.
If you happen to know exactly where inside a model the data are stored, you
can explicitly tell ModelSanitizer to delete those data. If your model is
stored in m, and you know that the fields m.x1, m.x2, m.x3, etc. contain
data that needs to be removed, you can force ModelSanitizer to delete those
data with:
sanitize!(ForceSanitize(m.x1), ForceSanitize(m.x2), ForceSanitize(m.x3), ...)julia> using ModelSanitizer
julia> using Statistics
julia> using Test
julia> mutable struct LinearModel{T}
           X::Matrix{T}
           y::Vector{T}
           beta::Vector{T}
           function LinearModel{T}()::LinearModel{T} where T
               m::LinearModel{T} = new()
               return m
           end
       end
julia> function fit!(m::LinearModel{T}, X::Matrix{T}, y::Vector{T})::LinearModel{T} where T
           m.X = deepcopy(X)
           m.y = deepcopy(y)
           m.beta = beta = (m.X'm.X)\(m.X'm.y)
           return m
       end
fit! (generic function with 1 method)
julia> function predict(m::LinearModel{T}, X::Matrix{T})::Vector{T} where T
           y_hat::Vector{T} = X * m.beta
           return y_hat
       end
predict (generic function with 1 method)
julia> function predict(m::LinearModel{T})::Vector{T} where T
           X::Matrix{T} = m.X
           y_hat::Vector{T} = predict(m, X)
           return y_hat
       end
predict (generic function with 2 methods)
julia> function mse(y::Vector{T}, y_hat::Vector{T})::T where T
           _mse::T = mean((y .- y_hat).^2)
           return _mse
       end
mse (generic function with 1 method)
julia> function mse(m::LinearModel{T}, X::Matrix{T}, y::Vector{T})::T where T
           y_hat::Vector{T} = predict(m, X)
           _mse::T = mse(y, y_hat)
           return _mse
       end
mse (generic function with 2 methods)
julia> function mse(m::LinearModel{T})::T where T
           X::Matrix{T} = m.X
           y::Vector{T} = m.y
           _mse::T = mse(m, X, y)
           return _mse
       end
mse (generic function with 3 methods)
julia> rmse(varargs...) = sqrt(mse(varargs...))
rmse (generic function with 1 method)
julia> function r2(y::Vector{T}, y_hat::Vector{T})::T where T
           y_bar::T = mean(y)
           SS_tot::T = sum((y .- y_bar).^2)
           SS_res::T = sum((y .- y_hat).^2)
           _r2::T = 1 - SS_res/SS_tot
           return _r2
       end
r2 (generic function with 1 method)
julia> function r2(m::LinearModel{T}, X::Matrix{T}, y::Vector{T})::T where T
           y_hat::Vector{T} = predict(m, X)
           _r2::T = r2(y, y_hat)
           return _r2
       end
r2 (generic function with 2 methods)
julia> function r2(m::LinearModel{T})::T where T
           X::Matrix{T} = m.X
           y::Vector{T} = m.y
           _r2::T = r2(m, X, y)
           return _r2
       end
r2 (generic function with 3 methods)
julia> X = randn(Float64, 5_000, 14)
5000×14 Array{Float64,2}:
  0.0956436    0.481324   -0.796437  …  -2.26483     1.57243    -1.65105
 -0.306527    -0.880146   -0.764714     -0.182449   -0.0767462  -0.939232
 -0.223116    -0.408068    0.728855      0.220045    0.785533    0.49013
 -0.336363     1.46187    -1.17633      -0.955872    0.699277    0.587961
  0.628275     0.208697   -0.522714      0.116233    0.47314     0.435968
 -0.12303     -0.964061    0.919518  …  -0.0230613  -1.12379    -0.439892
  1.06664      0.96542    -0.250164     -0.776266    1.70851    -1.08608
  0.957151     0.850486    1.31718       0.497219    1.01069    -0.558217
 -0.206168    -0.608305   -0.864631      0.969031    0.209796    1.28718
 -0.658039     1.20687     1.33288       1.54847     0.546286   -1.00404
 -0.598782    -0.193289    0.673134  …  -1.59742     0.410881   -1.61342
  0.31442      0.0199012   0.50533       1.0889     -0.0713841  -1.29933
  0.236585    -1.09804     0.945631     -0.729247   -1.10004    -0.339332
  0.122913     0.619345   -2.90947       1.09613    -0.662693   -1.03469
  1.52615      0.942471    0.262139      0.223064    0.665103    1.4081
 -0.474543     1.9466     -0.408505  …   1.01626    -0.297397   -0.0953909
  0.73664     -0.0796424  -1.84864       1.15935     0.0164378   1.32191
  0.24588      0.271068   -0.238212      0.596475    1.52617    -0.747777
  ⋮                                  ⋱
 -1.07141      0.194049   -0.350011     -0.666195    0.481406   -0.451329
 -0.00993413   0.33006    -0.985443     -0.0395822   2.36983    -0.793007
  0.610014    -0.509744   -1.06447   …   1.19769     1.129       0.397217
  0.785654    -0.361031    0.314127      0.192215    0.789262    0.725731
  0.258588    -2.06379     0.511611      0.0963516  -1.01919    -0.540021
  0.48671     -0.918205    0.264124      0.989929    2.45245    -1.39545
 -1.27085     -0.0617834   2.59491       0.291602    1.28642     0.236496
  1.4044      -1.24472    -0.205029  …   1.99366    -1.58951     0.963728
 -1.07691      0.44178    -0.602841      0.584759   -0.887116    1.36514
  1.13586      0.954756    0.44016      -2.21191    -1.14086    -0.585916
 -0.763031    -1.13348    -1.46696      -1.4121     -0.977694   -0.618883
  0.875367    -1.30925     0.183117      0.224709    0.0752964  -0.92173
  0.659502     0.71971    -1.05538   …  -0.912277   -0.736332    1.01404
 -0.809941     2.02362     1.29668       0.113623   -0.858281    0.0863472
 -1.6409       0.310551   -0.235102     -1.11232    -0.170224    0.404804
 -0.367908    -1.9062      0.245953     -0.751821   -0.794633    0.00894607
  0.380897     2.30871    -0.669909      0.282513   -0.114725   -0.253537
julia> y = X * randn(Float64, 14) + randn(5_000)
5000-element Array{Float64,1}:
 -4.418867382994752
  1.0721553534178543
  2.210545604666476
 -2.5053994409702094
  2.24399399066432
  0.5993702994926247
  2.2040361967638322
 -2.4902628750358193
  4.184644001244288
  1.7688752332135804
 -4.831550352023476
 -1.068149084362266
 -0.746260929030723
  0.032933800577055417
  2.878202216460962
  2.773804353610833
  1.0288912118472482
  3.7799578982964963
  ⋮
  3.1797791441997822
  5.830717537973503
 -0.8191545280972992
  4.649281267724443
  0.9470989605451162
  5.733118456044454
  3.057352206232011
  4.791267454465988
 -4.604222639675081
 -5.755448165821573
 -0.9804279159155482
  2.2904285226467276
  2.809999802793834
  0.7773010780323945
 -2.5205742651574
  3.8866539005621092
 -4.085889556008112
julia> m = LinearModel{Float64}()
LinearModel{Float64}(#undef, #undef, #undef)
julia> testing_rows = 1:2:5_000
1:2:4999
julia> training_rows = setdiff(1:5_000, testing_rows)
2500-element Array{Int64,1}:
    2
    4
    6
    8
   10
   12
   14
   16
   18
   20
   22
   24
   26
   28
   30
   32
   34
   36
    ⋮
 4968
 4970
 4972
 4974
 4976
 4978
 4980
 4982
 4984
 4986
 4988
 4990
 4992
 4994
 4996
 4998
 5000
julia> fit!(m, X[training_rows, :], y[training_rows])
LinearModel{Float64}([-0.306527 -0.880146 … -0.0767462 -0.939232; -0.336363 1.46187 … 0.699277 0.587961; … ; -1.6409 0.310551 … -0.170224 0.404804; 0.380897 2.30871 … -0.114725 -0.253537], [1.07216, -2.5054, 0.59937, -2.49026, 1.76888, -1.06815, 0.0329338, 2.7738, 3.77996, -4.06727  …  2.81088, 3.17978, -0.819155, 0.947099, 3.05735, -4.60422, -0.980428, 2.81, -2.52057, -4.08589], [-0.532213, -1.16489, -0.414974, -0.562536, -0.440432, 0.732505, -1.06754, 0.399485, -0.67281, -1.44599, 0.835625, 0.426459, 1.20088, 0.754435])
julia> @test m.X == X[training_rows, :]
Test Passed
julia> @test m.y == y[training_rows]
Test Passed
julia> @test all(m.X .== X[training_rows, :])
Test Passed
julia> @test all(m.y .== y[training_rows])
Test Passed
julia> @test !all(m.X .== 0)
Test Passed
julia> @test !all(m.y .== 0)
Test Passed
julia> # before sanitization, we can make predictions
       predict(m, X[testing_rows, :])
2500-element Array{Float64,1}:
 -4.513253714187381
  2.5689035333536605
  0.9939782906365846
  1.2513894159362184
  3.2007086601687353
 -5.387968774216589
 -0.1767892797746935
  3.4408813711668165
  0.4625821018811823
  1.649129884116436
 -0.8620887900500149
  0.6504970487658756
  4.287913533796443
 -2.5014166099065136
  1.1666979326633855
  0.2723098985354143
  3.2783930370766634
  2.250636815003683
  ⋮
  1.1999638265752477
  3.8377489399901084
  4.2805489451765935
 -0.5849048693472063
 -0.6574890049656816
  0.2606368302418087
 -4.197310605534758
 -3.5805273324146336
 -0.5244747588662737
  5.274904154193373
  2.7742388165636953
  5.883741172337488
  2.118699747786167
 -4.209943069147431
  2.262361580682631
 -0.5044151513387216
  4.443422779093501
julia> predict(m, X[training_rows, :])
2500-element Array{Float64,1}:
  2.943212508610099
 -0.8226863248850258
  1.031068845178503
 -3.3178919274576053
  0.587046578244962
 -0.032251634503744686
  1.9123819046207888
  3.555603804394087
  2.1728937544760307
 -1.9319447549669504
 -0.7592148524301295
 -7.250437603426189
  4.982277986708986
 -1.8660967909674548
  0.29423182806971415
  0.593840341165224
 -0.26314562641917977
  1.4340414682799685
  ⋮
  1.6038174714835796
  1.3091787016871341
  4.936123830680592
  1.9812183495287048
 -0.848632475032059
  3.1553721781769157
 -5.412240178264108
  1.406559298117795
  3.6433312336276646
  0.3408165307792135
  0.2882242203753349
  1.8120206189755343
 -3.299798877655878
 -0.8793971451160698
  2.3158119962568886
 -2.4598360012327265
 -4.810128269819875
julia> @show mse(m, X[training_rows, :], y[training_rows])
mse(m, X[training_rows, :], y[training_rows]) = 0.9856973993855034
0.9856973993855034
julia> @show rmse(m, X[training_rows, :], y[training_rows])
rmse(m, X[training_rows, :], y[training_rows]) = 0.9928229446308658
0.9928229446308658
julia> @show r2(m, X[training_rows, :], y[training_rows])
r2(m, X[training_rows, :], y[training_rows]) = 0.9044357103305194
0.9044357103305194
julia> @show mse(m, X[testing_rows, :], y[testing_rows])
mse(m, X[testing_rows, :], y[testing_rows]) = 0.9480778102674918
0.9480778102674918
julia> @show rmse(m, X[testing_rows, :], y[testing_rows])
rmse(m, X[testing_rows, :], y[testing_rows]) = 0.9736928726592856
0.9736928726592856
julia> @show r2(m, X[testing_rows, :], y[testing_rows])
r2(m, X[testing_rows, :], y[testing_rows]) = 0.9088387716983182
0.9088387716983182
julia> sanitize!(Model(m), Data(X), Data(y)) # sanitize the model with ModelSanitizer
Model{LinearModel{Float64}}(LinearModel{Float64}([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [-0.532213, -1.16489, -0.414974, -0.562536, -0.440432, 0.732505, -1.06754, 0.399485, -0.67281, -1.44599, 0.835625, 0.426459, 1.20088, 0.754435]))
julia> @test m.X != X[training_rows, :]
Test Passed
julia> @test m.y != y[training_rows]
Test Passed
julia> @test !all(m.X .== X[training_rows, :])
Test Passed
julia> @test !all(m.y .== y[training_rows])
Test Passed
julia> @test all(m.X .== 0)
Test Passed
julia> @test all(m.y .== 0)
Test Passed
julia> # after sanitization, we are still able to make predictions
       predict(m, X[testing_rows, :])
2500-element Array{Float64,1}:
 -4.513253714187381
  2.5689035333536605
  0.9939782906365846
  1.2513894159362184
  3.2007086601687353
 -5.387968774216589
 -0.1767892797746935
  3.4408813711668165
  0.4625821018811823
  1.649129884116436
 -0.8620887900500149
  0.6504970487658756
  4.287913533796443
 -2.5014166099065136
  1.1666979326633855
  0.2723098985354143
  3.2783930370766634
  2.250636815003683
  ⋮
  1.1999638265752477
  3.8377489399901084
  4.2805489451765935
 -0.5849048693472063
 -0.6574890049656816
  0.2606368302418087
 -4.197310605534758
 -3.5805273324146336
 -0.5244747588662737
  5.274904154193373
  2.7742388165636953
  5.883741172337488
  2.118699747786167
 -4.209943069147431
  2.262361580682631
 -0.5044151513387216
  4.443422779093501
julia> predict(m, X[training_rows, :])
2500-element Array{Float64,1}:
  2.943212508610099
 -0.8226863248850258
  1.031068845178503
 -3.3178919274576053
  0.587046578244962
 -0.032251634503744686
  1.9123819046207888
  3.555603804394087
  2.1728937544760307
 -1.9319447549669504
 -0.7592148524301295
 -7.250437603426189
  4.982277986708986
 -1.8660967909674548
  0.29423182806971415
  0.593840341165224
 -0.26314562641917977
  1.4340414682799685
  ⋮
  1.6038174714835796
  1.3091787016871341
  4.936123830680592
  1.9812183495287048
 -0.848632475032059
  3.1553721781769157
 -5.412240178264108
  1.406559298117795
  3.6433312336276646
  0.3408165307792135
  0.2882242203753349
  1.8120206189755343
 -3.299798877655878
 -0.8793971451160698
  2.3158119962568886
 -2.4598360012327265
 -4.810128269819875
julia> @show mse(m, X[training_rows, :], y[training_rows])
mse(m, X[training_rows, :], y[training_rows]) = 0.9856973993855034
0.9856973993855034
julia> @show rmse(m, X[training_rows, :], y[training_rows])
rmse(m, X[training_rows, :], y[training_rows]) = 0.9928229446308658
0.9928229446308658
julia> @show r2(m, X[training_rows, :], y[training_rows])
r2(m, X[training_rows, :], y[training_rows]) = 0.9044357103305194
0.9044357103305194
julia> @show mse(m, X[testing_rows, :], y[testing_rows])
mse(m, X[testing_rows, :], y[testing_rows]) = 0.9480778102674918
0.9480778102674918
julia> @show rmse(m, X[testing_rows, :], y[testing_rows])
rmse(m, X[testing_rows, :], y[testing_rows]) = 0.9736928726592856
0.9736928726592856
julia> @show r2(m, X[testing_rows, :], y[testing_rows])
r2(m, X[testing_rows, :], y[testing_rows]) = 0.9088387716983182
0.9088387716983182
julia> # if you know exactly where the data are stored inside the model, you can
       # directly delete them with ForceSanitize:
       sanitize!(ForceSanitize(m.X), ForceSanitize(m.y))
(ForceSanitize{Array{Float64,2}}([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]), ForceSanitize{Array{Float64,1}}([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
julia> # we can still make predictions even after using ForceSanitize
       predict(m, X[testing_rows, :])
2500-element Array{Float64,1}:
 -4.513253714187381
  2.5689035333536605
  0.9939782906365846
  1.2513894159362184
  3.2007086601687353
 -5.387968774216589
 -0.1767892797746935
  3.4408813711668165
  0.4625821018811823
  1.649129884116436
 -0.8620887900500149
  0.6504970487658756
  4.287913533796443
 -2.5014166099065136
  1.1666979326633855
  0.2723098985354143
  3.2783930370766634
  2.250636815003683
  ⋮
  1.1999638265752477
  3.8377489399901084
  4.2805489451765935
 -0.5849048693472063
 -0.6574890049656816
  0.2606368302418087
 -4.197310605534758
 -3.5805273324146336
 -0.5244747588662737
  5.274904154193373
  2.7742388165636953
  5.883741172337488
  2.118699747786167
 -4.209943069147431
  2.262361580682631
 -0.5044151513387216
  4.443422779093501
julia> predict(m, X[training_rows, :])
2500-element Array{Float64,1}:
  2.943212508610099
 -0.8226863248850258
  1.031068845178503
 -3.3178919274576053
  0.587046578244962
 -0.032251634503744686
  1.9123819046207888
  3.555603804394087
  2.1728937544760307
 -1.9319447549669504
 -0.7592148524301295
 -7.250437603426189
  4.982277986708986
 -1.8660967909674548
  0.29423182806971415
  0.593840341165224
 -0.26314562641917977
  1.4340414682799685
  ⋮
  1.6038174714835796
  1.3091787016871341
  4.936123830680592
  1.9812183495287048
 -0.848632475032059
  3.1553721781769157
 -5.412240178264108
  1.406559298117795
  3.6433312336276646
  0.3408165307792135
  0.2882242203753349
  1.8120206189755343
 -3.299798877655878
 -0.8793971451160698
  2.3158119962568886
 -2.4598360012327265
 -4.810128269819875
julia> @show mse(m, X[training_rows, :], y[training_rows])
mse(m, X[training_rows, :], y[training_rows]) = 0.9856973993855034
0.9856973993855034
julia> @show rmse(m, X[training_rows, :], y[training_rows])
rmse(m, X[training_rows, :], y[training_rows]) = 0.9928229446308658
0.9928229446308658
julia> @show r2(m, X[training_rows, :], y[training_rows])
r2(m, X[training_rows, :], y[training_rows]) = 0.9044357103305194
0.9044357103305194
julia> @show mse(m, X[testing_rows, :], y[testing_rows])
mse(m, X[testing_rows, :], y[testing_rows]) = 0.9480778102674918
0.9480778102674918
julia> @show rmse(m, X[testing_rows, :], y[testing_rows])
rmse(m, X[testing_rows, :], y[testing_rows]) = 0.9736928726592856
0.9736928726592856
julia> @show r2(m, X[testing_rows, :], y[testing_rows])
r2(m, X[testing_rows, :], y[testing_rows]) = 0.9088387716983182
0.9088387716983182