Hi,
I am fitting a random forest using this data file: https://www.openml.org/d/179
My code has worked on many smaller data files, but this is the largest data file I have tried so far. Do you know whether ShapML has a limit on the number of rows it can handle?
# Prediction wrapper that is passed to `ShapML.shap` as `predict_function`.
#
# Arguments:
#   model - fitted MLJ machine to predict with
#   data  - table of rows to predict on
# Returns a one-column DataFrame named `y_pred` holding, for every row, the mode
# of the predicted distribution converted to `Int64`.
#
# The macro must be spelled `@everywhere` (lowercase): macro names are
# case-sensitive, and this is what defines the function on all worker processes.
@everywhere function predict_function_mode(model, data)
    # One predictive distribution per row of `data`.
    ŷ = MLJ.predict(model, data)
    # Collapse each distribution to its most probable class as a plain integer.
    ŷMode = [convert(Int64, mode(dist)) for dist in ŷ]
    data_pred = DataFrame(y_pred = ŷMode)
    return data_pred
end # predict_function_mode
# Build, on every process, a composite model type `RandomForestClassifierPipe`
# and bind an instance of it to `pipeRandomForestClassifier`. The pipeline has
# three named stages: `selector` (its `features` field is what gets tuned
# further down), `hot` (presumably one-hot encodes categorical columns; confirm
# against the MLJ docs) and `tree` (the random forest itself).
# `prediction_type = :probabilistic` declares the pipeline as a probabilistic
# predictor.
@everywhere pipeRandomForestClassifier = @pipeline RandomForestClassifierPipe(
selector = FeatureSelector(),
hot = OneHotEncoder(),
tree = RandomForestClassifier()) prediction_type = :probabilistic
# Candidate feature sets for the selector: the first 1, 2, ..., ncol(X) columns
# of X, each given as a vector of column-name Symbols.
cases = [Symbol.(names(X)[1:i]) for i in 1:ncol(X)]

# Search range over `selector.features`, one grid point per candidate set.
r1 = range(pipeRandomForestClassifier, :(selector.features); values = cases)

# Wrap the pipeline in a self-tuning model evaluated with 5-fold CV.
tmRandomForestClassifier = TunedModel(;
    model = pipeRandomForestClassifier,
    range = r1,
    measures = [cross_entropy, BrierScore()],
    resampling = CV(; nfolds = 5),
)

# Bind the tuned model to the data (first column of y as a categorical target)
# and fit it.
mtm = machine(tmRandomForestClassifier, setScientificTypes!(X), categorical(y[:, 1]))
Base.invokelatest(MLJ.fit!, mtm)
predictor = predict_function_mode

# Explain the first half of the rows (row count rounded to the nearest integer).
r = round(Int, nrow(X) / 2)

# Compute Shapley feature-level predictions for these rows. Row-indexing a
# DataFrame already allocates a new frame, so no extra `copy` is needed.
# NOTE(review): the reported OutOfMemoryError is raised while `predict` converts
# the frame it receives from ShapML into a dense matrix. If that frame grows with
# nrow(explain) * sample_size (TODO confirm against the ShapML documentation),
# explaining fewer rows is the first thing to try on large data sets.
explain = X[1:r, :]

# An optional reference population to compute the baseline prediction.
reference = copy(X)

# Number of Monte Carlo samples for Shapley.
sample_size = 10

println("Computing Shapley Effect of Random Forest")
dataShap = ShapML.shap(;
    explain = explain,
    reference = reference,
    model = mtm,
    predict_function = predictor,
    sample_size = sample_size,
    parallel = :samples, # Parallel computation over "sample_size"
    seed = 20200628,
)
Worker 5 terminated.
ERROR: OutOfMemoryError()Worker 2 terminated.
Stacktrace:
[1] Worker 4 terminated.Array{Float64,2}
(::Worker 3 terminated.UndefInitializer
, ::Int64, ::Int64) at .\boot.jl:407
[2] matrix(::DataFrame; transpose::Bool) at C:\Users\BCP.julia\packages\Tables\okt7x\src\matrix.jl:73
[3] matrix at C:\Users\BCP.julia\packages\Tables\okt7x\src\matrix.jl:68 [inlined]
[4] #matrix#11 at C:\Users\BCP.julia\packages\MLJBase\O5b6j\src\interface\data_utils.jl:9 [inlined]
[5] matrix at C:\Users\BCP.julia\packages\MLJBase\O5b6j\src\interface\data_utils.jl:9 [inlined]
[6] matrix(::DataFrame; kw::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{,Tuple{}}}) at C:\Users\BCP.julia\packages\MLJModelInterface\aA1k2\src\data_utils.jl:27
[7] matrix at C:\Users\BCP.julia\packages\MLJModelInterface\aA1k2\src\data_utils.jl:27 [inlined]
[8] predict(::RandomForestClassifier, ::Tuple{DecisionTree.Ensemble{Float64,UInt32},CategoricalArray{Int64,1,UInt32,Int64,CategoricalValue{Int64,UInt32},Union{}},Array{UInt32,1}}, ::DataFrame) at C:\Users\BCP.julia\packages\MLJModels\i4XcU\src\DecisionTree.jl:188
[9] predict(::NodalMachine{RandomForestClassifier}, ::DataFrame) at C:\Users\BCP.julia\packages\MLJBase\O5b6j\src\operations.jl:29
[10] (::Node{NodalMachine{RandomForestClassifier}})(::DataFrame) at C:\Users\BCP.julia\packages\MLJBase\O5b6j\src\composition\networks.jl:302
[11] predict(::RandomForestClassifierPipe, ::Node{NodalMachine{RandomForestClassifier}}, ::DataFrame) at C:\Users\BCP.julia\packages\MLJBase\O5b6j\src\composition\composites.jl:45
[12] predict(::Machine{RandomForestClassifierPipe}, ::DataFrame) at C:\Users\BCP.julia\packages\MLJBase\O5b6j\src\operations.jl:29
[13] predict(::MLJTuning.ProbabilisticTunedModel{Grid,RandomForestClassifierPipe,CPU1{Nothing},CPU1{Nothing}}, ::Machine{RandomForestClassifierPipe}, ::DataFrame) at C:\Users\BCP.julia\packages\MLJTuning\JZ7ZX\src\tuned_models.jl:597
[14] predict(::Machine{MLJTuning.ProbabilisticTunedModel{Grid,RandomForestClassifierPipe,CPU1{Nothing},CPU1{Nothing}}}, ::DataFrame) at C:\Users\BCP.julia\packages\MLJBase\O5b6j\src\operations.jl:29
[15] predict_function_mode(::Machine{MLJTuning.ProbabilisticTunedModel{Grid,RandomForestClassifierPipe,CPU1{Nothing},CPU1{Nothing}}}, ::DataFrame) at C:\Users\BCP\github\ICP\GetBlankets.jl:180
[16] _shap_sample(::DataFrame, ::DataFrame, ::Int64, ::Int64, ::Int64, ::Int64, ::Array{String,1}, ::Array{String,1}, ::Array{Symbol,1}, ::Int64, ::Symbol, ::Array{Int64,1}, ::Bool, ::Machine{MLJTuning.ProbabilisticTunedModel{Grid,RandomForestClassifierPipe,CPU1{Nothing},CPU1{Nothing}}}, ::typeof(predict_function_mode), ::Nothing) at C:\Users\BCP.julia\packages\ShapML\sceNA\src\shap_sample.jl:97
[17] shap(; explain::DataFrame, reference::DataFrame, model::Machine{MLJTuning.ProbabilisticTunedModel{Grid,RandomForestClassifierPipe,CPU1{Nothing},CPU1{Nothing}}}, predict_function::Function, target_features::Nothing, sample_size::Int64, parallel::Nothing, seed::Int64, precision::Nothing, chunk::Bool) at C:\Users\BCP.julia\packages\ShapML\sceNA\src\ShapML.jl:117
[18] trainRandomForest(::DataFrame, ::DataFrame) at C:\Users\BCP\github\ICP\GetBlankets.jl:248
[19] selectRandomForest(::DataFrame, ::DataFrame, ::Int64) at C:\Users\BCP\github\ICP\GetBlankets.jl:155
[20] getBlanketRandomForest(::DataFrame, ::DataFrame, ::String, ::Int64) at C:\Users\BCP\github\ICP\GetBlankets.jl:135
[21] getBlanketRandomForest at C:\Users\BCP\github\ICP\GetBlankets.jl:129 [inlined]
[22] ForestInvariantCausalPrediction(::DataFrame, ::DataFrame, ::DataFrame; α::Float64, selection::String, verbose::Bool) at C:\Users\BCP\github\ICP\InvariantCausalPrediction.jl:305
[23] top-level scope at REPL[10]:1
caused by [exception 1]
ProcessExitedException(2)
Stacktrace:
[1] (::Base.var"#726#728")(::Task) at .\asyncmap.jl:178
[2] foreach(::Base.var"#726#728", ::Array{Any,1}) at .\abstractarray.jl:1919
[3] maptwice(::Function, ::Channel{Any}, ::Array{Any,1}, ::UnitRange{Int64}) at .\asyncmap.jl:178
[4] wrap_n_exec_twice(::Channel{Any}, ::Array{Any,1}, ::Distributed.var"#204#207"{WorkerPool}, ::Function, ::UnitRange{Int64}) at .\asyncmap.jl:154
[5] async_usemap(::Distributed.var"#188#190"{Distributed.var"#188#189#191"{WorkerPool,ShapML.var"#13#15"{DataFrame,Machine{MLJTuning.ProbabilisticTunedModel{Grid,RandomForestClassifierPipe,CPU1{Nothing},CPU1{Nothing}}},typeof(predict_function_mode),Int64,Nothing,Bool,Array{String,1},Array{Symbol,1},Int64,Int64,Int64,Int64,Array{Int64,1}}}}, ::UnitRange{Int64}; ntasks::Function, batch_size::Nothing) at .\asyncmap.jl:103
[6] #asyncmap#710 at .\asyncmap.jl:81 [inlined]
[7] pmap(::Function, ::WorkerPool, ::UnitRange{Int64}; distributed::Bool, batch_size::Int64, on_error::Nothing, retry_delays::Array{Any,1}, retry_check::Nothing) at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.4\Distributed\src\pmap.jl:126
[8] pmap(::Function, ::WorkerPool, ::UnitRange{Int64}) at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.4\Distributed\src\pmap.jl:101
[9] pmap(::Function, ::UnitRange{Int64}; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{,Tuple{}}}) at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.4\Distributed\src\pmap.jl:156
[10] pmap at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.4\Distributed\src\pmap.jl:156 [inlined]
[11] shap(; explain::DataFrame, reference::DataFrame, model::Machine{MLJTuning.ProbabilisticTunedModel{Grid,RandomForestClassifierPipe,CPU1{Nothing},CPU1{Nothing}}}, predict_function::Function, target_features::Nothing, sample_size::Int64, parallel::Symbol, seed::Int64, precision::Nothing, chunk::Bool) at C:\Users\BCP.julia\packages\ShapML\sceNA\src\ShapML.jl:137
[12] trainRandomForest(::DataFrame, ::DataFrame) at C:\Users\BCP\github\ICP\GetBlankets.jl:238
[13] selectRandomForest(::DataFrame, ::DataFrame, ::Int64) at C:\Users\BCP\github\ICP\GetBlankets.jl:155
[14] getBlanketRandomForest(::DataFrame, ::DataFrame, ::String, ::Int64) at C:\Users\BCP\github\ICP\GetBlankets.jl:135
[15] getBlanketRandomForest at C:\Users\BCP\github\ICP\GetBlankets.jl:129 [inlined]
[16] ForestInvariantCausalPrediction(::DataFrame, ::DataFrame, ::DataFrame; α::Float64, selection::String, verbose::Bool) at C:\Users\BCP\github\ICP\InvariantCausalPrediction.jl:305
[17] top-level scope at REPL[10]:1