# generate some data as in ?bart examples:
# Mean function used to simulate the response.
# `x` is indexed by column, so it must be a matrix or data frame with at least
# five columns; only columns 1 to 5 contribute to the result.
# Returns one value per row of `x`.
f <- function(x) {
  interaction_term <- 10 * sin(pi * x[, 1] * x[, 2])
  quadratic_term <- 20 * (x[, 3] - 0.5)^2
  fourth_term <- 10 * x[, 4]
  fifth_term <- 5 * x[, 5]
  # Summed left to right, in the same order as the terms are defined.
  interaction_term + quadratic_term + fourth_term + fifth_term
}
# Simulate the predictors and a noisy continuous response (seeded so the
# draws are reproducible).
set.seed(99)
n <- 100      # number of observations
sigma <- 1.0  # standard deviation of the noise added to the response
# n x 10 matrix of independent uniform(0, 1) predictors
x <- matrix(runif(n * 10), nrow = n, ncol = 10)
# noise-free response, then the response with Gaussian noise added
Ey <- f(x)
y <- rnorm(n, mean = Ey, sd = sigma)
# Dichotomise the response: 1 where 'y' is above its mean, 0 otherwise.
y <- as.numeric(y > mean(y))
# Convert the predictors to a data frame and recode the first column as a
# two-level character variable ("high" when above the column mean, else "low").
x <- data.frame(x)
x[[1]] <- ifelse(x[[1]] > mean(x[[1]]), "high", "low")
head(x)
# Fit a BART model (the seed is reset immediately before fitting so the fit
# itself is reproducible).
set.seed(99)
bartFit <- bart(x, y, keeptrees = TRUE)
# Notice 10 variables (i.e. including the categorical one) in the predictor
# list:
summary(bartFit)
# Show the data object stored inside the fit:
bartFit$fit$data
# Notice that X1 (the categorical variable) is named here as X11 and X12
# (one for each category):
unlist(attr(bartFit$fit$data@x, "drop"))
# X11 X12 X2 X3 X4 X5 X6 X7 X8 X9 X10
# 52 48 0 0 0 0 0 0 0 0 0
# Attempt to compute variable importance with 'embarcadero'.
# This call is expected to fail with:
#   Error in data.frame(names, varimps) :
#     arguments imply differing number of rows: 9, 10
# It is wrapped in try() so that the error message is still reported, but a
# non-interactive run (Rscript / source()) carries on with the remaining
# steps instead of aborting here.
try(varimp(bartFit))
# The variable importance information is nevertheless present in the fit,
# including for the categorical variable (though it is also renamed here).
# Each row of 'varcount' is divided by that row's total.
rel_imp <- local({
  counts <- bartFit$varcount
  counts / rowSums(counts)
})
colnames(rel_imp)
# [1] "X1.low" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10"
# Attempt to simplify the model with 'embarcadero'.
# X1 (the categorical variable) is said to be dropped by 'dbarts', but it
# wasn't really -- it was dropped by 'embarcadero' when expecting
# unlist(attr(bartFit$fit$data@x, "drop")) to have the original variables'
# names.
variable.step(x, y)