A RandomDiscrete search criteria with initial setting of max_models > 25 for a glm causes h2o.grid search to hang

Description

library(h2o)

# Simulate a regression data set with mlbench's Friedman 1 generator.
#
# Args:
#   n: number of observations, passed straight to mlbench.friedman1().
#
# Returns:
#   A data.frame holding the generated predictor columns followed by the
#   response, which is the last column and is named "y".
sim_data <- function(n) {
  sim <- mlbench::mlbench.friedman1(n, sd = 1)
  # Bind predictors and response side by side so the response ends up last.
  out <- as.data.frame(cbind(sim$x, sim$y))
  names(out)[ncol(out)] <- "y"
  out
}

# Small simulated data set used for the reproduction.
small_dat <- sim_data(100)

# Nested resampling: 2-fold CV repeated twice on the outside, 25 bootstrap
# resamples on the inside. Only h2o is attached via library(), so the rsample
# helpers must be namespace-qualified.
ncv_dat_2 <- rsample::nested_cv(
  small_dat,
  outside = rsample::vfold_cv(v = 2, repeats = 2),
  inside = rsample::bootstraps(times = 25)
)

# A single inner split is enough to reproduce the problem.
dat <- ncv_dat_2$inner_resamples[[1]]$splits[[1]]

# 200 candidate (mixture, penalty) values from a Latin hypercube design.
# The dials parameter constructors are qualified for the same reason as above.
glm_params <- dials::grid_latin_hypercube(
  dials::mixture(),
  dials::penalty(),
  size = 200
)

alg <- "glm"
# The mixture column is supplied as alpha and the penalty column as lambda.
params <- list(alpha = glm_params$mixture, lambda = glm_params$penalty)
# Random search capped at 200 models; the ticket title reports that
# max_models > 25 triggers the hang.
strat <- list(strategy = "RandomDiscrete", max_models = 200, seed = 1)

h2o.init()

# Split into analysis and assessment sets.
anal_df <- rsample::analysis(dat)
ass_df <- rsample::assessment(dat)

# as.h2o and h2o.grid both have progress bars; that is one too many, so
# silence them while the data is uploaded.
h2o.no_progress()

# Send the data to the h2o cluster.
anal_h2o <- as.h2o(anal_df)
ass_h2o <- as.h2o(ass_df)

# The response is the last column; every other column is a predictor.
y <- names(anal_h2o)[[ncol(anal_h2o)]]
x <- setdiff(names(anal_h2o), y)

# Re-enable the progress bar for the grid search itself.
h2o.show_progress()

# Need a unique grid id or h2o just gives you the same predictions over and
# over.
gridId <- as.character(dqrng::dqrnorm(1))

# This is the call that hangs.
# NOTE(review): parallelism = 0 presumably asks h2o to choose the level of
# parallel model building itself -- confirm against the h2o.grid docs.
mod_grid <- h2o.grid(
  alg,
  x = x,
  y = y,
  grid_id = gridId,
  training_frame = anal_h2o,
  validation_frame = ass_h2o,
  hyper_params = params,
  search_criteria = strat,
  parallelism = 0
)

Assignee

New H2O Bugs

Fix versions

None

Reporter

josh willingham

Support ticket URL

None

Labels

Affected Spark version

None

Customer Request Type

None

Task progress

None

ReleaseNotesHidden

None

CustomerVisible

No

Components

Affects versions

Priority

Major
Configure