Tune, and evaluate, species distribution models


  out_dir = FALSE,
  return_val = "path",
  algo = c("all", "maxnet", "bioclim", "envelope", "rf"),
  fc = "auto_feature",
  limit_p = FALSE,
  rm = seq(1, 6, 0.5),
  trees = c(500, 1000, 2000),
  mtry = TRUE,
  limit_spat_mtry = 4,
  nodesize = c(1, 2),
  keep_model = FALSE,
  best_run = FALSE,
  metrics_df = envSDM::sdm_metrics,
  use_metrics = c("auc_po", "CBI_rescale", "IMAE"),
  do_gc = TRUE,
  force_new = FALSE,



Character or named list. If character, the path to an existing prep.rds. Otherwise, the result of a call to prep_sdm with return_val = "object"


FALSE or character. If FALSE the result of tune_sdm will be saved to a temporary folder. If character, a file 'tune.rds' will be created at the path defined by out_dir.


Character: "object" or "path". Both return a named list. In the case of "path" the named list is simply list(tune = out_dir). Will be set to "object" if out_dir is FALSE.


Character. Name of algorithm to use.


Character. Used to generate levels of classes argument to maxnet::maxnet() that are tuned.


TRUE, FALSE or number of predictor variables above which to limit the use of p in the classes argument used in maxnet::maxnet(). Useful with many predictor variables when it becomes unwieldy to generate interactions for all predictors.


Numeric. Used to generate levels of regmult argument to maxnet::maxnet() that are tuned.


Used to generate the levels of ntree argument to randomForest::randomForest() that are tuned. TRUE (tune with default trees), FALSE (don't tune trees) or numeric (the trees values to tune with).


Used to generate the levels of mtry argument to randomForest::randomForest() that are tuned. TRUE (tune with sensible guesses for mtry), FALSE (only use default randomForest::randomForest() mtry) or numeric (the mtry values to tune with).


Numeric. If mtry is TRUE and if using spatial cross validation, the values of mtry to tune will be limited to less than or equal to limit_spat_mtry.


Used to generate the levels of nodesize argument to randomForest::randomForest() that are tuned. TRUE (tune with default nodesize), FALSE (only use default randomForest::randomForest() nodesize) or numeric (the nodesize values to tune with).


Logical. If TRUE the model results will be appended as a list column in the returned tibble (as column m)


Logical. If TRUE, tune_sdm() does not tune: all blocks are set to the same value, so no cross-validation is performed.


Dataframe. Defines which metrics to use when deciding on 'good' SDMs.


Character. Vector of values in metrics_df$metric to use when finding the 'best' model.


Logical. Run base::rm(list = ls()) and base::gc() at end of function? Useful when running SDMs for many, many taxa, especially if done in parallel.


Logical. If outputs already exist, should they be remade?


Passed to evaluate_sdm(). e.g. thresholds for use in predicts::pa_evaluate() (as tr argument, although if used, the values of the thresholds element of the pa_ModelEvaluation object returned by predicts::pa_evaluate() will be limited to the values in tr).


If return_val is "object" a named list. If return_val is "path" a named list list(tune = out_dir). If out_dir is a valid path, the 'full result' (irrespective of return_val) is also saved to fs::path(out_dir, "tune.rds"). The 'full result' is a named list with elements:


  out_dir <- file.path(system.file(package = "envSDM"), "examples")

  data <- fs::path(system.file(package = "envSDM"), "examples") |>
    fs::dir_ls(regexp = "prep\\.rds$"
               , recurse = TRUE
               ) |>
    tibble::enframe(name = NULL, value = "prep") |>
    dplyr::mutate(taxa = gsub("\\.rds", "", basename(dirname(prep)))
                  , out_dir = fs::path(out_dir, taxa)

              , \(x) tune_sdm(prep = fs::path(x, "prep.rds")
                              , out_dir = x
                              , fc = "lq"
                              , rm = c(2, 3)
                              , trees = 500
                              , mtry = c(1:3)
                              , nodesize = 2
                              #, force_new = TRUE
#> [[1]]
#> [[1]]$tune_file
#> H:/temp/nige/RtmpAZgTQZ/temp_libpath362426b1106f/envSDM/examples/acaule/tune.rds
#> [[2]]
#> [[2]]$tune_file
#> H:/temp/nige/RtmpAZgTQZ/temp_libpath362426b1106f/envSDM/examples/bradypus/tune.rds

  # which tune args were best for each taxa using 'combo'?
  data %>%
    dplyr::mutate(tune = fs::path(out_dir, "tune.rds")
                  , tune = purrr::map(tune, rio::import)
                  , tune_mean = purrr::map(tune, "tune_mean")
                  ) %>%
    tidyr::unnest(cols = c(tune_mean)) %>%
    dplyr::filter(best) %>% # used 'combo' to determine 'best' as default in tune_sdm
    dplyr::select(taxa, algo, tune_args, combo, auc_po, IMAE, CBI, max_spec_sens)
#> # A tibble: 2 × 8
#>   taxa     algo   tune_args             combo auc_po  IMAE   CBI max_spec_sens
#>   <chr>    <chr>  <chr>                 <dbl>  <dbl> <dbl> <dbl>         <dbl>
#> 1 acaule   maxnet fc: lq. rm: 2         0.823  0.968 0.875 0.942         0.312
#> 2 bradypus rf     tr: 500. mt: 1. ns: 2 0.468  0.714 0.702 0.867         0.326

  # or best tune args choosing on just auc_po?
  data %>%
    dplyr::mutate(tune = fs::path(out_dir, "tune.rds")
                  , tune = purrr::map(tune, rio::import)
                  , all = purrr::map(tune, "tune_mean")
                  ) %>%
    tidyr::unnest(cols = c(all)) %>%
    dplyr::group_by(taxa) %>%
    dplyr::filter(auc_po == max(auc_po)) %>%
    dplyr::ungroup() %>%
    dplyr::select(taxa, algo, tune_args, auc_po, IMAE, CBI, max_spec_sens)
#> # A tibble: 2 × 7
#>   taxa     algo   tune_args             auc_po  IMAE   CBI max_spec_sens
#>   <chr>    <chr>  <chr>                  <dbl> <dbl> <dbl>         <dbl>
#> 1 acaule   rf     tr: 500. mt: 2. ns: 2  0.969 0.902 0.769         0.233
#> 2 bradypus maxnet fc: lq. rm: 3          0.723 0.631 0.830         0.448