For an unmatched 'name', try to find a match via: rgbif::name_usage(); rgbif::name_backbone(); and removal of any characters in 'name' after an 'x' or 'X' (i.e. treat hybrids as just the first taxon). Using rgbif::name_backbone() allows fuzzy matching to fix spelling errors. Any results are passed back to galah::search_taxa() to retrieve an override to use for that 'name' (so long as the rgbif result is not identical to 'name'). Any 'name' that is still completely unmatched is given 'name' itself as its override, so it will not be lost from downstream processes but will not have any associated taxonomic information.

make_unmatched_overrides(
  df,
  taxa_col = "original_name",
  taxonomy,
  target_rank = "species",
  hybrids = FALSE,
  include_unmatched = TRUE,
  results_file = tempfile(fileext = ".parquet"),
  remove_taxa = c("bold:", "unverified", "undetermined", "unidentified", "annual herb",
    "annual grass", "incertae sedis", "\\?", "another\\s", "not naturalised in sa",
    "annual tussock grass", "*no id", "spec\\.", "\\s\\-\\-\\s.*",
    "\\ssp\\.", "\\sspec\\.", "\\ssp$", "\\ssp\\d", "\\ssp\\s",
    "\\sspp\\.", "\\sspp\\s", "\\sspp$", "dead", "unknown", "\\sgroup$",
    "\\sspecies$", "aquatic grass", "hybrid", "\\scultivar$", "\\scomplex$",
    "\\ssect\\.", "\\ss\\.\\sstr\\.", "\\(includes\\s"),
  tri_strings = c("\\sssp\\s", "\\sssp\\.", "\\svar\\s", "\\svar\\.",
    "\\ssubsp\\.", "\\ssubspecies", "\\sform\\)", "\\sform\\s",
    "\\sf\\.", "\\srace\\s", "\\srace\\)", "\\sp\\.v\\.")
)

Arguments

df

Dataframe of biological records

taxa_col

Character. Name of column in df containing the taxonomic entities for which a match is desired.

taxonomy

Result of call to make_taxonomy()

target_rank

Character. Level within envClean::lurank$rank to target

hybrids

Logical. Create overrides for hybrids (e.g. original names with 'x')?

include_unmatched

Logical. For taxa not matched via gbif, create overrides that use their original names?

results_file

File path to which the results of searches are written. Any previous results file is reused to avoid repeating time-consuming searches for taxa that are not matched via gbif and are not written to the taxonomy file in make_taxonomy.

remove_taxa

Character. Regular expressions. Any taxon whose tolower(taxa_col) value matches one of remove_taxa will not be searched or have overrides constructed.

tri_strings

Character. Taxa names with these strings that indicate a trinomial will not be included as a binomial override (i.e. avoids the use_species column in the overrides being populated with trinomial names).

Value

Tibble in appropriate form to pass to the overrides argument of make_taxonomy()

Examples


  # setup
  # library("envClean")

  # Path stem for the taxonomy file shared by the make_taxonomy() calls below.
  # No extension is given here; the paths shown in the rendered output of this
  # example end in ".parquet".
  temp_file <- tempfile()

  # Example names: binomials, trinomials, a genus-only name, informal
  # subspecies/race labels, phrase names ("sp."), a name containing 'x',
  # and a nonsense entry ("Some rubbish").
  taxa_df <- tibble::tibble(taxa = c("Charadrius rubricollis"
                                     , "Thinornis cucullatus"
                                     , "Melithreptus gularis laetior"
                                     , "Melithreptus gularis gularis"
                                     , "Eucalyptus viminalis"
                                     , "Eucalyptus viminalis cygnetensis"
                                     , "Eucalyptus"
                                     , "Charadrius mongolus all subspecies"
                                     , "Bettongia lesueur Barrow and Boodie Islands subspecies"
                                     , "Lagorchestes hirsutus Central Australian subspecies"
                                     , "Perameles gunnii Victorian subspecies"
                                     , "Pterostylis sp. Rock ledges (pl. 185, Bates & Weber 1990)"
                                     , "Spyridium glabrisepalum"
                                     , "Spyridium eriocephalum var. glabrisepalum"
                                     , "Petrogale lateralis (MacDonnell Ranges race)"
                                     , "Gehyra montium (revised)"
                                     , "Korthalsella japonica f. japonica"
                                     , "Galaxias sp. nov. 'Hunter'"
                                     , "Some rubbish"
                                     , "Senna artemisioides subsp x artemisioides"
                                     , "Halosarcia sp.  (NC)"
                                     , "TERMITOIDAE sp." # 'epifamily'
                                     )
                            )

  # make taxonomy (returns list and writes taxonomy_file)
  # NOTE(review): the rendered output below is an error raised via `potions`
  # ("No data stored"), so `taxonomy` was never created when this page was
  # built and every later lookup fails. Presumably a configuration step was
  # missing at build time; confirm.
  taxonomy <- make_taxonomy(df = taxa_df
                            , taxa_col = "taxa"
                            , taxonomy_file = temp_file
                            , needed_ranks = c("kingdom", "genus", "species", "subspecies")
                            )
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#>  try using `brew()
  # inspect the raw element and one element per requested rank
  taxonomy$raw
#> Error: object 'taxonomy' not found
  taxonomy$kingdom
#> Error: object 'taxonomy' not found
  taxonomy$genus
#> Error: object 'taxonomy' not found
  taxonomy$species
#> Error: object 'taxonomy' not found
  taxonomy$subspecies
#> Error: object 'taxonomy' not found

  # query more taxa: results are added to taxonomy_file, but only the new taxa
  # are returned (default `limit = TRUE`)
  more_taxa <- tibble::tibble(original_name = c("Amytornis whitei"
                                                , "Amytornis striatus"
                                                , "Amytornis modestus (North, 1902)"
                                                , "Amytornis modestus modestus"
                                                , "Amytornis modestus cowarie"
                                                )
                              )

  # taxa_col is not supplied here; the names are in a column called
  # original_name
  make_taxonomy(df = more_taxa
                , taxonomy_file = temp_file
                , needed_ranks = c("species")
                )
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#>  try using `brew()

  # no dataframe supplied - all results already in taxonomy_file are returned
  # (in this rendered run the file was never written, hence the error below)
  make_taxonomy(taxonomy_file = temp_file
                , needed_ranks = c("subspecies")
                )
#> Error: No such file: /tmp/RtmpOXQ9vM/file16b513152a9bf9.parquet

  # Try automatic overrides: pass the same names (`taxa_df`) together with the
  # `taxonomy` result from above, targeting species rank
  auto_overrides <- make_unmatched_overrides(df = taxa_df
                                             , taxa_col = "taxa"
                                             , taxonomy = taxonomy
                                             , target_rank = "species"
                                             )
#> Error: object 'taxonomy' not found

  # overrides: use the envClean::taxonomy_overrides object
  overrides <- envClean::taxonomy_overrides

  # C. rubricollis binned to Phalaropus lobatus at species level!
  # (the overrides are passed as `df` only; the `overrides` argument is not used)
  taxonomy <- make_taxonomy(df = overrides
                            , taxonomy_file = temp_file
                            , needed_ranks = c("species", "subspecies")
                            )
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#>  try using `brew()

  # show the species-level lookup rows whose original_name contains "rubricollis"
  taxonomy$species$lutaxa %>%
    dplyr::filter(grepl("rubricollis", original_name))
#> Error: object 'taxonomy' not found

  # add in override - C. rubricollis is binned to T. cucullatus at species level
  # (same call as before, now also passing `overrides = overrides`)
  taxonomy <- make_taxonomy(df = overrides
                            , taxonomy_file = temp_file
                            , needed_ranks = c("species", "subspecies")
                            , overrides = overrides
                            )
#> Joining with `by = join_by(original_name)`
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_package(.pkg): No data stored by `potions`
#>  try using `brew()

  # show the species-level lookup rows whose original_name contains "rubricollis"
  taxonomy$species$lutaxa %>%
    dplyr::filter(grepl("rubricollis", original_name))
#> Error: object 'taxonomy' not found


  # tweak_species example: the same phrase name is run with
  # tweak_species = FALSE and then TRUE, keeping original_name,
  # scientific_name and species from $raw so the two results can be compared
  make_taxonomy(df = tibble::tibble(original_name = "Acacia sp. Small Red-leaved Wattle (J.B.Williams 95033)")
                , tweak_species = FALSE
                )$raw %>%
    dplyr::select(original_name, scientific_name, species)
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#>  try using `brew()

  make_taxonomy(df = tibble::tibble(original_name = "Acacia sp. Small Red-leaved Wattle (J.B.Williams 95033)")
                , tweak_species = TRUE
                )$raw %>%
    dplyr::select(original_name, scientific_name, species)
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#>  try using `brew()

  # clean up: drop the taxonomy object and delete the taxonomy file.
  # `unlink()` deletes the file (`unlist()` would merely return the path and
  # leave the file on disk). The ".parquet" suffix matches the file path shown
  # in the rendered output of this example.
  rm(taxonomy)
#> Warning: object 'taxonomy' not found
  unlink(paste0(temp_file, ".parquet"))