galah::search_taxa()
Source: R/make_unmatched_overrides.R
make_unmatched_overrides.Rd
For an unmatched 'name', try to find a match via: rgbif::name_usage();
rgbif::name_backbone(); and removal of any characters in 'name' after an
'x' or 'X' (i.e. treat hybrids as just the first taxon). Using
rgbif::name_backbone() allows fuzzy matching to fix spelling errors. Any
results are passed back to galah::search_taxa() to retrieve an
override to use for that 'name' (so long as the rgbif result is not identical
to 'name'). Any 'name' still completely unmatched is just given the override
'name' so it will not be lost from downstream processes but will not have any
associated taxonomic information.
make_unmatched_overrides(
df,
taxa_col = "original_name",
taxonomy,
target_rank = "species",
hybrids = FALSE,
include_unmatched = TRUE,
results_file = tempfile(fileext = ".parquet"),
remove_taxa = c("bold:", "unverified", "undetermined", "unidentified", "annual herb",
"annual grass", "incertae sedis", "\\?", "another\\s", "not naturalised in sa",
"annual tussock grass", "*no id", "spec\\.", "\\s\\-\\-\\s.*",
"\\ssp\\.", "\\sspec\\.", "\\ssp$", "\\ssp\\d", "\\ssp\\s",
"\\sspp\\.", "\\sspp\\s", "\\sspp$", "dead", "unknown", "\\sgroup$",
"\\sspecies$", "aquatic grass", "hybrid", "\\scultivar$", "\\scomplex$",
"\\ssect\\.", "\\ss\\.\\sstr\\.", "\\(includes\\s"),
tri_strings = c("\\sssp\\s", "\\sssp\\.", "\\svar\\s", "\\svar\\.",
"\\ssubsp\\.", "\\ssubspecies", "\\sform\\)", "\\sform\\s",
"\\sf\\.", "\\srace\\s", "\\srace\\)", "\\sp\\.v\\.")
)
Arguments
df: Dataframe of biological records.
taxa_col: Character. Name of column in df containing the taxonomic
entities for which a match is desired.
taxonomy: Result of call to make_taxonomy().
target_rank: Character. Level within envClean::lurank$rank to target.
hybrids: Logical. Create overrides for hybrids (e.g. original names with 'x')?
include_unmatched: Logical. Create overrides for taxa not matched via gbif using their original names?
results_file: File path to write results of searches. Previous results files are used to avoid redoing time-consuming searches for taxa that are not matched via gbif and not written to the taxonomy file in make_taxonomy.
remove_taxa: Character. Taxa for which tolower(taxa_col) matches any of the regular expressions in remove_taxa
will not be searched or have overrides constructed.
tri_strings: Character. Taxon names containing any of these strings, which indicate a trinomial, will not be included as a binomial override (i.e. avoids the use_species column in the overrides being populated with trinomial names).
Value
Tibble in appropriate form to pass to the overrides argument of
make_taxonomy()
# setup
# library("envClean")
temp_file <- tempfile()

# Example names: plain binomials, trinomials, a genus-only entry, informal
# phrase names, a hybrid-style name and one string that is not a taxon at all
taxa_df <- tibble::tibble(
  taxa = c(
    "Charadrius rubricollis",
    "Thinornis cucullatus",
    "Melithreptus gularis laetior",
    "Melithreptus gularis gularis",
    "Eucalyptus viminalis",
    "Eucalyptus viminalis cygnetensis",
    "Eucalyptus",
    "Charadrius mongolus all subspecies",
    "Bettongia lesueur Barrow and Boodie Islands subspecies",
    "Lagorchestes hirsutus Central Australian subspecies",
    "Perameles gunnii Victorian subspecies",
    "Pterostylis sp. Rock ledges (pl. 185, Bates & Weber 1990)",
    "Spyridium glabrisepalum",
    "Spyridium eriocephalum var. glabrisepalum",
    "Petrogale lateralis (MacDonnell Ranges race)",
    "Gehyra montium (revised)",
    "Korthalsella japonica f. japonica",
    "Galaxias sp. nov. 'Hunter'",
    "Some rubbish",
    "Senna artemisioides subsp x artemisioides",
    "Halosarcia sp. (NC)",
    "TERMITOIDAE sp." # 'epifamily'
  )
)
# make taxonomy (returns a list and writes taxonomy_file)
taxonomy <- make_taxonomy(
  df = taxa_df,
  taxa_col = "taxa",
  taxonomy_file = temp_file,
  needed_ranks = c("kingdom", "genus", "species", "subspecies")
)
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#> ℹ try using `brew()
# inspect the raw results, then the element for each rank in needed_ranks
taxonomy$raw
#> Error: object 'taxonomy' not found
taxonomy$kingdom
#> Error: object 'taxonomy' not found
taxonomy$genus
#> Error: object 'taxonomy' not found
taxonomy$species
#> Error: object 'taxonomy' not found
taxonomy$subspecies
#> Error: object 'taxonomy' not found
# query more taxa: results are added to taxonomy_file, but only the new taxa
# are returned (default `limit = TRUE`)
more_taxa <- tibble::tibble(
  original_name = c(
    "Amytornis whitei",
    "Amytornis striatus",
    "Amytornis modestus (North, 1902)",
    "Amytornis modestus modestus",
    "Amytornis modestus cowarie"
  )
)
make_taxonomy(
  df = more_taxa,
  taxonomy_file = temp_file,
  needed_ranks = "species"
)
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#> ℹ try using `brew()
# no dataframe supplied - all results in taxonomy_file returned
make_taxonomy(
  taxonomy_file = temp_file,
  needed_ranks = "subspecies"
)
#> Error: No such file: /tmp/RtmpOXQ9vM/file16b513152a9bf9.parquet
# Try automatic overrides for the names in taxa_df, targeting species level
auto_overrides <- make_unmatched_overrides(
  df = taxa_df,
  taxa_col = "taxa",
  taxonomy = taxonomy,
  target_rank = "species"
)
#> Error: object 'taxonomy' not found
# overrides
overrides <- envClean::taxonomy_overrides
# without overrides: C. rubricollis binned to Phalaropus lobatus at species level!
taxonomy <- make_taxonomy(
  df = overrides,
  taxonomy_file = temp_file,
  needed_ranks = c("species", "subspecies")
)
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#> ℹ try using `brew()
taxonomy$species$lutaxa %>%
  dplyr::filter(grepl("rubricollis", original_name))
#> Error: object 'taxonomy' not found
# add in override - C. rubricollis is binned to T. cucullatus at species level
taxonomy <- make_taxonomy(
  df = overrides,
  taxonomy_file = temp_file,
  needed_ranks = c("species", "subspecies"),
  overrides = overrides
)
#> Joining with `by = join_by(original_name)`
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_package(.pkg): No data stored by `potions`
#> ℹ try using `brew()
taxonomy$species$lutaxa %>%
  dplyr::filter(grepl("rubricollis", original_name))
#> Error: object 'taxonomy' not found
# tweak_species example: the same phrase name queried with tweak_species
# switched off and then on, comparing the name columns of the raw result
make_taxonomy(
  df = tibble::tibble(
    original_name = "Acacia sp. Small Red-leaved Wattle (J.B.Williams 95033)"
  ),
  tweak_species = FALSE
)$raw %>%
  dplyr::select(original_name, scientific_name, species)
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#> ℹ try using `brew()
make_taxonomy(
  df = tibble::tibble(
    original_name = "Acacia sp. Small Red-leaved Wattle (J.B.Williams 95033)"
  ),
  tweak_species = TRUE
)$raw %>%
  dplyr::select(original_name, scientific_name, species)
#> Joining with `by = join_by(original_name)`
#> Error in check_pour_interactive(.slot): No data stored by `potions`
#> ℹ try using `brew()
# clean up
rm(taxonomy)
#> Warning: object 'taxonomy' not found
# Delete the temporary taxonomy file (temp_file with a ".parquet" extension).
# unlink() removes the file; the previous unlist() call only returned the path
# string and left the file on disk. unlink() is silent if the file is absent.
unlink(paste0(temp_file, ".parquet"))