Get data using galah::atlas_occurrences()
get_galah(
aoi = NULL,
save_dir = NULL,
get_new = FALSE,
name = "galah",
data_map = NULL,
node = "ALA",
qry = NULL,
check_rel_metres = TRUE,
filter_inconsistent = TRUE,
galah_config = list(email = Sys.getenv("ALA_email"), download_reason_id = 0),
...
)
aoi: Optional simple feature (sf). Used to limit the occurrences returned, via galah::galah_geolocate().
save_dir: Character. Path to directory into which to save outputs. If NULL, results will be saved to here::here("out", "ds", "galah"). The file will be named galah.parquet.
get_new: Logical. If FALSE, will attempt to load the data from an existing save_dir.
name: Character. data_name value in envImport::data_map (or other data_map).
data_map: Dataframe or NULL. Mapping of fields to retrieve. See the example envImport::data_map.
node: Character. Name of atlas to use (see galah::atlas_occurrences()). Doesn't seem to work with node = "GBIF" and is untested on other nodes.
qry: NULL, or an object of class data_request created using galah::galah_call().
check_rel_metres: Logical. Ensure that coordinateUncertaintyInMetres is no less than generalisationInMetres?
filter_inconsistent: Logical. If TRUE, records with an inconsistency between the occurrenceStatus column and either organismQuantity or individualCount are removed. e.g. a record with occurrenceStatus == "ABSENT" but individualCount == 1 would be filtered out.
galah_config: List in the form of key = value pairs, suitable for galah::galah_config().
...: Passed to envImport::file_prep().
Value: Dataframe of occurrences, with a file saved to save_dir. A .bib citation file is created when download_reason_id != 10.
# Example setup: choose an output directory and configure galah credentials.
# library("envImport")
out_dir <- file.path(system.file(package = "envImport"), "examples")
## config -------
# Remember the currently selected atlas so it can be restored after the
# examples finish (see the clean-up step at the end).
old_atlas <- galah::galah_config()$atlas$region
# Credentials are read from environment variables; nothing sensitive is
# hard-coded in the example.
galah::galah_config(email = Sys.getenv("GBIF_email")
, username = Sys.getenv("GBIF_user")
, password = Sys.getenv("GBIF_pwd")
, caching = TRUE
, download_reason_id = 10 # testing
)
# Start with the GBIF atlas for the first query.
galah::galah_config(atlas = "GBIF")
#> Atlas selected: Global Biodiversity Information Facility (GBIF) [Global]
# Australian Bustards--------
# in the year 2000 (matches the galah::galah_filter(year == 2000) call below)
## 01: atlas = gbif --------
# Cache the query result on disk so re-running the examples avoids a
# fresh download from GBIF.
save_file <- fs::path(out_dir, "qry01", "qry01.rds")
if(!file.exists(save_file)) {
qry01 <- galah::galah_call() %>%
galah::galah_identify("Ardeotis australis") %>%
galah::galah_filter(year == 2000) %>%
galah::atlas_occurrences() %>%
dplyr::collect()
rio::export(qry01
, save_file
)
} else {
qry01 <- rio::import(save_file)
}
#> Warning: Missing `trust` will be set to FALSE by default for RDS in 2.0.0.
## 02: atlas = ala ----------
# Switch to the ALA atlas and rerun the same taxon/year query there.
galah::galah_config(atlas = "ALA")
#> Atlas selected: Atlas of Living Australia (ALA) [Australia]
galah::galah_config(email = Sys.getenv("ALA_email"))
# 'qry' used for both qry02 and qry03
qry <- galah::galah_call() %>%
galah::galah_identify("Ardeotis australis") %>%
galah::galah_filter(year == 2000)
# Cache qry02 on disk, same pattern as qry01 above.
save_file <- fs::path(out_dir, "qry02", "qry02.rds")
if(!file.exists(save_file)) {
qry02 <- qry %>%
galah::atlas_occurrences()
rio::export(qry02
, save_file
)
} else {
qry02 <- rio::import(save_file
, setclass = "tibble"
)
}
#> Warning: Missing `trust` will be set to FALSE by default for RDS in 2.0.0.
# similar (but not identical) # of records from the two atlases
nrow(qry01)
#> [1] 857
nrow(qry02)
#> [1] 857
## 03: get_galah ---------
# Run the same data_request through get_galah(), which saves a parquet file
# and remaps the returned columns according to 'data_map'.
qry03 <- get_galah(save_dir = fs::path(out_dir, "qry03")
, data_map = data_map
, qry = qry
)
#> save_file will be C:/temp/joel/RtmpKSVQ8v/temp_libpath23d833687f18/envImport/examples/qry03/galah/galah.parquet
# again, not quite the same number of records
nrow(qry02)
#> [1] 857
nrow(qry03)
#> [1] 852
# get_galah removes, via envImport::remap_data_names NULL dates, lat and long
# see arguments to envImport::remap_data_names
# filtering qry02 on those columns gives the same result as qry03
qry02 %>%
dplyr::filter(!is.na(eventDate)
, !is.na(decimalLatitude)
, !is.na(decimalLongitude)
) %>%
nrow()
#> [1] 852
# names from data_map
# raw galah column names (qry02) vs the remapped names produced by get_galah
names(qry02)
#> [1] "recordID" "scientificName" "taxonConceptID" "decimalLatitude"
#> [5] "decimalLongitude" "eventDate" "occurrenceStatus" "dataResourceName"
names(qry03)
#> [1] "data_name" "site" "date" "lat"
#> [5] "long" "original_name" "common" "survey"
#> [9] "rel_metres" "method" "obs" "kingdom"
#> [13] "year" "month" "occ"
## 04: get_galah with profile -------
# Apply a data-quality profile (CSDM) to the request before downloading;
# profile filters are applied server-side by the atlas.
qry04 <- get_galah(save_dir = fs::path(out_dir, "qry04")
, data_map = data_map
, qry = qry %>%
galah::apply_profile(CSDM)
)
#> save_file will be C:/temp/joel/RtmpKSVQ8v/temp_libpath23d833687f18/envImport/examples/qry04/galah/galah.parquet
# lost some records due to the profile (852 without profile, see qry03)
nrow(qry04)
#> [1] 715
############################################
# Combine data --------
# Download several data sources into one sub_dir so they can be row-bound.
## get_galah for aoi -------
bio_all_galah <- get_galah(aoi = envImport::aoi
, save_dir = out_dir
, data_map = data_map
, sub_dir = "bio_all"
)
#> save_file will be C:/temp/joel/RtmpKSVQ8v/temp_libpath23d833687f18/envImport/examples/bio_all/galah.parquet
## get_tern for aoi --------
bio_all_tern <- get_tern(aoi = envImport::aoi
, save_dir = out_dir
, data_map = data_map
, sub_dir = "bio_all"
)
#> save_file will be C:/temp/joel/RtmpKSVQ8v/temp_libpath23d833687f18/envImport/examples/bio_all/tern.parquet
## or using get_data -------
# to get both galah and tern
# get_data() dispatches on the data-source name to the matching get_* function.
datas <- c("galah", "tern", "gbif")
# galah and tern already run from above
# (get_new = FALSE, so existing parquet files are loaded rather than re-downloaded)
# NOTE(review): previous_key presumably identifies an earlier GBIF download to
# reuse rather than requesting a new one — confirm against the gbif getter.
temp <- purrr::map(datas
, \(x) get_data(x
, save_dir = out_dir
, get_new = FALSE
, aoi = envImport::aoi
, data_map = data_map
, sub_dir = "bio_all"
, previous_key = "0057643-240626123714530"
)
)
#> save_file will be C:/temp/joel/RtmpKSVQ8v/temp_libpath23d833687f18/envImport/examples/bio_all/galah.parquet
#> save_file will be C:/temp/joel/RtmpKSVQ8v/temp_libpath23d833687f18/envImport/examples/bio_all/tern.parquet
#> save_file will be C:/temp/joel/RtmpKSVQ8v/temp_libpath23d833687f18/envImport/examples/bio_all/gbif.parquet
## single dataset --------
# Collect every parquet file written into the shared sub_dir ...
bio_all_files <- fs::dir_ls(fs::path(out_dir, "bio_all")
, regexp = "\\.parquet"
)
# ... and row-bind them into one data frame.
bio_all <- purrr::map_dfr(bio_all_files, \(x) rio::import(x))
# Not run: diagnostics for when the sources disagree on column classes and
# the bind above fails or mis-types columns.
if(FALSE) {
# check for misaligned classes
# For each column, TRUE if the class is identical across all datasets in
# 'temp'; rows remaining after the filter are the problem columns.
check <- purrr::map_dfr(temp
, \(x) purrr::map(x, class)
) %>%
purrr::map_dfr(\(x) length(unique(na.omit(x))) == 1) %>%
tidyr::pivot_longer(everything()) %>%
dplyr::filter(!value)
# Work around a mismatch by forcing a common arrow schema (here, 'quantity'
# as string) and re-reading the parquet files through arrow.
use_schema <- arrow::schema(bio_all)
use_schema$quantity <- arrow::Field$create("quantity", arrow::string())
bio_all <- arrow::open_dataset(bio_all_files
, schema = use_schema
) %>%
dplyr::collect()
}
# 'bio_all' is now the sum of its components
# (sanity check: combined rows equal the total of the per-source datasets)
nrow(bio_all) == sum(purrr::map_dbl(temp, nrow))
#> [1] TRUE
# clean up -------
# return to original atlas (saved as 'old_atlas' in the config step)
galah::galah_config(atlas = old_atlas)