PhenObs - Phenological observations in botanical gardens
PhenObs is a constantly growing database of phenological observations. They are conducted in botanical gardens, allowing for a large range of species to be observed. The pre-processing script does the following:
harmonize PhenObs data variable names from different years
harmonize species names from different years
add WGS84 coordinates to botanical gardens
If you have questions, suggestions, spot errors, or want to contribute, get in touch with us through planthub@idiv.de.
Requirements
To run the script, the following is needed:
Code
# load in libraries
library(data.table) # handle large datasets
library(openxlsx) # read Excel files
library(RJSONIO) # retrieve open source geodata
# clear workspace
# Removes every object that ls() lists in the current environment.
# NOTE(review): ls() is called without all.names = TRUE, so objects whose
# names start with a dot (such as the `.brd` path prefix used in the setwd()
# call of this script) are not removed by this line.
rm(list = ls())
Let’s get the PhenObs data from 2017 to 2020.
# set working directory (adapt this!)
# `.brd` is a path prefix that has to exist before this script runs; it is
# pasted directly in front of "PlantHub", without an extra separator.
setwd(paste0(.brd, "PlantHub"))

# PhenObs data, one file per release; all paths are relative to the working
# directory set above.
# NOTE(review): only the 2017/2018 file is read with "-9999" as the NA marker,
# and passing na.strings replaces fread's default marker "NA" - confirm that
# this matches the contents of all three files.
phen1 <- fread(
  "PhenObs/1877_6_Dataset/1877_2_PhenObs_data_2017_2018.csv",
  na.strings = "-9999"
) # 2017, 2018
phen2 <- fread(
  "PhenObs/3509_20_Dataset/data_PhenObs_2020(1).csv"
) # 2020
phen3 <- fread(
  "PhenObs/3519_74_Dataset/processeddata_PhenObs_2019.csv"
) # 2019
As the column names are different between the PhenObs versions, some harmonization has to be done. We will use a column dictionary created by PlantHub to this end. It translates different names referring to the same data to the same name.
# column names dictionary
# Read relative to the current working directory. The renaming code in this
# script uses two of its columns: `synonym` (a column name as it appears in a
# raw dataset) and `standard` (the name that column is renamed to).
colDict <- fread("PhenObs column dictionary.txt") # you may have to replace "_" by " " in the filename!
Let’s standardize the column names across datasets.
# standardize column names
# Step 1: for each yearly table, list the column names that have no entry in
# the synonym column of the dictionary (nothing is printed for a table whose
# columns are all covered).
print("Variables missing from the dictionary:")
for (k in seq_len(3)) {
  tblName <- paste0("phen", k)
  unknownCols <- setdiff(colnames(get(tblName)), colDict$synonym)
  if (length(unknownCols) > 0) {
    print(tblName)
    print(unknownCols)
  }
}

# Step 2: walk through the dictionary row by row and rename every synonym
# found in a table to its standard name. setnames() works by reference, so
# phen1, phen2 and phen3 themselves are renamed.
for (k in seq_len(3)) {
  tbl <- get(paste0("phen", k))
  for (r in seq_len(nrow(colDict))) {
    oldName <- colDict$synonym[r]
    if (oldName %in% colnames(tbl)) {
      setnames(tbl, oldName, colDict$standard[r])
    }
  }
}
Sometimes, we find values below zero in the data. While they may refer to the last year in some cases, the script below sets all negative values to zero.
# remove values < 0
# Helper: returns a non-numeric vector untouched; in a numeric vector every
# non-missing entry below zero is overwritten with 0 (NA entries are kept).
zeroNegatives <- function(v) {
  if (!is.numeric(v)) {
    return(v)
  }
  v[!is.na(v) & v < 0] <- 0
  v
}

# Apply the helper to all columns of each yearly table; `:=` assigns by
# reference, so phen1, phen2 and phen3 themselves are updated.
for (k in seq_len(3)) {
  tbl <- get(paste0("phen", k))
  tbl[, colnames(tbl) := lapply(.SD, zeroNegatives)]
}
We can now merge the three datasets. We use the “fill” argument as the number of columns (i.e. variables) is not the same across datasets.
# Stack the three yearly tables. Columns are matched by name; a column that is
# missing from one of the tables is filled with NA for that table's rows.
phen <- rbindlist(list(phen1, phen2, phen3), use.names = TRUE, fill = TRUE)
We will now do some cleanup. We will basically make sure that flowering / fruiting days are identical to the difference between end and beginning of flowering, and that the flower and fruit peaks fall into the former intervals.
# repair flowering and fruiting days: both durations are recomputed as
# end minus beginning, overwriting whatever the raw data contained
phen[, `:=`(
  flowering_days = flower_end_1 - flower_beg_1,
  fruiting_days = fruit_end - fruit_beg
)]

# repair flowering peak: a peak before the beginning is moved to the
# beginning, a peak after the end is moved to the end (rows where the
# comparison is NA are left alone)
phen[flower_beg_1 > flower_peak, flower_peak := flower_beg_1]
phen[flower_end_1 < flower_peak, flower_peak := flower_end_1]

# repair fruiting peak: same rule as for the flowering peak
phen[fruit_beg > fruit_peak, fruit_peak := fruit_beg]
phen[fruit_end < fruit_peak, fruit_peak := fruit_end]
Next, we rename the “Species” column; then, we remove NA-only columns.
# rename species name column
# setnames() renames by reference; the guard keeps the step a no-op when the
# table has no "Species" column.
if ("Species" %in% colnames(phen)) {
  setnames(phen, "Species", "AccSpeciesName")
}

# remove NA-only columns
# vapply() checks each column as it is stored; apply() would first convert the
# whole table into a single character matrix.
naCols <- colnames(phen)[vapply(phen, function(x) all(is.na(x)), logical(1))]
# Deleting an empty set of columns makes data.table warn, so only delete when
# at least one NA-only column was found.
if (length(naCols) > 0) {
  phen[, (naCols) := NULL]
}
The harmonization of species names is necessary because in one dataset, the species names are given with authors, and in the other, they are not. Therefore, we remove authors in case we find an uppercase letter or parenthesis in the middle of the species names, and move the found authors into an author column. We then look up the authors for the species names that came without authors.
# Split "Genus species Author" strings into species name and author.
# The author part is taken to start at the first run of whitespace that is
# followed by an uppercase letter or an opening parenthesis.
# Leading whitespace is stripped first: it would otherwise be found as the
# split point and leave an empty species name.
speciesClean <- sub("^\\s+", "", phen$AccSpeciesName)
authorStart <- regexpr("\\s+([[:upper:]]|\\()", speciesClean)

# Rows that carry an author. NA names give an NA match and are excluded.
authorRows <- which(!is.na(authorStart) & authorStart > 0)

# Make sure the author column exists, then fill it row-aligned: the values
# are computed from exactly the rows they are written to.
if (!"AccAuthorName" %in% colnames(phen)) {
  phen[, AccAuthorName := NA_character_]
}
if (length(authorRows) > 0) {
  authorText <- substring(speciesClean[authorRows], authorStart[authorRows])
  set(
    phen,
    i = authorRows,
    j = "AccAuthorName",
    value = sub("^\\s+", "", authorText)
  )
  # keep only the part in front of the author
  speciesClean[authorRows] <- substr(
    speciesClean[authorRows], 1, authorStart[authorRows] - 1
  )
}
phen[, AccSpeciesName := speciesClean]

# Look up the author for names that came without one: every row of a species
# gets the first non-missing author recorded for that species. Species with
# more than one distinct author are printed for manual inspection.
phen[, AccAuthorName := {
  knownAuthors <- unique(AccAuthorName[!is.na(AccAuthorName)])
  if (length(knownAuthors) > 1) {
    print(paste0(.BY[[1]], " - ", knownAuthors))
  }
  knownAuthors[1] # NA when no author is known for the species
}, by = AccSpeciesName]
PhenObs recorded plant functional traits for the species they observe. However, they were recorded only once. To be able to link phenological observations over the years to the trait values, we will copy the trait values into every year.
traitNames <- c(
  "leaf_area_garden_mm2", "SLA_garden_mm2_mg", "LDMC_garden_mg_g", "LeafC_garden", "LeafN_garden",
  "max_height_garden", "seed_weight_garden_mg"
)

# Distinct species / garden combinations (kept as a top-level object).
uniqueTraits <- phen[, c("AccSpeciesName", "Botanic_Garden_Name")]
uniqueTraits <- unique(uniqueTraits)

# A trait column may be absent from the merged table (for example when it was
# removed as an NA-only column); only existing columns can be filled.
presentTraits <- intersect(traitNames, colnames(phen))

# For each trait, copy the first non-missing value of a species / garden
# combination into all rows of that combination. Rows with a missing species
# or garden name are left untouched. Combinations with more than one distinct
# value are printed for manual inspection.
for (traitCol in presentTraits) {
  phen[
    !is.na(AccSpeciesName) & !is.na(Botanic_Garden_Name),
    (traitCol) := {
      traitVal <- unique(.SD[[1]])
      traitVal <- traitVal[!is.na(traitVal)]
      if (length(traitVal) > 1) {
        print(paste0(.BY[[1]], " / ", .BY[[2]], " / ", traitCol, " - ", traitVal))
      }
      traitVal[1] # NA of the column's type when no value was recorded
    },
    by = c("AccSpeciesName", "Botanic_Garden_Name"),
    .SDcols = traitCol
  ]
}
For plotting the sampling locations, i.e. botanical gardens, on an overview map, it may be good to have their coordinates. We retrieve them from OpenStreetMap using the city names given in the Botanic_Garden_Name column.
# Coordinate columns start out as NA and are filled per garden below.
# NA_real_ also works when the columns already exist (e.g. on a re-run).
phen[, Latitude_WGS84 := NA_real_]
phen[, Longitude_WGS84 := NA_real_]

# Helper: read one field of a search result as a single number; NA when the
# field is missing or is not exactly one value.
nominatimNumber <- function(hit, field) {
  if (!field %in% names(hit)) {
    return(NA_real_)
  }
  val <- suppressWarnings(as.numeric(hit[[field]]))
  if (length(val) == 1) val else NA_real_
}

# One lookup per distinct, non-missing garden name.
gardenNames <- unique(phen$Botanic_Garden_Name)
gardenNames <- gardenNames[!is.na(gardenNames)]

for (g in seq_along(gardenNames)) {
  gardenName <- as.character(gardenNames[g])
  # percent-encode the whole name (blanks, umlauts, "&", ...) for the URL
  queryUrl <- paste0(
    "https://nominatim.openstreetmap.org/search?city=",
    utils::URLencode(enc2utf8(gardenName), reserved = TRUE),
    "&limit=9&format=json"
  )
  # A failed request leaves this garden without coordinates instead of
  # aborting the whole script.
  hits <- tryCatch(
    fromJSON(queryUrl),
    error = function(e) {
      warning(
        "Coordinate lookup failed for ", gardenName, ": ", conditionMessage(e),
        call. = FALSE
      )
      list()
    }
  )
  Sys.sleep(1) # the public Nominatim service allows at most one request per second
  if (length(hits) == 0) {
    next # nothing found for this name
  }

  # take the result with the highest importance (the first one on ties or
  # when no importance is available)
  importance <- vapply(hits, nominatimNumber, numeric(1), field = "importance")
  bestIdx <- which.max(importance)
  if (length(bestIdx) == 0) {
    bestIdx <- 1L
  }
  bestHit <- hits[[bestIdx]]

  lat <- nominatimNumber(bestHit, "lat")
  lon <- nominatimNumber(bestHit, "lon")
  if (is.na(lat) || is.na(lon)) {
    next
  }
  gardenRows <- which(phen$Botanic_Garden_Name == gardenName)
  set(phen, i = gardenRows, j = "Latitude_WGS84", value = lat)
  set(phen, i = gardenRows, j = "Longitude_WGS84", value = lon)
}
Finally, we add a genus column and sort the columns of the dataset.
# add plant genus column: the species name with everything from the first
# blank onwards removed
phen[, "AccGenus" := gsub(" .*", "", AccSpeciesName)]

# sort columns: taxonomy and coordinate columns go first, all remaining
# columns follow in their current order
mainCols <- c("AccGenus", "AccSpeciesName", "AccAuthorName", "Latitude_WGS84", "Longitude_WGS84")
remainingCols <- setdiff(colnames(phen), mainCols)
setcolorder(phen, c(mainCols, remainingCols))
Now we can write the pre-processed PhenObs data.
# Write the processed table into the current working directory; the file name
# contains the date on which the script is run.
# NOTE(review): the name ends in ".gz" without ".csv". Presumably fwrite's
# default compress = "auto" gzips the output because of that extension -
# confirm with the installed data.table version.
fwrite(phen, file = paste0("PhenObs_processed_", Sys.Date(), ".gz"))