Prepare plant resprouting capacity data from TRY for use
The plant resprouting capacity data from TRY informs on whether a plant is able to resprout after disturbance.
If you intend to clean more than one or two traits, we recommend the use of the batch pre-processing script. Refer to the TRY main page for details.
If you have questions, suggestions, spot errors, or want to contribute, get in touch with us through planthub@idiv.de.
Author: David Schellenberger Costa
Requirements#
To run the script, the following is needed:
TRY data, available here
the data.table library may need to be installed
Code#
# load in libraries
library(data.table) # handle large datasets
# clear workspace
# NOTE: ls() does not list objects whose names start with a dot, so such
# objects (e.g. `.brd`, which is used below to build the working directory)
# are not removed here; packages that are already attached stay attached too
rm(list = ls())
Let’s get the TRY data
# switch to the PlantHub folder (adapt this!)
# `.brd` has to exist in the session already; it is used as the path prefix
# that "PlantHub" is appended to
setwd(paste0(.brd, "PlantHub"))
# load the TRY table from the compressed file (adapt this!)
TRY <- fread("TRY_PlantHub.gz")
# keep only the observations of the trait that is cleaned in this script
TRYSubset <- TRY[TraitName %in% "Plant resprouting capacity"]
The plant resprouting capacity data from TRY is a container for numeric (resprouting strength) and categorical data. We will process the categorical data first and the numeric data afterwards. We will prepare a matrix with two columns to store the entries belonging to these two types separately.
To get an overview of the data, we convert values to lowercase, sort them, and show them as a table.
# take the original value strings and lowercase them right away, which eases
# the later classification (oriVals == original values)
oriVals <- tolower(TRYSubset$OrigValueStr)
# summarise the distinct values and show them ordered by how often they occur
# (rarest first)
valueOverview <- table(oriVals)
sort(valueOverview)
# container for the cleaned values: one row per observation, two columns
# (column 1 is filled with the categories, column 2 with the numeric values)
newVals <- matrix(NA, nrow = length(oriVals), ncol = 2)
There are some coded entries we need to take care of. They include the numbers 0 and 1, but also “high”, “low”, “yes”, “no”, and “variable”. We need to decode these values. We then remove purely numeric values.
# datasets that contain bare "0" or "1" entries
zeroOnes <- table(TRYSubset$Dataset[TRYSubset$OrigValueStr %in% c("0", "1")])
codedSets <- names(zeroOnes)
# look at the values each of these datasets contributes
table(oriVals[TRYSubset$Dataset %in% codedSets[1]]) # numbers, high, low, yes, no, variable
table(oriVals[TRYSubset$Dataset %in% codedSets[2]]) # 0,1
table(oriVals[TRYSubset$Dataset %in% codedSets[3]]) # numbers
table(oriVals[TRYSubset$Dataset %in% codedSets[4]]) # 0,1
# datasets 2 and 4: here 0/1 are codes, so they are translated together with
# the words
for (setName in codedSets[c(2, 4)]) {
  inSet <- TRYSubset$Dataset %in% setName
  decoded <- sub("^(high|yes|1)$", "resprouting", oriVals[inSet])
  oriVals[inSet] <- sub("^(low|no|0)$", "not resprouting", decoded)
}
# dataset 1: only the words are translated, numbers are left untouched
inFirst <- TRYSubset$Dataset %in% codedSets[1]
oriVals[inFirst] <- sub(
  "^(low|no)$", "not resprouting",
  sub("^(high|yes)$", "resprouting", oriVals[inFirst])
)
# drop every entry without a lowercase letter (purely numeric values and the
# like); they are not categorical
hasLetter <- grepl("[[:lower:]]", oriVals)
oriVals[!hasLetter] <- NA
The most important part of the cleaning process is the definition of the search strings to look for. We use regular expressions in some cases to be more inclusive (or exclusive).
# one regular expression per target category, each assembled from its
# alternatives; the order matches the category names assigned to the search
# results: resprouting, lignotuberous, epicormic, root-resprouting,
# not resprouting
# note: "high" and "low" are not anchored and therefore match as substrings
searchNames <- c(
  paste(
    c(
      "^rt$", "^resprouter", "resprout after fire", "resprouting",
      "survives (100% )?scorch", "survives (100% )?fire", "^y(es)?$", "high"
    ),
    collapse = "|"
  ),
  paste(c("\\Wl\\W", "lignotuber"), collapse = "|"),
  paste(c("\\We\\W", "epicorm"), collapse = "|"),
  paste(c("\\Wr\\W", "root"), collapse = "|"),
  paste(
    c(
      "^fire-killed", "killed by (100% )?fire", "killed by (100% )?scorch",
      "not resprouting", "^no?$", "low"
    ),
    collapse = "|"
  )
)
We can now search for the strings defined before and give names to the new categories.
# test every value against each search pattern; the result has one logical
# column per pattern (NA values never match)
searchResults <- sapply(searchNames, function(pattern) grepl(pattern, oriVals))
# name the columns of the searchResults matrix after the resprouting categories
colnames(searchResults) <- c(
  "resprouting",
  "lignotuberous",
  "epicormic",
  "root-resprouting",
  "not resprouting"
)
Let’s have a look at the results.
# number of categories each entry was matched to
matchCounts <- rowSums(searchResults)
# show the number of matches per category
colSums(searchResults)
# show the original entries without any match (this includes the entries that
# were set to NA beforehand)
oriVals[matchCounts < 1]
# show how many entries were not matched to any category
sum(matchCounts < 1)
# show how many entries were matched to more than one category
sum(matchCounts > 1)
We need to take into account that “lignotuberous”, “epicormic”, and “root-resprouting” are all sub-categories of “resprouting”, and that “not resprouting” cannot occur together with any other category.
# consider logical relationships
# (this step used to sit on a single line behind a leading "#", so R treated
# all of it as a comment and none of it was executed)
# "lignotuberous", "epicormic" and "root-resprouting" are sub-categories of
# "resprouting": an entry matched to any of them is resprouting as well and
# cannot be "not resprouting" at the same time
subTypes <- c("lignotuberous", "epicormic", "root-resprouting")
isSubType <- rowSums(searchResults[, subTypes, drop = FALSE]) > 0
searchResults[isSubType, "resprouting"] <- TRUE
searchResults[isSubType, "not resprouting"] <- FALSE
# entries that are still flagged as "not resprouting" must not be flagged as
# "resprouting" too
searchResults[searchResults[, "not resprouting"], "resprouting"] <- FALSE
Now, we can create new strings with the cleaned values and add them to the observations. To not remove the original entries, we will create a new column called “CleanedValueStr”.
# for every observation, join the names of all matched categories into one
# comma-separated string and store it in the first column of newVals
categoryNames <- colnames(searchResults)
newVals[, 1] <- sapply(seq_len(nrow(searchResults)), function(rowIdx) {
  matched <- searchResults[rowIdx, ]
  paste(categoryNames[matched], collapse = ",")
})
# observations without any matched category get NA instead of an empty string
noCategory <- newVals[, 1] == ""
newVals[, 1][noCategory] <- NA
We will now process the numerical values.
To get an overview of the data, we convert values to lowercase, sort them, and show them as a table.
# start again from the original value strings, lowercased to ease the later
# classification (oriVals == original values)
oriVals <- tolower(TRYSubset$OrigValueStr)
# summarise the distinct values and show them ordered by how often they occur
# (rarest first)
valueOverview <- table(oriVals)
sort(valueOverview)
There are some coded entries we need to take care of. They include the numbers 0 and 1, but also “high”, “low”, “yes”, “no”, and “variable”. We need to decode these values. We then remove purely categorical values.
# datasets that contain bare "0" or "1" entries
zeroOnes <- table(TRYSubset$Dataset[TRYSubset$OrigValueStr %in% c("0", "1")])
codedSets <- names(zeroOnes)
# look at the values each of these datasets contributes
table(oriVals[TRYSubset$Dataset %in% codedSets[1]]) # numbers, high, low, yes, no, variable
table(oriVals[TRYSubset$Dataset %in% codedSets[2]]) # 0,1
table(oriVals[TRYSubset$Dataset %in% codedSets[3]]) # numbers
table(oriVals[TRYSubset$Dataset %in% codedSets[4]]) # 0,1
# datasets 2 and 4: 0/1 are codes; once translated into words they no longer
# contain a digit and are dropped by the digit filter below
for (setName in codedSets[c(2, 4)]) {
  inSet <- TRYSubset$Dataset %in% setName
  decoded <- sub("^(high|yes|1)$", "resprouting", oriVals[inSet])
  oriVals[inSet] <- sub("^(low|no|0)$", "not resprouting", decoded)
}
# dataset 1: only the words are translated, numbers are left untouched
inFirst <- TRYSubset$Dataset %in% codedSets[1]
oriVals[inFirst] <- sub(
  "^(low|no)$", "not resprouting",
  sub("^(high|yes)$", "resprouting", oriVals[inFirst])
)
# remove purely categorical values, i.e. every entry without a digit
hasDigit <- grepl("\\d", oriVals)
oriVals[!hasDigit] <- NA
# convert to numbers and round to whole numbers; entries that contain a digit
# but are not plain numbers become NA here (R warns about the coercion)
newVals[, 2] <- round(as.numeric(oriVals))
Looking at the data, we notice there are two very different data ranges in the remaining newVals[,2].
# which datasets contribute numeric values?
hasStrength <- !is.na(newVals[, 2])
table(TRYSubset$Dataset[hasStrength])
# value distribution for each of the two contributing datasets
strengthVals <- newVals[, 2]
table(strengthVals[TRYSubset$Dataset == "BROT Plant Trait Database"]) # percent scale
table(strengthVals[TRYSubset$Dataset == "Miombo tree species - SLA, leaf and seed size"]) # scale unknown
As we do not know the scale of some values, it is necessary to remove them.
# discard the numeric values of the dataset whose scale is unknown
isMiombo <- TRYSubset$Dataset == "Miombo tree species - SLA, leaf and seed size"
newVals[, 2][isMiombo] <- NA
We can now integrate the data into TRY and use different trait names to separate resprouting type and resprouting strength. We will first write the data into a template storing the new values and trait names and then transfer it as a whole to TRY.
# write data into template: one entry per observation for the cleaned value,
# its unit and its new trait name
nObs <- length(oriVals)
hasType <- !is.na(newVals[, 1])
hasStrength <- !is.na(newVals[, 2])
newValsAll <- newUnitsAll <- rep(NA, nObs)
newTraitNamesAll <- rep("", nObs)
# categorical entries become the trait "Plant resprouting type" (no unit)
newValsAll[hasType] <- newVals[hasType, 1]
newTraitNamesAll[hasType] <- "Plant resprouting type"
newUnitsAll[hasType] <- ""
# numeric entries become the trait "Plant resprouting strength" (in percent);
# they are written second and would win if an observation had both
newValsAll[hasStrength] <- newVals[hasStrength, 2]
newTraitNamesAll[hasStrength] <- "Plant resprouting strength"
newUnitsAll[hasStrength] <- "%"
# NOTE(review): observations with neither a type nor a strength keep an NA
# value and unit and end up with an empty TraitName - confirm this is intended
# integrate into TRY: all three columns are updated in one step for the rows
# that still carry the old trait name
TRY[
  TraitName == "Plant resprouting capacity",
  `:=`(
    CleanedValueStr = newValsAll,
    OrigUnitStr = newUnitsAll,
    TraitName = newTraitNamesAll
  )
]
Let’s write the data to a file.
# write the processed table to a file whose name carries today's date
outFile <- paste0("TRY_processed_", Sys.Date(), ".gz")
fwrite(TRY, file = outFile)