Prepare Species tolerance to fire data from TRY for use

The Species tolerance to fire data from TRY informs on the ability of a species to cope with fire.

If you intend to clean more than one or two traits, we recommend the use of the batch pre-processing script. Refer to the TRY main page for details.

If you have questions, suggestions, spot errors, or want to contribute, get in touch with us through planthub@idiv.de.

Author: David Schellenberger Costa

Requirements

To run the script, the following is needed:

  • TRY data, available here

  • the data.table library may need to be installed

Code

# start from an empty workspace
rm(list = ls())

# data.table is used to read and handle the large TRY table
library(data.table)

Let’s get the TRY data.

# move into the PlantHub folder (adapt this!)
# NOTE(review): .brd is not defined in this script; it presumably holds the base
# directory and has to exist before the script is run — confirm where it is set
setwd(paste0(.brd, "PlantHub"))

# load the TRY table (adapt this!)
TRY <- fread("TRY_PlantHub.gz")

# keep only the rows belonging to the trait cleaned in this script
TRYSubset <- TRY[TraitName %in% "Species tolerance to fire"]

As there are numerical and categorical values here, we will start to process the categorical data, and do the numerical data afterwards. To get an overview of the data, we sort them, and show them as a table.

# keep the raw value strings of the subset (oriVals == original values)
oriVals <- TRYSubset[["OrigValueStr"]]

# count how often each distinct value occurs and list the counts from the
# rarest to the most frequent value
valueOverview <- table(oriVals)
sort(valueOverview)

There is some data that has to be decoded with the help of the OriglName column.

# Decode values whose meaning depends on the OriglName column.
#
# oriVals contains text entries (e.g. "high", "low"), so it is a character
# vector. Comparing it directly with a number compares strings: "5" > "20" is
# TRUE, and in the usual collation so is "low" > "1". All numeric thresholds
# are therefore evaluated on a numeric copy that is NA for non-numeric entries,
# and the text look-ups use an untouched copy of the original strings.
origNames <- TRYSubset$OriglName
rawVals <- oriVals
numVals <- suppressWarnings(as.numeric(rawVals)) # non-numeric entries become NA
isNum <- !is.na(numVals)

# minimum recommended fire intervals will be removed
# NOTE(review): the label is spelled "minimimum" here; it is kept unchanged
# because it has to match the spelling used in the data — confirm
oriVals[origNames %in% "Recommended minimimum interval"] <- NA

# seedling survival: numbers above 20 are "high", all other numbers "low"
isSeedlSurv <- origNames %in% "SeedlSurv"
oriVals[isSeedlSurv & isNum & numVals > 20] <- "high seedling survival"
oriVals[isSeedlSurv & isNum & numVals <= 20] <- "low seedling survival"

# seedling emergence: numbers above 1 are "high", all other numbers "low";
# text entries are translated directly, "variable" is dropped
isSeedlEmerg <- origNames %in% "SeedlEmerg"
oriVals[isSeedlEmerg & isNum & numVals > 1] <- "high seed survival"
oriVals[isSeedlEmerg & isNum & numVals <= 1] <- "low seed survival"
oriVals[isSeedlEmerg & rawVals %in% c("high", "yes")] <- "high seed survival"
oriVals[isSeedlEmerg & rawVals %in% c("low", "no")] <- "low seed survival"
oriVals[isSeedlEmerg & rawVals %in% c("variable")] <- NA

# fire_surv: codes 0 to 3 are replaced by their labels (code + 1 == position)
fireSurvLabels <- c(
	"always killed by fire",
	"often killed by fire",
	"sometimes killed by fire",
	"usually resprout after fire"
)
isFireSurv <- origNames %in% "fire_surv" & numVals %in% 0:3
oriVals[isFireSurv] <- fireSurvLabels[numVals[isFireSurv] + 1]

# fire.tol: only code 0 is decoded
oriVals[origNames %in% "fire.tol" & isNum & numVals == 0] <- "always killed by fire"

It looks like a good idea to remove purely numeric values.

# drop entries that contain no letter at all (purely numeric values, symbols);
# entries consisting only of upper-case codes such as "S" or "R" are kept,
# because the search strings defined further down look for exactly these codes
oriVals[!grepl("[[:alpha:]]", oriVals)] <- NA

The most important part of the cleaning process is the definition of the search strings to look for. We use regular expressions in some cases to be more inclusive (or exclusive).

# create a vector containing the search strings (regular expressions) to look for
# the order matters: the i-th search string yields the i-th column of the match
# matrix, and those columns are later named, in the same order, after the
# categories noted in the comments below
searchNames <- c(
	# seed survival: low, high
	"low seed survival",
	"high seed survival|^S(\\s|$)|seed storage in soil",
	# seedling survival: low, high
	# NOTE(review): "^seed" also matches entries starting with "seed storage in
	# soil", which the seed survival pattern above matches too — confirm intended
	"low seedling survival",
	"high seedling survival|^seed",
	# fire resistance: none, low, intermediate, high
	# NOTE(review): "^no(ne)?" matches every entry that starts with "no",
	# including "non-sprouting" — confirm intended
	"Killed by 100% scorch|always killed by fire|total|not fire resistant|^no(ne)?",
	"often killed by fire|(L|l)ow$|\\sr$",
	"sometimes killed by fire|moderate|(M|m)edium|\\ss$",
	"Survives 100% scorch|^fire resistant|Yes|(H|h)igh$",
	# type of resprouting: epicormic, lignotuberous, rhizome, root
	"(E|e)picormic",
	"(L|l)ignotuber",
	"rhizome",
	"root",
	# general resprouting: yes, no
	"^resp$|obligate resprouters|resprout location unknown|usually resprout after fire|(^|\\s)R(\\s|$)",
	"non-sprouting"
)

We can now search for the strings defined before and give names to the new traits, depending on whether data is on seed or seedling survival, general survival, or the type of resprouting. We also prepare a matrix to save the new values in, as they will be separated into several new traits.

# search for the strings defined before: one row per entry, one column per
# search string, TRUE where the string matches (NA entries never match)
# the matrix is built explicitly so that its shape does not depend on the number
# of entries; sapply() would return a plain vector for a single entry and a
# list for no entries, which breaks the column handling below
searchResults <- matrix(
	unlist(lapply(searchNames, grepl, oriVals), use.names = FALSE),
	nrow = length(oriVals),
	ncol = length(searchNames)
)

# category labels, grouped by the new trait they belong to; taken together they
# have to be in the same order as searchNames
searchResultsCols <- list(
	c("low seed survival", "high seed survival"),
	c("low seedling survival", "high seedling survival"),
	c(
		"not fire resistant", "low fire resistance",
		"intermediate fire resistance", "high fire resistance"
	),
	c(
		"epicormic resprouting",
		"lignotuberous resprouting", "rhizome resprouting", "root-resprouting"
	),
	c("resprouting", "not resprouting")
)

# name columns of searchResults matrix like corrected searchNames
colnames(searchResults) <- unlist(searchResultsCols)

# prepare matrix to save new values in: one column per group of categories
# plus one extra column that is filled with the numeric values later on
newVals <- matrix(
	NA_character_,
	nrow = length(oriVals),
	ncol = length(searchResultsCols) + 1
)

Let’s have a look at the results.

# number of entries matched by each category
colSums(searchResults)

# number of categories matched by each entry
matchesPerEntry <- rowSums(searchResults)

# original entries without any match, tabulated and sorted by frequency
sort(table(oriVals[matchesPerEntry == 0]))

# number of entries that were not matched to any category
sum(matchesPerEntry == 0)

# number of entries that were matched to more than one category
sum(matchesPerEntry > 1)

There are several exclusive traits here, and we remove ambiguous entries.

# remove contradictory entries
# groups 1, 2, 3 and 5 allow only one category per entry; entries matching more
# than one category of such a group lose all matches within that group
# (group 4, the type of resprouting, may hold several categories at once)
for (i in c(1, 2, 3, 5)) {
	groupCols <- colnames(searchResults) %in% searchResultsCols[[i]]
	# drop = FALSE keeps the selection a matrix even if there is only one entry
	ambiguousRows <- rowSums(searchResults[, groupCols, drop = FALSE]) > 1
	searchResults[ambiguousRows, groupCols] <- FALSE
}

As many specific forms of resprouting are stated, they should also be added to general resprouting.

# consider logical relationships: an entry that names at least one specific
# type of resprouting (group 4) is also flagged as "resprouting" in general
# NOTE(review): this runs after the contradiction check, so an entry flagged
# "not resprouting" that also names a resprouting type keeps both flags —
# confirm this is intended
typeCols <- colnames(searchResults) %in% searchResultsCols[[4]]
# drop = FALSE keeps the selection a matrix even if there is only one entry
hasResproutType <- rowSums(searchResults[, typeCols, drop = FALSE]) > 0
searchResults[hasResproutType, "resprouting"] <- TRUE

We can now write the data into the prepared new results matrix.

# use the searchResults matrix to create new value strings: for every group of
# categories, the labels matched by an entry are concatenated with commas
for (i in seq_along(searchResultsCols)) {
	groupLabels <- searchResultsCols[[i]]
	groupHits <- searchResults[, colnames(searchResults) %in% groupLabels, drop = FALSE]
	# vapply() guarantees one string per entry, also when there are no entries
	newVals[, i] <- vapply(
		seq_len(nrow(groupHits)),
		function(rowIdx) paste(groupLabels[groupHits[rowIdx, ]], collapse = ","),
		character(1)
	)
}
# entries without any match in a group produced an empty string
newVals[newVals == ""] <- NA

Let’s now work on the numerical data. We need to get the original values again, but we exclude those that have already been processed as categorical data. We also round the remaining values to one decimal place.

# extract original data strings
oriVals <- TRYSubset$OrigValueStr # oriVals == original values

# keep only the values of the numeric variables, everything else is set to NA
# NOTE(review): the original filter listed "per_dead" while the trait naming
# step uses "perc_dead"; both spellings are accepted here so that the values
# are kept whichever one the data uses — confirm and drop the wrong one
numericOrigNames <- c(
	"perc_topkill", "perc_dead", "per_dead",
	"Sensitivity: frequent", "Sensitivity: infrequent"
)
oriVals[!(TRYSubset$OriglName %in% numericOrigNames)] <- NA

# round values to one decimal place and store them in the last column
newVals[, 6] <- round(as.numeric(oriVals), 1)

We now write both categorical and numeric data into template variables. We also add some new trait names.

# write data into template
newValsAll <- newUnitsAll <- rep(NA, length(oriVals))
newTraitNamesAll <- rep("", length(oriVals))

# categorical traits: column i of newVals belongs to the i-th trait name
# if an entry has values in several columns, the last one written wins
categoricalTraitNames <- c(
	"Seed fire tolerance",
	"Plant seedling fire tolerance",
	"Plant fire tolerance (categories)",
	"Plant fire resprouting type",
	"Plant fire resprouting strength (categories)"
)
for (i in seq_along(categoricalTraitNames)) {
	hasValue <- !is.na(newVals[, i])
	newValsAll[hasValue] <- newVals[hasValue, i]
	newTraitNamesAll[hasValue] <- categoricalTraitNames[i]
}

# categorical values carry no unit
newUnitsAll[!is.na(newValsAll)] <- ""

# numeric traits: the trait name depends on the original variable name
# NOTE(review): the numeric filter step listed "per_dead" while this step used
# "perc_dead"; both spellings are mapped here so that the mortality values get
# their trait name whichever one the data uses — confirm and drop the wrong one
hasNumeric <- !is.na(newVals[, 6])
newValsAll[hasNumeric] <- newVals[hasNumeric, 6]
numericTraitNames <- c(
	"perc_topkill" = "Plant fire canopy mortality",
	"perc_dead" = "Plant fire mortality",
	"per_dead" = "Plant fire mortality",
	"Sensitivity: frequent" = "Plant fire frequent fire tolerance (numeric)",
	"Sensitivity: infrequent" = "Plant fire infrequent fire tolerance (numeric)"
)
isNamedNumeric <- hasNumeric & TRYSubset$OriglName %in% names(numericTraitNames)
newTraitNamesAll[isNamedNumeric] <-
	unname(numericTraitNames[TRYSubset$OriglName[isNamedNumeric]])

# mortality traits are given in percent
newUnitsAll[grepl("mortality", newTraitNamesAll)] <- "%"

We can now integrate the new values into the TRY data frame.

# integrate into TRY
# the rows of the processed trait are located once, before TraitName itself is
# overwritten, and all three columns are then updated in a single step
fireRows <- which(TRY$TraitName == "Species tolerance to fire")
TRY[fireRows, `:=`(
	CleanedValueStr = newValsAll,
	OrigUnitStr = newUnitsAll,
	TraitName = newTraitNamesAll
)]

Let’s write the data to a file.

# write the complete table to a file named after today's date
outFile <- paste0("TRY_processed_", format(Sys.Date()), ".gz")
fwrite(TRY, file = outFile)