Prepare Wood growth ring distinction data from TRY for use
The Wood growth ring distinction data from TRY indicates how easily growth rings can be distinguished. In addition, it contains data on the type of growth rings (wood porosity).
If you intend to clean more than one or two traits, we recommend the use of the batch pre-processing script. Refer to the TRY main page for details.
If you have questions, suggestions, spot errors, or want to contribute, get in touch with us through planthub@idiv.de.
Author: David Schellenberger Costa
Requirements
To run the script, the following is needed:
TRY data, available here
the data.table library may need to be installed
Code
# load in libraries
library(data.table) # handle large datasets
# clear workspace
# note: ls() does not list objects whose names start with a dot by default, so
# such objects (e.g. `.brd`, which is used below to build the working directory
# path) are not removed by this call
rm(list = ls())
Let’s get the TRY data
# switch to the project folder (adapt this!)
projectDir <- paste0(.brd, "PlantHub")
setwd(projectDir)
# load the TRY export (adapt this!)
TRY <- fread("TRY_PlantHub.gz")
# keep only the records of the trait handled in this script
TRYSubset <- TRY[TraitName %in% "Wood growth ring distinction"]
To get an overview of the data, we convert values to lowercase, sort them, and show them as a table.
# extract original data strings
oriVals <- TRYSubset$OrigValueStr # oriVals == original values
# change all to lowercase to ease later classification
oriVals <- tolower(oriVals)
# get an overview over the data by summarizing values and showing them in alphabetical order
# (the table is ordered by its value names; ordering by the table itself would
# sort the entries by their frequency instead)
valueOverview <- table(oriVals)
valueOverview[order(names(valueOverview))]
Some decoding is necessary here. As the data is coded in a particular way, we will need to add some extra rows to the original dataset and re-create the oriVals variable.
# decode coded entries
# cross-tabulate all values containing digits against their original trait names
hasDigit <- grepl("\\d", oriVals)
table(TRYSubset$OrigValueStr[hasDigit], TRYSubset$OriglName[hasDigit])
# numbers from 0 to 1 indicate relative occurrence for diffuse/semi-ring/ring-porous wood
# as these are no single measurements, it seems ok to duplicate/triplicate measurements for
# porosity for the respective entries (with 0.33/0.5/0.66 values)
# the dots are escaped so that only the literal values 0.33/0.5/0.66 are matched
probIDs <- unique(TRYSubset$ObservationID[grepl("0\\.33|0\\.5|0\\.66", oriVals)])

# Build the replacement rows for one observation.
# obsID: a single ObservationID with fractional porosity values.
# Returns copies of the first row of that observation, one per decoded value,
# with OrigValueStr replaced by the original trait name (OriglName):
#   - every 0.5 entry contributes its name once
#   - otherwise a 0.33 entry contributes its name once, a 0.66 entry twice
expandObservation <- function(obsID) {
  focLines <- TRYSubset[ObservationID == obsID]
  isHalf <- grepl("0\\.5", focLines$OrigValueStr)
  if (any(isHalf)) {
    decodedNames <- focLines$OriglName[isHalf]
  } else {
    decodedNames <- c(
      focLines$OriglName[grepl("0\\.33", focLines$OrigValueStr)],
      rep(focLines$OriglName[grepl("0\\.66", focLines$OrigValueStr)], 2)
    )
  }
  expanded <- focLines[rep(1, length(decodedNames))]
  expanded[, OrigValueStr := decodedNames]
  expanded
}

# bind all replacement rows in one go instead of growing a table inside a loop;
# the empty template keeps the column layout when there is nothing to decode
newLines <- rbindlist(c(list(TRYSubset[0]), lapply(probIDs, expandObservation)))
TRYSubset <- TRYSubset[!(ObservationID %in% probIDs)]
TRYSubset <- rbind(TRYSubset, newLines)
# 0/1 coded porosity entries: 0 carries no information, 1 means the named type applies
TRYSubset[OrigValueStr == "0" & grepl("porosity", OriglName), OrigValueStr := ""]
TRYSubset[OrigValueStr == "1" & grepl("porosity", OriglName), OrigValueStr := OriglName]
# re-create oriVals
oriVals <- TRYSubset$OrigValueStr # oriVals == original values
oriVals <- tolower(oriVals)
# decode coded entries
# "yes" entries are replaced by the original trait name; the same mask is used
# on both sides so values stay aligned regardless of the original letter case
# (and NA values cannot break the assignment)
isYes <- oriVals %in% "yes"
oriVals[isYes] <- TRYSubset$OriglName[isYes]
# letters a to e are translated into the numbers 1 to 5
letterCode <- match(TRYSubset$OrigValueStr, letters[1:5])
oriVals[!is.na(letterCode)] <- letterCode[!is.na(letterCode)]
The most important part of the cleaning process is the definition of the search strings to look for. We use regular expressions in some cases to be more inclusive (or exclusive).
# regular expressions identifying each category; the order must correspond to
# the category names assigned to the search results later on
# growth ring visibility (classes 1 to 5)
visibilityPatterns <- c(
  "1|Growth rings absent|Only one ring",
  "2",
  "3",
  "4",
  "5|Growth rings distinct and recognizable"
)
# growth ring type
ringTypePatterns <- c(
  "vesselless|non porous",
  "diff?use",
  "semi(( |-|\\.)?ring)?",
  "ring"
)
# create a vector containing the search strings to look for
searchNames <- c(visibilityPatterns, ringTypePatterns)
We can now search for the strings defined before and give names to the new categories. As the trait will be split, we also create a matrix for the new values.
# search for the strings defined before
# vapply() plus an explicit matrix() always yields a logical matrix with one row
# per value and one column per search string, also for zero or one value
# (sapply() would return a list or a plain vector in these cases)
searchHits <- vapply(
  searchNames, grepl, logical(length(oriVals)),
  x = oriVals, ignore.case = TRUE, USE.NAMES = FALSE
)
searchResults <- matrix(searchHits, nrow = length(oriVals), ncol = length(searchNames))
# name columns of searchResults matrix like corrected searchNames
# list element 1: growth ring visibility classes, element 2: growth ring types
searchResultsCols <- list()
searchResultsCols[[1]] <- c(1, 2, 3, 4, 5)
searchResultsCols[[2]] <- c("vesselless", "diffuse", "semi-ring", "ring")
# every search string needs exactly one category name
stopifnot(length(unlist(searchResultsCols)) == length(searchNames))
colnames(searchResults) <- unlist(searchResultsCols)
# prepare matrix to save new values in (one column per new trait)
newVals <- matrix(NA, length(oriVals), length(searchResultsCols))
Let’s have a look at the results.
# number of matches per category
colSums(searchResults)
# number of categories each entry was matched to
matchesPerEntry <- rowSums(searchResults)
isUnmatched <- matchesPerEntry < 1
# original entries without any match, sorted by how often they occur
sort(table(oriVals[isUnmatched]))
# number of entries that weren't matched to any category
sum(isUnmatched)
# number of entries that were matched to more than one category
sum(matchesPerEntry > 1)
As the categories defined have exclusive values, we remove ambiguous entries.
# "ring" is a substring of every "semi-ring" spelling, so each semi-ring entry
# also matches the "ring" category and would be discarded as contradictory
# below; only keep a "ring" match if "ring" still occurs after all semi-ring
# spellings have been taken out of the string
withoutSemiRing <- gsub("semi( |-|\\.)?ring", "", oriVals, ignore.case = TRUE)
searchResults[, "ring"] <- searchResults[, "ring"] &
  grepl("ring", withoutSemiRing, ignore.case = TRUE)
# remove contradictory entries: within each group of mutually exclusive
# categories, entries matching more than one category lose all their matches
for (i in seq_along(searchResultsCols)) {
  groupCols <- colnames(searchResults) %in% searchResultsCols[[i]]
  isAmbiguous <- rowSums(searchResults[, groupCols, drop = FALSE]) > 1
  searchResults[isAmbiguous, groupCols] <- FALSE
}
# the text alternatives of class 1 ("Growth rings absent", "Only one ring")
# contain "ring" themselves, so their growth ring type matches are spurious
# NOTE(review): the text alternative of class 5 ("Growth rings distinct and
# recognizable") contains "ring" as well and is not handled here - confirm
# whether such entries should also lose their growth ring type match
searchResults[
  searchResults[, "1"],
  grepl("\\D", colnames(searchResults))
] <- FALSE
We can now use the cleaned results data to create a new data vector.
# use the searchResults matrix to create new value strings by concatenating all data found
# one column of newVals is filled per group of categories; vapply() guarantees
# a character vector with one element per row, also if there are no rows
for (i in seq_along(searchResultsCols)) {
  groupLabels <- searchResultsCols[[i]]
  groupHits <- searchResults[, colnames(searchResults) %in% groupLabels, drop = FALSE]
  newVals[, i] <- vapply(seq_len(nrow(groupHits)), function(x) {
    paste(groupLabels[groupHits[x, ]], collapse = ",")
  }, character(1))
}
# entries without any match in a group end up as empty strings; mark them as missing
newVals[newVals == ""] <- NA
Let’s transfer the data into the original data frame. We first remove the original data, as we added some extra rows.
# remove original data because it has other row number
TRY <- TRY[TraitName != "Wood growth ring distinction"]
# move values to other traits
# column 2 of newVals holds the growth ring types (vesselless/diffuse/semi-ring/ring),
# which belong to the wood porosity trait; the appended rows keep the order of
# TRYSubset, so the values line up row by row
TRY <- rbind(TRY, TRYSubset, fill = TRUE)
TRY[TraitName == "Wood growth ring distinction", CleanedValueStr := newVals[, 2]]
TRY[TraitName == "Wood growth ring distinction", TraitName := "gotoWood porosity type"]
# integrate into TRY
# column 1 of newVals holds the growth ring visibility classes (1 to 5), which
# stay with the original trait
TRY <- rbind(TRY, TRYSubset, fill = TRUE)
TRY[TraitName == "Wood growth ring distinction", CleanedValueStr := newVals[, 1]]
As we duplicated the data to accommodate the data belonging to other traits, to avoid an unnecessary increase in file size, we remove the rows of the duplicated data without values in the “CleanedValueStr” column.
# drop rows that carry the "goto" prefix but did not receive a cleaned value
TRY <- TRY[!(grepl("^goto", TraitName) & is.na(CleanedValueStr))]
We have used an existing trait name with the prefix “goto” to classify some data. This was done to eventually move the data to the respective trait, but avoid another round of pre-processing. So only run the following line if this is the last of various pre-processing scripts you want to use.
# strip the leading "goto" marker from the trait names that carry it
TRY[grepl("^goto", TraitName), TraitName := sub("^goto", "", TraitName)]
Let’s write the data to a file.
# write data to a file in the working directory, stamped with today's date
outFile <- paste0("TRY_processed_", Sys.Date(), ".gz")
fwrite(TRY, file = outFile)