sPlot - A global database on vegetation surveys
sPlot is a huge collection of vegetation surveys. The scope of sPlot is global. This is achieved through collaboration among scientists across the world who share their data with each other. Many of the individual datasets from sPlot are open access, and they have been released as sPlotOpen. This pre-processing script works on sPlotOpen v2.0. It does the following:
add WGS84 sampling locations and TDWG4 zones to each occurrence record
remove typos
If you have questions, suggestions, spot errors, or want to contribute, get in touch with us through planthub@idiv.de.
Author: David Schellenberger Costa
Requirements¶
To run the script, the following is needed:
a world map shapefile of the TDWG4 regions, available here
sPlot data, available here
some R libraries that may need to be installed
Code¶
# load in libraries
library(data.table) # handle large datasets
library(sf) # work with spatial data
# clear workspace
# ls() is called with its default all.names = FALSE, so objects whose names
# start with a dot are not listed and therefore not removed.
# NOTE(review): presumably this is why `.brd`, used further below, is still
# available after this call - confirm where it is defined.
rm(list = ls(all.names = FALSE))
Let’s get the world map with TDWG regions (botanical regions). We will repair a little error in the data.
# move into the folder that holds the TDWG shapefile (adapt this!)
# NOTE(review): `.brd` is not defined in this script; presumably a base
# directory (with trailing slash) set elsewhere - confirm.
setwd(paste0(.brd, "taxonomy/TDWG"))
# load the level-4 TDWG polygons, with EPSG:4326 (WGS84) as their CRS
wm4 <- st_read(dsn = "level4.shp", crs = 4326)
# one region code is misspelled in the shapefile: "AGE-CO" is replaced by "AGE-CD"
wm4$Level4_cod <- sub("AGE-CO", "AGE-CD", wm4$Level4_cod, fixed = TRUE)
Now we need the sPlot data. There are two files, one with the plant occurrences in the plots and one with the plot locations.
# switch to the PlantHub data folder (adapt this!)
setwd(paste0(.brd, "PlantHub"))
# load the two sPlotOpen tables; `file =` states explicitly that the argument
# is a path on disk
splotLoc <- fread(file = "sPlotOpen_3474_52/3474_54_sPlotOpen_header(2).txt") # plot locations
splot <- fread(file = "sPlotOpen_3474_52/3474_52_sPlotOpen_DT(1).txt") # vegetation surveys
We want to add TDWG4 zones to the plot locations. We will first get the TDWG4 zones for coordinates that fall directly into a TDWG4 zone, and then search for the nearest TDWG4 zone for coordinates that do not fall into any zone. This happens because the polygons defining the TDWG4 zones have a fixed resolution and may fail to include plots at coastlines.
# add TDWG4 zone
# turn the plot table into WGS84 point geometries for the spatial lookup
splotCoords <- st_as_sf(splotLoc, coords = c("Longitude", "Latitude"), crs = 4326)
sf_use_s2(FALSE) # necessary for working with planar features
# row index of the TDWG4 polygon (in wm4) that contains each plot ("point
# region"); a point lying on a shared border can intersect several polygons,
# so only the first hit is kept, and points outside every polygon get NA
pr <- st_intersects(splotCoords, wm4)
pr <- vapply(
  pr,
  function(x) if (length(x) > 0L) x[1L] else NA_integer_,
  integer(1L)
)
# closest matches (coastline and data issues): plots that fall into no polygon
# are assigned the nearest one
noMatch <- is.na(pr)
if (any(noMatch)) {
  prTemp <- st_nearest_feature(splotCoords[noMatch, ], wm4)
  pr[noMatch] <- prTemp
}
worldMap4Data <- data.table(wm4) # done for performance reasons
splotLoc[, TDWG4 := worldMap4Data[pr]$Level4_cod]
Now we write the geographical information and ID into the data frame.
# key the plot table by plot ID so that J() performs a keyed lookup
setkey(splotLoc, PlotObservationID)
# look up every occurrence record's plot once and copy latitude, longitude and
# TDWG4 zone into the occurrence table in a single assignment
splot[
  ,
  c("Latitude_WGS84", "Longitude_WGS84", "TDWG4") :=
    splotLoc[J(splot$PlotObservationID), .(Latitude, Longitude, TDWG4)]
]
Changing column names is not necessary, but may be convenient.
# change column names
# Rename exactly the "Species" column to "AccSpeciesName". setnames() works by
# reference (no copy of the table) and, unlike a substring replacement over all
# column names, cannot touch other columns that merely contain "Species".
# It stops with an error if the column is missing, which the following steps
# would need anyway.
setnames(splot, old = "Species", new = "AccSpeciesName")
There are some remaining typos in taxon names and some inconsistencies in hybrid markers that will be removed in the last steps. We also add a genus column and sort the columns. We remove some data on algae and scan the taxon names for remaining (erroneous) special characters.
# correct two misspelled family names
splot[, AccSpeciesName := gsub("Convululaceae", "Convolvulaceae", AccSpeciesName, fixed = TRUE)]
splot[, AccSpeciesName := gsub("Laminaceae", "Lamiaceae", AccSpeciesName, fixed = TRUE)]
# fill in missing accepted names from the original name:
# - empty string: always take the original name
splot[!nzchar(AccSpeciesName), AccSpeciesName := Original_species]
# - NA: take the original name only if it contains no digit
splot[!grepl("\\d", Original_species) & is.na(AccSpeciesName), AccSpeciesName := Original_species]
# show the records that still have no usable name ...
splot[is.na(AccSpeciesName) | !nzchar(AccSpeciesName)]
# ... and drop them
splot <- splot[!(is.na(AccSpeciesName) | AccSpeciesName == "")]
# standardize hybrid signs: every multiplication sign (UTF-8 bytes C3 97)
# becomes " x "; this is done first so that a leading multiplication sign is
# later recognised as a genus hybrid marker
splot[, AccSpeciesName := gsub("\xc3\x97", " x ", AccSpeciesName)]
# remove unnecessary whitespaces (leading/trailing, then runs of whitespace)
splot[, AccSpeciesName := gsub("(^\\s+|\\s+$)", "", AccSpeciesName)]
splot[, AccSpeciesName := gsub("\\s+", " ", AccSpeciesName)]
# repair an error in the data: a Platanus hybrid without epithet gets
# "hispanica"; the pattern is anchored so that names which already carry an
# epithet are left untouched
splot[, AccSpeciesName := sub("^Platanus x$", "Platanus x hispanica", AccSpeciesName)]
# temporarily remove genus hybrid sign; `hybrids` remembers the affected rows
# (no rows are added or dropped before it is used again below)
hybrids <- grepl("^x ", splot$AccSpeciesName)
splot[, AccSpeciesName := sub("^x ", "", AccSpeciesName)]
# make lower- and uppercase plant names normal case: everything lowercase,
# then the first character uppercase
splot[, AccSpeciesName := tolower(AccSpeciesName)]
splot[, AccSpeciesName := paste0(toupper(substr(AccSpeciesName, 1, 1)), sub("^.", "", AccSpeciesName))]
# add plant genus column: everything before the first space
splot[, AccGenus := gsub(" .*", "", AccSpeciesName)]
# add back hybrid sign
splot[hybrids, AccGenus := paste0("x ", AccGenus)]
splot[hybrids, AccSpeciesName := paste0("x ", AccSpeciesName)]
# check for problems in entries, i.e. wrong structure or special characters
# Expected structure: optional "x " genus hybrid prefix, a capitalised genus,
# optionally an epithet (which may be preceded by the hybrid marker "x"),
# optionally followed by a rank abbreviation and an infraspecific epithet.
# The pattern is anchored at both ends so that unexpected trailing characters
# are reported as well. The matching rows are only displayed for inspection;
# nothing is modified here.
splot[
  !grepl("^(x )?[A-Z]+[a-z]+(\\s+(x\\s+)?[a-z'\\-]+(\\s+[a-z]+\\.?\\s+[a-z'\\-]+)?)?$", AccSpeciesName),
  c("AccSpeciesName", "Original_species")
]
# remove entries whose name contains a double quote directly followed by "algae"
splot <- splot[!grepl("\"algae", AccSpeciesName)]
# sort columns: identifier, taxon and location columns go first
mainCols <- c("PlotObservationID", "AccGenus", "AccSpeciesName", "Latitude_WGS84", "Longitude_WGS84", "TDWG4")
# setcolorder() moves the listed columns to the front (by reference) and keeps
# all remaining columns behind them in their current order
setcolorder(splot, mainCols)
Now we can write the pre-processed sPlot data.
# write the processed table into the current working directory; the file name
# carries today's date
# NOTE(review): with fwrite's default compress = "auto" the ".gz" suffix should
# yield gzip-compressed output - confirm for the installed data.table version.
fwrite(x = splot, file = paste0("sPlot_processed_", format(Sys.Date()), ".gz"))