GloNAF - A global database of naturalized and alien flora#

GloNAF is a global database of occurrences of taxa outside their natural range. The GloNAF dataset consists of taxon names together with occurrence information coded as TDWG4-regions, and information on whether the respective taxon is naturalized, i.e. non-naturally occurring and reproducing in the region, or alien, i.e. non-naturally occurring without known reproduction in situ. The pre-processing script does the following:

  • remove typos

  • repair TDWG4 regions. GloNAF comes with a shapefile including a world map and regions corresponding to the distribution data. Unfortunately, there are several issues with the shapefile, impeding its proper use and also the integration of the data into other projects

If you have questions, suggestions, spot errors, or want to contribute, get in touch with us through planthub@idiv.de.

Author: David Schellenberger Costa

Requirements#

To run the script, the following is needed:

  • a world map shapefile of the TDWG4 regions, available here

  • GloNAF data, available here

  • some R libraries that may need to be installed

Code#

# load in libraries
library(data.table) # handle large datasets
library(rgdal) # handle shapefiles
library(raster) # manipulate shapefiles
library(rgeos) # used in raster::aggregate
library(geojsonio) # save modified shapefiles (due to errors in writeOGR from rgdal)

# clear workspace
rm(list = ls())

Let’s get the world map with TDWG regions (botanical regions). Unfortunately, there are some errors in the data, at least in level 4. We will repair this.

# set working directory (adapt this!)
setwd(paste0(.brd, "taxonomy/TDWG"))

wm2 <- readOGR("level2.shp")
wm3 <- readOGR("level3.shp")
wm4 <- readOGR("level4.shp")

# wrong name in Level4_cod
wm4@data$Level4_cod <- sub("AGE-CO", "AGE-CD", wm4@data$Level4_cod)

# wrong separation of a part of Slovakia
slovs <- which(wm4@data$Level4_cod == "CZE-SL")
wm4@polygons[[slovs[1]]] <- raster::merge(wm4[slovs[1], ], wm4[slovs[2], ])@polygons[[1]]
wm4 <- wm4[-slovs[2], ]

Now we need the GloNAF data. There is also one typo here to replace, and we have to define the projection of the TDWG data. It is the same as the one from GloNAF, so we just use that one for TDWG, too.

# set working directory (adapt this!)
setwd(paste0(.brd, "PlantHub"))

# read in GloNAF data
# data on naturalized or exotic taxa with region_id
glo <- fread("GLONAF_257_9_257_2/Taxon_x_List_GloNAF_vanKleunenetal2018Ecologyutf8.csv")
# maps region_id to tdwg4 and OBJIDsic
gloReg <- fread("GLONAF_257_9_257_2/Region_GloNAF_vanKleunenetal2018Ecology.csv")
gloShape <- readOGR("GloNAF_Shapefile_257_9_257_2/regions2.shp") # shapefile with OBJIDsic

# repair error in gloReg
gloReg[tdwg4 == "CZE-SK", tdwg4 := "CZE-SL"]

# set projection of all files to latitude/longitude WGS84
proj4string(wm2) <- proj4string(wm3) <- proj4string(wm4) <- proj4string(gloShape)

gloReg contains some codes that are not TDWG3 and TDWG4. They code for regions that differ in their extent from the standard. To be able to use the data in other contexts, these regions have to be transferred to TDWG.

all(glo$region_id %in% gloReg$region_id)
all(gloReg$tdwg3 %in% wm3@data$LEVEL3_COD)
all(gloReg$tdwg4 %in% wm4@data$Level4_cod)
gloReg[!(gloReg$tdwg3 %in% wm3@data$LEVEL3_COD)] # non-TDWG3 codes
gloReg[!(gloReg$tdwg4 %in% wm4@data$Level4_cod)] # non-TDWG4 codes

We will check for potential TDWG replacements of regions with non-TDWG3 and non-TDWG4 codes.

par(mfrow = c(1, 2))

gloReg[name == "Indonesia", ]
plot(gloShape[gloShape@data$OBJIDsic == 877, ], main = "Indonesia by GloNAF")
plot(wm3[wm3@data$LEVEL2_COD == 42, ], col = rainbow(5), main = "overlapping TDWG regions")
# Indonesia -> Malesia(TDWG2) w/o Philipines(TDWG3), w/o Malaya(TDWG3), w Irian Jaya (TDWG4)

gloReg[name == "Papua New Guinea", ]
plot(gloShape[gloShape@data$OBJIDsic == 353, ], main = "Papua New Guinea by GloNAF")
wm3@data[grepl("New Guinea", wm3@data$LEVEL3_NAM), ]
plot(wm4[wm4@data$Level3_cod == "NWG", ], col = rainbow(5), main = "overlapping TDWG regions")
# Papua New Guinea -> Papua New Guinea(TDWG4) and Bismarck archipelago(TDWG3)

plot(gloShape[gloShape@data$OBJIDsic == gloReg[grepl("Puducherry", tdwg4_name)]$OBJIDsic, ],
	main = "Puducherry by GloNAF"
)
plot(wm4[wm4@data$Level3_cod == "IND", ], col = rainbow(5), main = "overlapping TDWG regions")
q1 <- wm4[wm4@data$Level3_cod == "IND", ] # TDWG4 regions within India(TDWG3)
vapply(seq_along(q1), function(x) q1@polygons[[x]]@area, 1) # sizes of the regions
q2 <- order(vapply(seq_along(q1), function(x) q1@polygons[[x]]@area, 1))[c(1, 2, 6, 7)]
wm4@data[wm4@data$Level3_cod == "IND", ][q2, ]
# Puducherry -> Mahe(TDWG4), Yanam(TDWG4), Karaikal(TDWG4), Pondicherry(TDWG4)

# Andaman and Nicobar -> Andaman Is.(TDWG3), Nicobar Is.(TDWG3)

i <- which(wm3@data$LEVEL3_NAM == "Japan")
num <- gloReg[grepl("Japan", name), ]$OBJIDsic
shapeRow <- which(gloShape@data$OBJIDsic %in% num)
plot(gloShape[shapeRow, ], main = "Japan by GloNAF")
plot(wm3[i, ], main = "Japan by TWDG")
# Japan -> Japan(TDWG3)

# Baja California Peninsula -> Baja California(TDWG4), Baja California Sur(TDWG4)

# New Zealand -> New Zealand(TDWG2), replace by all TDWG3 regions from this TDWG2 zone

# Sao Tome and Principe -> Sao Tome(TDWG4), Principe(TDWG4)

i <- which(wm3@data$LEVEL3_NAM == "Yemen")
num <- gloReg[grepl("Yemen", name), ]$OBJIDsic
shapeRow <- which(gloShape@data$OBJIDsic %in% num)
plot(gloShape[shapeRow, ], main = "Yemen by GloNAF")
plot(wm3[i, ], main = "Yemen by TWDG")
# Yemen -> Yemen(TDWG3)

i <- which(wm3@data$LEVEL3_NAM == "Turkey")
num <- gloReg[grepl("Turkey", name), ]$OBJIDsic
shapeRow <- which(gloShape@data$OBJIDsic %in% num)
plot(gloShape[shapeRow, ], main = "Turkey by GloNAF")
plot(wm3[i, ], main = "Turkey by TWDG")
# Turkey -> Turkey(TDWG3), Turkey-in-Europe(TDWG3)

To find out which region sizes are different between the GloNAF and TDWG regions, we can calculate them and plot them into a file.

pdf("Comparison of region sizes between GloNAF and TDWG.pdf")
par(mfrow = c(2, 2))
for (i in seq_len(nrow(wm3@data))) {
	if (wm3@data$LEVEL3_COD[i] %in% gloReg$tdwg3) {
		shapeRow <- which(gloShape@data$OBJIDsic %in% gloReg[tdwg3 == wm3@data$LEVEL3_COD[i]]$OBJIDsic)
		wm3Size <- 0
		for (j in seq_along(wm3@polygons[[i]]@Polygons)) {
			wm3Size <- wm3Size + wm3@polygons[[i]]@Polygons[[j]]@area
		}
		gloSize <- 0
		for (j in seq_along(shapeRow)) {
			gloSize <- gloSize + sum(vapply(seq_along(gloShape@polygons[[shapeRow[j]]]@Polygons), function(x) {
				gloShape@polygons[[shapeRow[j]]]@Polygons[[x]]@area
			}, 1))
		}
		sizeRatio <- round(gloSize / wm3Size, 2)
		if (sizeRatio > 1.05 || sizeRatio < .95) {
			cols <- rep("#EEEEEE", nrow(gloShape))
			cols[shapeRow] <- "red"
			plot(gloShape, col = cols)
			cols <- rep("#EEEEEE", nrow(wm3))
			cols[i] <- "red"
			plot(wm3, col = cols, main = wm3@data$LEVEL3_NAM[i])
			plot(gloShape[shapeRow, ])
			plot(wm3[i, ])
			print(paste0(wm3@data$LEVEL3_NAM[i], " - ", sizeRatio))
		}
	} else {
		print(paste0(wm3@data$LEVEL3_NAM[i], " not processed."))
	}
}
dev.off()

These regions are considerably smaller than in TDWG (reduced in GloNAF to indicate data (non-)availability):

  • Antarctica

  • East Himalaya

  • Greenland

  • Haiti

  • Magadan

  • Palestine

  • Central European Russia

  • Northwest European Russia

  • West Sibiria

  • Transcaucasus

  • Baltic States

  • Svalbard

  • Yugoslavia

These regions are considerably larger than in TDWG (mostly errors in shape file):

  • Egypt -> Egypt(TDWG3), Sinai(TDWG3)

  • Equatorial Guinea -> Equatorial Guinea(TDWG3), Bioko(TDWG4)

  • Western Sahara (error!)

To map everything to TDWG4, it is necessary to produce a table that maps region_id from glo to correct TDWG4 to replace gloReg. A new region_id will be created for regions not currently recognized by GloNAF. These region_ids need then to be introduced as new lines in glo (that will be the next step).

glo2TDWG <- data.table(
	region_id = gloReg$region_id, name = gloReg$name,
	TDWG2 = gloReg$tdwg2, TDWG3 = gloReg$tdwg3, TDWG4 = gloReg$tdwg4
)

# change codes for the problems identified
codeChange <- list()
changedName <- c(
	"Indonesia", "Andaman and Nicobar", "New Zealand", "Papua New Guinea", "Turkey", "Egypt", "Equatorial Guinea",
	"Baja California Peninsula", "Puducherry", "Sao Tome and Principe", "Yemen", "Japan"
)
for (i in seq_along(changedName)) {
	# select those TDWG4 regions belonging to it
	if (i < 2) {
		q1 <- wm2@data[wm2@data$LEVEL2_NAM == "Malesia", ]$LEVEL2_COD
		wm4Rows <- which(wm4@data$Level2_cod == q1 & !(wm4@data$Level3_cod %in%
			c("PHI", "MLY")) | wm4@data$Level_4_Na == "Irian Jaya") # rows of wm4 used
	} else if (i < 3) {
		wm4Rows <- grep("Andaman|Nicobar", wm4@data$Level_4_Na)
	} else if (i < 4) {
		q1 <- wm2@data[wm2@data$LEVEL2_NAM == "New Zealand", ]$LEVEL2_COD
		wm4Rows <- which(wm4@data$Level2_cod == q1)
	} else if (i < 5) {
		wm4Rows <- which(wm4@data$Level_4_Na %in% c("Papua New Guinea", "Bismarck Archipelago"))
	} else if (i < 6) {
		wm4Rows <- grep("Turkey", wm4@data$Level_4_Na)
	} else if (i < 7) {
		wm4Rows <- grep("Egypt|Sinai", wm4@data$Level_4_Na)
	} else if (i < 8) {
		wm4Rows <- grep("Equatorial Guinea|Bioko", wm4@data$Level_4_Na)
	} else if (i < 9) {
		wm4Rows <- grep("Baja California", wm4@data$Level_4_Na)
	} else if (i < 10) {
		wm4Rows <- which(wm4@data$Level4_cod %in% c("IND-MH", "IND-YA", "IND-KL", "IND-PO"))
	} else if (i < 11) {
		wm4Rows <- which(wm4@data$Level4_cod %in% c("GGI-PR", "GGI-ST"))
	} else if (i < 12) {
		wm4Rows <- which(wm4@data$Level3_cod == "YEM")
	} else {
		wm4Rows <- which(wm4@data$Level3_cod == "JAP")
	}
	# adding rows from wm4 not in glo2TDWG
	q2 <- wm4@data[wm4Rows[!(wm4@data[wm4Rows, ]$Level4_cod %in% glo2TDWG$TDWG4)], ]
	if (nrow(q2) > 0) {
		q2$region_id <- seq_len(nrow(q2)) + max(glo2TDWG$region_id)
		q2 <- data.table(
			region_id = q2$region_id, name = q2$Level_4_Na,
			TDWG2 = q2$Level2_cod, TDWG3 = q2$Level3_cod, TDWG4 = q2$Level4_cod
		)
	}
	q3 <- glo2TDWG[glo2TDWG$TDWG4 %in% wm4@data[wm4Rows, ]$Level4_cod]$region_id
	codeChange[[length(codeChange) + 1]] <-
		list(changedName[i], gloReg[name == changedName[i]]$region_id, c(q3, q2$region_id))
	if (nrow(q2) > 0) glo2TDWG <- rbind(glo2TDWG, q2)
}

From here on, no comparison with the original data is possible anymore because of added and removed lines in glo!

# add lines in glo with new region_id
for (i in seq_along(codeChange)) {
	q1 <- glo[region_id == codeChange[[i]][[2]]]
	for (j in seq_along(codeChange[[i]][[3]])) {
		q2 <- q1
		q2[, region_id := codeChange[[i]][[3]][j]]
		glo <- rbind(glo, q2)
	}
}

# remove replaced regions from glo and gloReg to avoid their plotting
glo <- glo[!(region_id %in% unlist(sapply(codeChange, function(x) setdiff(x[[2]], x[[3]])))), ]
glo2TDWG <- glo2TDWG[!(region_id %in% unlist(sapply(codeChange, function(x) setdiff(x[[2]], x[[3]])))), ]

We have now repaired the GloNAF data to be plotted as accurately as possible using standard TDWG regions. However, we may create a custom TDWG4 shapefile from the original TDWG4 shapefile where regions considerably smaller in GloNAF are also smaller by replacing those with the regions from the original GloNAF shapefile.

wm4Glo <- wm4

The following already refer to only some, but not all parts of their corresponding TDWG3 regions in gloReg, and the size of the parts corresponds to their size in GloNAF. Therefore, no changes need to be done.

  • East Himalaya -> EHM-AP and EHM-SI

  • Baltic States -> BLT-ES, BLT-LA, BLT-LI

  • Yugoslavia -> YUG-CR, YUG-MA, YUG-SL

Haiti only consists of the second- and third-biggest islands in GloNAF, so we remove the main island and two small islands.

wm4Glo@polygons[[which(wm4@data$Level4_cod == "HAI-HA")]]@Polygons <-
	wm4Glo@polygons[[which(wm4@data$Level4_cod == "HAI-HA")]]@Polygons[c(3, 5)]
q1 <- wm4Glo@polygons[[which(wm4@data$Level4_cod == "HAI-HA")]]@plotOrder[c(3, 5)]
q1 <- as.integer(order(q1))
wm4Glo@polygons[[which(wm4@data$Level4_cod == "HAI-HA")]]@plotOrder <- q1
wm4Glo@polygons[[which(wm4@data$Level4_cod == "HAI-HA")]]@area <-
	sum(vapply(seq_along(wm4Glo@polygons[[which(wm4@data$Level4_cod == "HAI-HA")]]@Polygons), function(x) {
		wm4Glo@polygons[[which(wm4@data$Level4_cod == "HAI-HA")]]@Polygons[[x]]@area
	}, 1))

Georgia includes two other subregions in GloNAF, therefore, lines have to be added to glo, and the mapping has to be added in glo2TDWG.

Transcaucasus -> TCS-AB, TCS-AD, TCS-AR, TCS-GR (TCS-AB, TCS-AD to be added to TCS-GR)

# plot region
plot(gloShape[gloShape@data$OBJIDsic %in% c(858, 26), ], main = "Transcaucasus by GloNAF")
plot(wm4[wm4@data$Level3_cod == "TCS", ], col = rainbow(5), main = "Transcaucasus")
# lines with region_id 349 have to be triplicated and new region_ids to be created in glo2TDWG
gloReg[OBJIDsic %in% c(858, 26)]

# add mapping to glo2TDWG
temp <- data.table(cbind(max(glo2TDWG$region_id) + c(1:2), wm4@data[which(wm4@data$Level4_cod %in%
	c("TCS-AB", "TCS-AD")), c("Level_4_Na", "Level2_cod", "Level3_cod", "Level4_cod")]))
colnames(temp) <- colnames(glo2TDWG)
glo2TDWG <- rbind(glo2TDWG, temp)
newIDs <- temp$region_id
# triplicate rows in glo
for (i in newIDs) {
	temp <- glo[region_id == 349]
	temp$region_id <- i
	glo <- rbind(glo, temp)
}

The remaining regions have to be replaced by the regions from GloNAF, as they are considerably smaller in GloNAF than in TDWG, and the smaller parts do not represent individual TDWG4 regions, but are parts of TDWG4 regions, defined specifically by GloNAF. Note that Israel appears instead of Palestine here, because only the Israel part (TWDG4) of the Palestine (TWDG3) region is used in GloNAF. in the case of other TDWG3 regions (e.g. Antarctica), TDWG3 equals TDWG4.

We now search for the respective names in the gloReg twdg4_name column (several regions may be returned) and replace the respective TDWG4 region from TDWG by the smaller region from GloNAF

shrinkedShapes <- c(
	"Antarctica", "Greenland", "Magadan", "Israel", "Central European Russia",
	"Northwest European Russia", "West Siberia", "Svalbard"
)
for (i in seq_along(shrinkedShapes)) {
	q1 <- which(gloShape@data$OBJIDsic %in% gloReg[name == shrinkedShapes[i] | tdwg4_name == shrinkedShapes[i]]$OBJIDsic)
	tempID <- wm4Glo@polygons[[which(wm4Glo@data$Level_4_Na == shrinkedShapes[i])]]@ID
	if (length(q1) > 1) {
		q2 <- raster::aggregate(gloShape[which(gloShape@data$OBJIDsic %in%
			gloReg[name == shrinkedShapes[i] | tdwg4_name == shrinkedShapes[i]]$OBJIDsic), ])
		wm4Glo@polygons[[which(wm4Glo@data$Level_4_Na == shrinkedShapes[i])]] <- q2@polygons[[1]]
	} else {
		wm4Glo@polygons[[which(wm4Glo@data$Level_4_Na == shrinkedShapes[i])]] <-
			gloShape@polygons[[which(gloShape@data$OBJIDsic == gloReg[name == shrinkedShapes[i] | tdwg4_name == shrinkedShapes[i]]$OBJIDsic)]]
	}
	wm4Glo@polygons[[which(wm4Glo@data$Level_4_Na == shrinkedShapes[i])]]@ID <- tempID
}

# remove shapes without data from wm4Glo, as no distribution data has been recorded in GloNAF for these regions
q1 <- which(wm4Glo@data$Level4_cod %in% glo2TDWG$TDWG4)
wm4Glo <- wm4Glo[q1, ]

Comparing the resolution of GloNAF and TDWG4, we find that Australia has a finer resolution in glo than that provided by TDWG4. The same is treu for South America.

pdf("Comparison of resolution of GloNAF and TDWG4.pdf")
par(mfrow = c(2, 1))
par(mar = c(1, 1, 1, 1))
plot(wm4Glo, col = "grey", main = "TDWG4 GloNAF")
plot(gloShape, col = "grey", main = "Original GloNAF")
plot(wm4[wm4@data$Level2_cod == 50, ], col = rainbow(5))
plot(gloShape[gloShape@data$OBJIDsic %in% gloReg[tdwg2 == 50]$OBJIDsic, ], col = rainbow(5))
plot(wm4[wm4@data$Level1_cod == 8, ], col = rainbow(5))
plot(gloShape[gloShape@data$OBJIDsic %in% gloReg[tdwg1 == 8]$OBJIDsic, ], col = rainbow(5))
dev.off()

Changing column names is not necessary, but may be convenient.

colnames(glo) <- gsub("standardized_name", "AccSpeciesName", colnames(glo))
colnames(glo) <- gsub("author", "AccAuthorName", colnames(glo))
colnames(glo) <- gsub("^status", "GloNAF_status", colnames(glo))

There is now some redundancy in glo, as some TDWG4 regions in glo2TDWG are referenced by several of the smaller regions in GloNAF. We remove these lines that have become redundant and change the link from region_id to TDWG4. We also add a genus column and sort the columns.

glo[!(region_id %in% glo2TDWG$region_id)]
setkey(glo2TDWG, region_id)
glo[, TDWG4 := glo2TDWG[J(glo$region_id)]$TDWG4] # new link directly to (modified) TDWG4
glo[, region_id := NULL] # not needed anymore
# remove duplicates determined by comparing the given columns
glo <- unique(glo, by = c("AccSpeciesName", "AccAuthorName", "GloNAF_status", "TDWG4"))

# repair an error in the data
glo[, AccSpeciesName := sub("Brachyglottis x", "Brachyglottis", AccSpeciesName)]

# add plant genus column
glo[, AccGenus := gsub(" .*", "", AccSpeciesName)]

# sort columns
mainCols <- c("AccGenus", "AccSpeciesName", "TDWG4")
setcolorder(glo, c(mainCols, setdiff(colnames(glo), mainCols)))

Now we can write the pre-processed GloNAF data.

fwrite(glo, file = paste0("GloNAF_processed_", Sys.Date(), ".gz"))

Before saving the TDWG4 and GloNAF_TDWG4 shapefiles, we check that IDs in the shapefile data and polygons are consistent. We also remove column names in the coords slots of the polygons.

all(rownames(wm4Glo@data) == vapply(seq_along(wm4Glo@polygons), function(x) {
	wm4Glo@polygons[[x]]@ID
}, "1")) # this should be true
q1 <- table(vapply(seq_along(wm4Glo@polygons), function(x) wm4Glo@polygons[[x]]@ID, "1"))
q1[q1 > 1] # this should have length 0
# remove column names in coords slot in polygons
for (i in seq_along(wm4Glo@polygons)) {
	for (j in seq_along(wm4Glo@polygons[[i]]@Polygons)) {
		if ("x" %in% colnames(wm4Glo@polygons[[i]]@Polygons[[j]]@coords)) {
			colnames(wm4Glo@polygons[[i]]@Polygons[[j]]@coords) <- NULL
		}
	}
}

We can now write two shapefiles. One is the TDWG4 shapefile with the minor errors repaired in the beginning, and the other the GloNAF_TDWG4 shapefile that has polygons from the GloNAF shapefile where regions were considerably smaller than in TDWG4, but all errors from the GloNAF shapefile removed.

geojsonio::geojson_write(wm4Glo,
	file = paste0("TDWG4_GloNAF_processed_", Sys.Date(), ".geojson"),
	geometry = "polygon", precision = 5
)
geojsonio::geojson_write(wm4,
	file = paste0("TDWG4_processed_", Sys.Date(), ".geojson"),
	geometry = "polygon", precision = 5
)