-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathregionalInvertList.R
251 lines (196 loc) · 8.6 KB
/
regionalInvertList.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# Hawaii Invertebrate Regional Species List
# Sources: Bishop Museum, Lifewatch.be, Ocean Biodiversity Information System
# InverteBase, ARMS, Various professionals
# Compiled June 2023
library(xml2)
library(tidyverse)
library(tools)
library(readxl)
source("dataWranglingFuns.R")
# Extract the species column from one Bishop Museum HTML species list.
#
# Bishop Museum stores its species data as HTML files. This function parses
# one such file and passes the parsed document to `extractSpecies()`
# (not defined in this file; presumably provided by dataWranglingFuns.R),
# then keeps only the `species` column of whatever table that returns.
#
# Args:
#   fileName: Path to a Bishop Museum HTML file.
#
# Returns:
#   The output of `extractSpecies()` reduced to its `species` column, i.e. a
#   one-column table, not a bare character vector.
bishopProcess <- function(fileName) {
  file_html <- xml2::read_html(fileName)
  # Pull the species out of the raw HTML and drop every other column
  extractSpecies(file_html) %>%
    dplyr::select(species)
}
#### Process Bishop Files and Create Master Bishop List ####
# Locate the Bishop HTML files relative to the current working directory
# rather than changing into their folder, so every other relative path in
# this script keeps resolving from the same place.
bishopDir <- file.path("Species Lists", "Bishop Processing")
# Names (not paths) of all files in the Bishop folder
file.list <- list.files(bishopDir)
# Apply the bishopProcess function to all files in the directory.
# Iterating over the bare file names keeps the result named by file.
# NOTE(review): the flatten step below assumes sapply() hands back a list of
# character vectors (one per file) - confirm before swapping in lapply()/map().
allBishop <- sapply(
  file.list,
  function(f) bishopProcess(file.path(bishopDir, f))
)
# Flatten the per-file results into a single list of species names
allBishopSp <- purrr::flatten(allBishop)
# Columns of interest handed to wormsProcess() further down.
# dataWranglingFuns.R lists every column that can be requested.
CoI <- c(
  # Name, identifier and status fields
  c("scientificname", "AphiaID", "valid_AphiaID", "valid_name", "status"),
  # Taxonomic hierarchy plus the rank of the record
  c("kingdom", "phylum", "class", "order", "family", "genus", "rank")
)
# Clean the raw Bishop names once, so the exact and fuzzy WoRMS passes below
# are guaranteed to work from the same set of names.
bishopSpeciesCleaned <- allBishopSp %>%
  # Convert List of Species into a Dataframe
  as.data.frame() %>%
  # Spread to long
  pivot_longer(everything()) %>%
  # Rename column and select only that column.
  dplyr::rename(species = value) %>%
  dplyr::select(species) %>%
  # Collapse "sp." / "Sp." / "spp." placeholders and "cf" to genus level id.
  # The dot is escaped so that only the literal abbreviation is removed; an
  # unescaped "." also matched real epithets starting with "sp"
  # (e.g. "Conus spiceri" was turned into "Conusceri").
  # NOTE(review): " cf" is removed without its trailing dot, so "Genus cf. x"
  # becomes "Genus. x" - confirm whether that is the intended result.
  dplyr::mutate(species = str_replace(species, " [sS]pp?\\.| cf", "")) %>%
  # Keep only one of each taxonomic entry
  dplyr::distinct() %>%
  # Clean up list: drop entries that start with "(" or "-"
  dplyr::filter(
    !str_starts(species, "\\("),
    !str_starts(species, "\\-")
  )
# Process list of species from Bishop using WoRMS
bishopListProcessed <- wormsProcess(bishopSpeciesCleaned, CoI)
# Process species that came up invalid in previous function using Fuzzy search.
bishopFuzzyProcessed <- bishopSpeciesCleaned %>%
  # Cross check WoRMS processed list to see which taxa were not found.
  anti_join(bishopListProcessed, by = join_by("species" == "scientificname")) %>%
  # Remove entries that are a part of taxa authority.
  dplyr::filter(!str_ends(species, " and")) %>%
  # Process list using wormsProcess function
  wormsProcess(., CoI, fuzzy = TRUE)
# Stack the exact-search and fuzzy-search WoRMS results into one Bishop table
bishopList <- bind_rows(bishopListProcessed, bishopFuzzyProcessed)
# Keep only the rows whose rank is "Species"
bishopListSpecies <- dplyr::filter(bishopList, rank == "Species")
# Point the working directory at the project folder so the relative
# "Species Lists/..." paths used below resolve.
# NOTE(review): absolute, machine-specific path - only valid where the project
# lives at this location.
setwd("~/Hawaii Invert List")
#### Lifewatch List ####
hiEEZReport <- read.csv("Species Lists/LifewatchList.csv")
# Filter out extinct species (rows with a missing isExtinct flag are kept),
# keep one row per accepted name ID, then drop chordates and species that
# have only a single recorded entry (count of 1).
lifewatchList <- hiEEZReport %>%
  dplyr::filter(isExtinct != 1 | is.na(isExtinct)) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned
  distinct(acceptedNameUsageID, .keep_all = TRUE) %>%
  dplyr::filter(
    phylum != "Chordata",
    count > 1
  )
# Bishop species whose worms_name has no matching acceptedNameUsage in the
# Lifewatch list (i.e. recorded by Bishop but not by Lifewatch)
unmatchedLifewatch <- anti_join(
  bishopListSpecies,
  lifewatchList,
  by = join_by("worms_name" == "acceptedNameUsage")
)
# 817 unmatched species
# Reshape the Lifewatch entries that Bishop does not already cover so their
# columns line up with the Bishop list
lifewatch <- lifewatchList %>%
  # Only Lifewatch names absent from the Bishop species list
  anti_join(bishopListSpecies, by = join_by("acceptedNameUsage" == "worms_name")) %>%
  # Keep the scientificName..genus column range, minus two unneeded columns
  dplyr::select(scientificName:genus) %>%
  dplyr::select(!c(count, scientificNameAuthorship)) %>%
  # Adopt the column names used by the Bishop list
  dplyr::rename(
    scientificname = scientificName,
    AphiaID = aphiaID,
    status = taxonomicStatus,
    valid_AphiaID = acceptedNameUsageID,
    worms_name = acceptedNameUsage
  ) %>%
  # valid_name is worms_name with its first parenthesised part (and the
  # whitespace in front of it) removed
  dplyr::mutate(
    valid_name = stringr::str_replace(worms_name, " \\s*\\([^\\)]+\\)", "")
  )
# Merge the Bishop and Lifewatch lists. No `by` is given, so the join uses
# every column name the two tables share.
lifewatchAndBishop <- bishopListSpecies %>%
  full_join(lifewatch) %>%
  # Keep the first row for each AphiaID
  distinct(AphiaID, .keep_all = TRUE) %>%
  # Rows without an origin are labelled "bishop"; origin is stored as a factor
  mutate(origin = as.factor(if_else(is.na(origin), "bishop", origin)))
#### OBIS List ####
# Drop chordates and empty species entries, keep only the columns of
# interest, and retain one row per scientificname / aphiaid pair.
#
# WILL ERROR UNTIL THE OBIS DATA HAS BEEN DOWNLOADED FROM THE OBIS PORTAL:
# THE FILE IS TOO LARGE TO STORE ON GITHUB
hawaiiOBISinverts <- read.csv("Species Lists/HAWAII_OBIS_DATA.csv") %>%
  # Both conditions depend only on columns that survive the steps below, and
  # scientificname is part of the distinct() key, so filtering up front
  # yields the same rows as filtering afterwards.
  dplyr::filter(phylum != "Chordata", scientificname != "") %>%
  dplyr::select(
    scientificname, originalscientificname, taxonrank, aphiaid,
    kingdom, phylum, class, order, family, genus, basisofrecord, id
  ) %>%
  distinct(scientificname, aphiaid, .keep_all = TRUE)
# Restrict the OBIS table to records whose taxon rank is "Species"
hawaiiOBISinverts <- dplyr::filter(hawaiiOBISinverts, taxonrank == "Species")
# Name-to-id lookup for the OBIS species
OBISidList <- dplyr::select(hawaiiOBISinverts, scientificname, id)
# OBIS species whose name is not a worms_name in the Bishop + Lifewatch list
unmatchedOBIS <- anti_join(
  hawaiiOBISinverts,
  lifewatchAndBishop,
  by = join_by("scientificname" == "worms_name")
)
# Drop records based on preserved specimens, rename columns to match the
# combined list, and discard the two columns that are no longer needed.
unmatchedOBISFilter <- unmatchedOBIS %>%
  filter(basisofrecord != "PreservedSpecimen") %>%
  dplyr::rename(AphiaID = aphiaid, rank = taxonrank) %>%
  select(!c(originalscientificname, basisofrecord))
#### ARMS List ####
armsData <- read_xlsx("Species Lists/ARMS_MarineInverts_ListsGuildFood.xlsx")
# Reduce each ARMS record to a clean "Genus epithet" (or "Genus") name
armsSpecies <- armsData %>%
  select(SCIENTIFICNAME, TROPHIC_GUILD_CODE) %>%
  mutate(
    # First "Genus epithet" match in the raw name, else the bare "Genus"
    species = str_extract(
      SCIENTIFICNAME,
      "[A-Z][a-z]+\\W[a-z]+\\b|[A-Z][a-z]+\\b"
    ),
    # Collapse a trailing " sp" / " spp" placeholder to genus level. The
    # pattern is anchored to the end of the name: an unanchored " sp" also
    # cut into real epithets ("Conus spiceri" became "Conusiceri").
    species = str_replace(species, " spp?$", "")
  ) %>%
  # One row per cleaned name
  distinct(species, .keep_all = TRUE)
# Look the names up in WoRMS (fuzzy search), tag their origin, and bring the
# trophic guild code back in by matching on the cleaned name
armsWoRMS <- armsSpecies %>%
  wormsProcess(., CoI, fuzzy = TRUE) %>%
  mutate(origin = "arms") %>%
  left_join(armsSpecies, by = join_by("scientificname" == "species")) %>%
  select(!SCIENTIFICNAME)
#### InvertBase ####
invertBaseData <- read.csv("Species Lists/InvertBase_Marine Invertebrates of Kaneohe Bay_1685607641.csv")
# Clean the InvertBase names and look them up in WoRMS (fuzzy search)
invertBaseWoRMS <- invertBaseData %>%
  # Skip entries whose name starts with "["
  filter(!str_starts(ScientificName, "\\[")) %>%
  select(ScientificName) %>%
  mutate(
    # First "Genus epithet" match in the raw name, else the bare "Genus"
    species = str_extract(
      ScientificName,
      "[A-Z][a-z]+\\W[a-z]+\\b|[A-Z][a-z]+\\b"
    ),
    # Collapse a trailing " sp" / " spp" placeholder to genus level. The
    # pattern is anchored to the end of the name: an unanchored " sp" also
    # cut into real epithets ("Conus spiceri" became "Conusiceri").
    species = str_replace(species, " spp?$", "")
  ) %>%
  # One row per cleaned name
  distinct(species) %>%
  wormsProcess(., CoI, fuzzy = TRUE) %>%
  mutate(origin = "invertbase")
#### Micromollusc List ####
micromolluscs <- read_xlsx("Species Lists/Micromollusc species list.xlsx")
# Clean the micromollusc names and look them up in WoRMS (fuzzy search)
micromolluscWoRMS <- micromolluscs %>%
  select(Species) %>%
  mutate(
    # First "Genus epithet" match in the raw name, else the bare "Genus"
    Species = str_extract(
      Species,
      "[A-Z][a-z]+\\W[a-z]+\\b|[A-Z][a-z]+\\b"
    ),
    # Collapse a trailing " sp" / " spp" placeholder to genus level. The
    # pattern is anchored to the end of the name: an unanchored " sp" also
    # cut into real epithets ("Conus spiceri" became "Conusiceri").
    Species = str_replace(Species, " spp?$", "")
  ) %>%
  # One row per cleaned name
  distinct(Species, .keep_all = TRUE) %>%
  wormsProcess(., CoI, fuzzy = TRUE) %>%
  mutate(origin = "micromolluscList")
#### Creating full list ####
# Sponge list: its `id` column is renamed to AphiaID
poriferaList <- rename(
  read.csv("Species Lists/poriferaSpecies.csv"),
  AphiaID = id
)
# Cnidarian list: its `species` column is renamed to scientificname
cnidariaList <- rename(
  read.csv("Species Lists/cnidarianSpeciesCSV.csv"),
  scientificname = species
)
# Fold every source list into one table. The lists are full-joined left to
# right in the order given; no `by` is supplied, so each join uses whatever
# column names the two sides share.
fullInvertList <- list(
  lifewatchAndBishop,
  unmatchedOBISFilter,
  OBISidList,
  poriferaList,
  cnidariaList,
  armsWoRMS,
  invertBaseWoRMS,
  micromolluscWoRMS
) %>%
  purrr::reduce(full_join) %>%
  # Keep the first row for each unique Aphia ID
  distinct(AphiaID, .keep_all = TRUE) %>%
  mutate(
    # Record which dataset each entry came from: an existing origin wins,
    # then listID, and anything still missing is labelled "bishop"
    origin = if_else(is.na(origin), listID, origin),
    origin = if_else(is.na(origin), "bishop", origin),
    # Entries without a rank are treated as species
    rank = if_else(is.na(rank), "Species", rank)
  )
#### Write out the full list to a CSV file ####
# Saved relative to the current working directory; write.csv's default of
# including row names is left in place.
utils::write.csv(x = fullInvertList, file = "HawaiiInvertList.csv")