# Install the tutorial's packages only when they are not already available.
# Calling install.packages() unconditionally would download and reinstall
# both packages (and require network access) every time this script is run.
if (!requireNamespace('lubridate', quietly = TRUE)) {
  install.packages('lubridate')
}
if (!requireNamespace('stringr', quietly = TRUE)) {
  install.packages('stringr')
}
# Attach the packages used below: lubridate for the date examples and
# stringr for the str_extract() example.
library(lubridate)
library(stringr)
# Note:
# See ?strptime() for more on dates and times
regex tutorial: http://regexone.com/lesson/0
regex testing: https://regex101.com/#python
Go to regex101.com (the link above) and copy and paste the following into the test string box.
Citations
Associations between fossil diversity and environmental conditions are frequently assessed using globally averaged environmental proxies (Peters & Foote, 2002; Mayhew et al., 2008; Hannisdal & Peters, 2011; Peters et al., 2013)
As nations struggle to support marine protected areas (MPAs) that cross international borders, the maintenance of overall range size, and connectivity among ranges, will only increase in importance (Wells & Day, 2004; Moffitt et al., 2011)
Challenge strings - get the latitude and longitude
ARCTIC_OCEAN_DRIFTING_STATION_SEVERNYI_POLYUS_19_Nansen_closing_net_333_um_mesh_3090_85.9_-47.0333
BARENTS_SEA_PERSEY_Nansen_surface_net_333_um_272_71.9333_33.9167
_DRIFTING_STATION_SEVERNYI_POLUS_5_Nansen_closing_net_333_um_mesh_4372_86.0167_102.433
POLAR_BASIN_DRIFTING_STATION_SEVERNYI_POLUS_2_Nansen_closing_net_333_um_mesh_3464_80.9_-161.533
Slides for today:
Dates and 'datetimes' can cause a lot of hassle for something seemingly simple.
library(lubridate)
# Let's look at some date/datetime objects.
# Side note, you'll notice the use of the ::, this is a way to be explicit about
# the package from which you are calling a function. You may have noticed this
# already in the data wrangling cheatsheet.
# A quote from Hadley's R packages book: "The best practice is to explicitly
# refer to external functions using the syntax package::function(). This makes
# it very easy to identify which functions live outside of your package. This
# is especially useful when you read your code in the future."
# http://r-pkgs.had.co.nz/description.html
# I use it now to make it clear for you that now() is NOT a base R function.
# I will also mention that this is how you can specify functions that have been
# 'masked' by libraries that you load (e.g., the dplyr start up messages that
# you see when you load dplyr).
# Current date-time, called with an explicit namespace prefix.
lubridate::now()
## [1] "2015-11-19 09:20:49 EST"
# Current date, again fully qualified.
lubridate::today()
## [1] "2015-11-19"
# The date can be combined with paste() to timestamp exported files,
# which lets you keep track of the most recent file output.
lubridate::today()
## [1] "2015-11-19"
# The structure shows that the result is a Date, not a character string.
str(lubridate::today())
## Date[1:1], format: "2015-11-19"
Time to play with dates of different formats
# The same calendar date written in four common layouts.
date1 <- '20010102'
date2 <- '2001.01.02'
date3 <- '2001/01/02'
date4 <- '2001-01-02'
# ...and one that mixes separators.
date5 <- '2001..01/02'
# str() shows that date1 is a plain character string, not a date.
str(object = date1)
## chr "20010102"
# Convert it by spelling out the layout: 4-digit year, month, day.
as.Date(date1, format = '%Y%m%d')
## [1] "2001-01-02"
# POSIX awareness moment: a Date converts to a plain number. In the output
# below that number is the count of days since the origin 1970-01-01
# (16758 days on 2015-11-19).
as.numeric(lubridate::today())
## [1] 16758
posix_time <- as.numeric(lubridate::today())
posix_time
## [1] 16758
# lubridate exports that origin as a ready-made object.
lubridate::origin
## [1] "1970-01-01 UTC"
# Turning the number back into a Date needs the origin, given as a string ...
as.Date(x = posix_time, origin = '1970-01-01')
## [1] "2015-11-19"
# ... or as lubridate's origin object.
as.Date(x = posix_time, origin = lubridate::origin)
## [1] "2015-11-19"
# Lubridate magic: ymd() parses year-month-day strings written with a
# variety of separators and month spellings.
date_strings <- c(
  '20010102', '2001.01.02', '2001..01..02', '2001_01_02',
  '01/01/02', '2001, January, 2', '2001,_January, 2'
)
date_dates <- lubridate::ymd(date_strings)
# Pull the individual components back out of the parsed dates.
lubridate::year(date_dates)
## [1] 2001 2001 2001 2001 2001 2001 2001
lubridate::month(date_dates)
## [1] 1 1 1 1 1 1 1
# label = TRUE returns the month as an ordered factor of abbreviated names.
lubridate::month(date_dates, label = TRUE)
## [1] Jan Jan Jan Jan Jan Jan Jan
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
# abbr = FALSE switches the labels to full month names.
lubridate::month(date_dates, label = TRUE, abbr = FALSE)
## [1] January January January January January January January
## 12 Levels: January < February < March < April < May < June < ... < December
lubridate::day(date_dates)
## [1] 2 2 2 2 2 2 2
What are regular expressions??
a language for pattern matching in strings
it’s a way for you to do really, really specific string matching - find and/or replace
literals + metacharacters
literals
exactly what they are
metacharacters
things that change how you interpret the literals (these characters do something other than what they are)
e.g: ?, *, [], ^, $, (), \ , .
grep - find a pattern in a character vector
# A small character vector of Mickey Mouse name variants to search.
mms <- c('M. Mouse', 'm. mouse', 'Mickey Mouse', 'mick. mouse')
# grep() returns the positions of the elements that contain the pattern.
grep('Mouse', mms)
## [1] 1 3
# value = TRUE returns the matching elements themselves.
grep('Mouse', mms, value = TRUE)
## [1] "M. Mouse" "Mickey Mouse"
# ignore.case = TRUE makes the match case-insensitive (positions) ...
grep('Mouse', mms, ignore.case = TRUE)
## [1] 1 2 3 4
# ... and the same search returning the matched strings.
grep('Mouse', mms, ignore.case = TRUE, value = TRUE)
## [1] "M. Mouse" "m. mouse" "Mickey Mouse" "mick. mouse"
# gsub() replaces every occurrence of the pattern.
gsub('Mickey', 'Minnie', mms)
## [1] "M. Mouse" "m. mouse" "Minnie Mouse" "mick. mouse"
# The OR operator '|' matches either of two alternatives.
gsub('Mickey|mick', 'Minnie', mms)
## [1] "M. Mouse" "m. mouse" "Minnie Mouse" "Minnie. mouse"
# Introduction to regular expressions
# grepl() is the variant that returns a logical vector:
#grepl(pattern = 'Mouse', x = mms)
gsub - find and replace
a, Z, 5, @
[aeiou]
some set of characters
[^aeiou]
…or not those characters!
[a-zA-Z]
…or a range
[[:alpha:]]
…or a POSIX character class.
^stuff
at start of line
stuff$
at end of line
\<stuff\>
as its own word
?
zero or one
*
zero or more (any)
+
one or more (at least one)
{n}
exactly n
{,n}
no more than n
{n,}
at least n
foo|bar
…one of two possible patterns
- What does a+ match?
- is it the literal {a+}?
- or is it {a, aa, aaa, …}?
- answer: it depends!
regex101
Citations \([^(MPAs)].*\)
challenge strings (-?.*_-?.*)
stringr
regular expressions
library(dplyr)
library(stringr)
ll_regex <- '(-?\\d*.\\d*(_)-?\\d*.\\d*)$'
extracted <- str_extract(ts$SampleID, ll_regex)
lat_long <- str_split_fixed(extracted, "_", 2)
#lat_long
ts$Lat <- lat_long[, 1]
ts$Lon <- lat_long[, 2]
write.csv(ts, 'TSforCIEE_latlong.csv')
example string:
c('Barnstable County', 'Berkshire County', 'Bristol County', 'Dukes County', 'Essex County')
# Sample IDs from the latitude/longitude challenge; each one ends with a
# "<lat>_<lon>" pair.
strs <- c(
  'ARCTIC_OCEAN_DRIFTING_STATION_SEVERNYI_POLYUS_19_Nansen_closing_net_333_um_mesh_3090_85.9_-47.0333',
  'BARENTS_SEA_PERSEY_Nansen_surface_net_333_um_272_71.9333_33.9167',
  '_DRIFTING_STATION_SEVERNYI_POLUS_5_Nansen_closing_net_333_um_mesh_4372_86.0167_102.433',
  'POLAR_BASIN_DRIFTING_STATION_SEVERNYI_POLUS_2_Nansen_closing_net_333_um_mesh_3464_80.9_-161.533'
)
# Pull out the first run of two decimal numbers joined by an underscore,
# each with an optional leading minus sign.
stringr::str_extract(string = strs, pattern = '(-?\\d*\\.\\d*_-?\\d*\\.\\d*)')
## [1] "85.9_-47.0333" "71.9333_33.9167" "86.0167_102.433" "80.9_-161.533"