Checkout the etherpad for today https://etherpad.wikimedia.org/p/mogBNf6uEm
Install/load the following packages:
library(tidyr)
library(dplyr)
library(gapminder)
gap <-
filter(gapminder, grepl("^N", country)) %>%
filter(year %in% c(1952, 1977, 2007)) %>%
slice(1:9) %>%
select(-continent, -gdpPercap, -pop) %>%
mutate(lifeExp = round(lifeExp)) %>%
spread(key = year, value = lifeExp)
gap
## country 1952 1977 2007
## 1 Namibia 42 56 53
## 2 Nepal 36 47 64
## 3 Netherlands 72 75 80
gap_with_cont <-
filter(gapminder, grepl("^N", country)) %>%
filter(year %in% c(1952, 1977, 2007)) %>%
slice(1:9) %>%
select(-gdpPercap, -pop) %>%
mutate(lifeExp = round(lifeExp)) %>%
spread(key = year, value = lifeExp)
gap_with_cont
## country continent 1952 1977 2007
## 1 Namibia Africa 42 56 53
## 2 Nepal Asia 36 47 64
## 3 Netherlands Europe 72 75 80
Using gather() - class demo
tidied_gap <- gather(data = gap, key = year, value = lifeExp, -country)
tidied_gap
## country year lifeExp
## 1 Namibia 1952 42
## 2 Nepal 1952 36
## 3 Netherlands 1952 72
## 4 Namibia 1977 56
## 5 Nepal 1977 47
## 6 Netherlands 1977 75
## 7 Namibia 2007 53
## 8 Nepal 2007 64
## 9 Netherlands 2007 80
tidied_gap_with_cont <- gather(data = gap_with_cont, key = year, value = lifeExp, 3:5)
tidied_gap_with_cont
## country continent year lifeExp
## 1 Namibia Africa 1952 42
## 2 Nepal Asia 1952 36
## 3 Netherlands Europe 1952 72
## 4 Namibia Africa 1977 56
## 5 Nepal Asia 1977 47
## 6 Netherlands Europe 1977 75
## 7 Namibia Africa 2007 53
## 8 Nepal Asia 2007 64
## 9 Netherlands Europe 2007 80
# Two ways that you can tell gather() which columns to gather.
tidied_gap_with_cont <- gather(data = gap_with_cont, key = year, value = lifeExp, -country, -continent)
tidied_gap_with_cont
## country continent year lifeExp
## 1 Namibia Africa 1952 42
## 2 Nepal Asia 1952 36
## 3 Netherlands Europe 1952 72
## 4 Namibia Africa 1977 56
## 5 Nepal Asia 1977 47
## 6 Netherlands Europe 1977 75
## 7 Namibia Africa 2007 53
## 8 Nepal Asia 2007 64
## 9 Netherlands Europe 2007 80
# key is the name of the column that contains the values that were across the
# top of your data.frame. Value is the name of the values that populated all of
# the cells that you gathered.
Using spread() - class demo
# Remind ourselve what the data.frame looks like
tidied_gap
## country year lifeExp
## 1 Namibia 1952 42
## 2 Nepal 1952 36
## 3 Netherlands 1952 72
## 4 Namibia 1977 56
## 5 Nepal 1977 47
## 6 Netherlands 1977 75
## 7 Namibia 2007 53
## 8 Nepal 2007 64
## 9 Netherlands 2007 80
# Let's spread the data by country.
spread_gap <-
spread(data = tidied_gap, key = year, value = lifeExp)
# Let's look at it
spread_gap
## country 1952 1977 2007
## 1 Namibia 42 56 53
## 2 Nepal 36 47 64
## 3 Netherlands 72 75 80
Exercise:
Run this code to create a data frame called counts.
set.seed(1)
counts <-
data.frame(site = c(1, 1, 2, 3, 3, 3), taxon = c("A", "B", "A", "A", "B", "C"),
abundance = round(runif(n = 6, min = 0, max = 20), 0))
counts
## site taxon abundance
## 1 1 A 5
## 2 1 B 7
## 3 2 A 11
## 4 3 A 18
## 5 3 B 4
## 6 3 C 18
counts_wide
## site A B C
## 1 1 5 7 NA
## 2 2 11 NA NA
## 3 3 18 4 18
## site taxon abundance
## 1 1 A 5
## 2 2 A 11
## 3 3 A 18
## 4 1 B 7
## 5 2 B NA
## 6 3 B 4
## 7 1 C NA
## 8 2 C NA
## 9 3 C 18
Run this code to create the data frame we will be working with today.
mammals <- data.frame(site = c(1,1,2,3,3,3),
taxon = c('Suncus etruscus', 'Sorex cinereus',
'Myotis nigricans', 'Notiosorex crawfordi',
'Scuncus etruscus', 'Myotis nigricans'),
density = c(6.2, 5.2, 11.0, 1.2, 9.4, 9.6)
)
mammals
is the raw data that they collected. Re-write mammals to a new dataframe called mam_wide
which puts the taxa across the top (wide-format), with NA values replaced with 0. You should get something that looks like this. (tell me in the etherpad when you’re done!)## site Myotis nigricans Notiosorex crawfordi Scuncus etruscus
## 1 1 0.0 0.0 0.0
## 2 2 11.0 0.0 0.0
## 3 3 9.6 1.2 9.4
## Sorex cinereus Suncus etruscus
## 1 5.2 6.2
## 2 0.0 0.0
## 3 0.0 0.0
## site taxon density
## 1 1 Myotis nigricans 0.0
## 2 2 Myotis nigricans 11.0
## 3 3 Myotis nigricans 9.6
## 4 1 Notiosorex crawfordi 0.0
## 5 2 Notiosorex crawfordi 0.0
## 6 3 Notiosorex crawfordi 1.2
## 7 1 Scuncus etruscus 0.0
## 8 2 Scuncus etruscus 0.0
## 9 3 Scuncus etruscus 9.4
## 10 1 Sorex cinereus 5.2
## 11 2 Sorex cinereus 0.0
## 12 3 Sorex cinereus 0.0
## 13 1 Suncus etruscus 6.2
## 14 2 Suncus etruscus 0.0
## 15 3 Suncus etruscus 0.0
Sometimes you may need your data in wide format (or go from wide to long to fill in with zeros as we just did in the warm-up), but you have multiple columns of values. We are going to update mammals
to include an extra column called counts. Try putting this into wide format, like you did in 1a above.
# Let's add a column to mammals that has the actual species counts
set.seed(100)
mammals$counts <- round(runif(n = 6, min = 0, max = 100))
mammals
## site taxon density counts
## 1 1 Suncus etruscus 6.2 31
## 2 1 Sorex cinereus 5.2 26
## 3 2 Myotis nigricans 11.0 55
## 4 3 Notiosorex crawfordi 1.2 6
## 5 3 Scuncus etruscus 9.4 47
## 6 3 Myotis nigricans 9.6 48
spread(mammals, key = taxon, value = density, counts)
spread(mammals, key = taxon, value = c(density, counts))
So R doesn’t like that we’re trying to spread the data using multiple columns, and that makes sense. How would you even do this by hand? You couldn’t, you’d have to do it in two separate tables like so:
# One for density
select(mammals, site, taxon, density) %>%
spread(key = taxon, value = density, fill = 0)
## site Myotis nigricans Notiosorex crawfordi Scuncus etruscus
## 1 1 0.0 0.0 0.0
## 2 2 11.0 0.0 0.0
## 3 3 9.6 1.2 9.4
## Sorex cinereus Suncus etruscus
## 1 5.2 6.2
## 2 0.0 0.0
## 3 0.0 0.0
# One for counts
select(mammals, site, taxon, counts) %>%
spread(key = taxon, value = counts, fill = 0)
## site Myotis nigricans Notiosorex crawfordi Scuncus etruscus
## 1 1 0 0 0
## 2 2 55 0 0
## 3 3 48 6 47
## Sorex cinereus Suncus etruscus
## 1 26 31
## 2 0 0
## 3 0 0
Is there another way we can do this? This is potentially a lot of repetition.
united_vals <- unite(data = mammals, col = density_counts, density, counts,
sep = "__")
united_vals
## site taxon density_counts
## 1 1 Suncus etruscus 6.2__31
## 2 1 Sorex cinereus 5.2__26
## 3 2 Myotis nigricans 11__55
## 4 3 Notiosorex crawfordi 1.2__6
## 5 3 Scuncus etruscus 9.4__47
## 6 3 Myotis nigricans 9.6__48
Now we can continue with our spread.
# Notice that we have to be slightly clever with how we fill... How did I
# figure out that we need to use '0__0'?? I went through it all with just
# fill = 0 and R got mad at me because you cannot separate single values
# (e.g., 0). So I went back and changed this.
spread(united_vals, key = taxon, value = density_counts, fill = '0__0')
## site Myotis nigricans Notiosorex crawfordi Scuncus etruscus
## 1 1 0__0 0__0 0__0
## 2 2 11__55 0__0 0__0
## 3 3 9.6__48 1.2__6 9.4__47
## Sorex cinereus Suncus etruscus
## 1 5.2__26 6.2__31
## 2 0__0 0__0
## 3 0__0 0__0
And let’s go back to long format.
spread(united_vals, key = taxon, value = density_counts, fill = '0__0') %>%
gather(key = taxon, value = density_counts, -site)
## site taxon density_counts
## 1 1 Myotis nigricans 0__0
## 2 2 Myotis nigricans 11__55
## 3 3 Myotis nigricans 9.6__48
## 4 1 Notiosorex crawfordi 0__0
## 5 2 Notiosorex crawfordi 0__0
## 6 3 Notiosorex crawfordi 1.2__6
## 7 1 Scuncus etruscus 0__0
## 8 2 Scuncus etruscus 0__0
## 9 3 Scuncus etruscus 9.4__47
## 10 1 Sorex cinereus 5.2__26
## 11 2 Sorex cinereus 0__0
## 12 3 Sorex cinereus 0__0
## 13 1 Suncus etruscus 6.2__31
## 14 2 Suncus etruscus 0__0
## 15 3 Suncus etruscus 0__0
Then let’s add one more layer so that we can go back to having density and count columns!
spread(united_vals, key = taxon, value = density_counts, fill = '0__0') %>%
gather(key = taxon, value = density_counts, -site) %>%
separate(col = density_counts, into = c('density', 'counts'), sep = '__')
## site taxon density counts
## 1 1 Myotis nigricans 0 0
## 2 2 Myotis nigricans 11 55
## 3 3 Myotis nigricans 9.6 48
## 4 1 Notiosorex crawfordi 0 0
## 5 2 Notiosorex crawfordi 0 0
## 6 3 Notiosorex crawfordi 1.2 6
## 7 1 Scuncus etruscus 0 0
## 8 2 Scuncus etruscus 0 0
## 9 3 Scuncus etruscus 9.4 47
## 10 1 Sorex cinereus 5.2 26
## 11 2 Sorex cinereus 0 0
## 12 3 Sorex cinereus 0 0
## 13 1 Suncus etruscus 6.2 31
## 14 2 Suncus etruscus 0 0
## 15 3 Suncus etruscus 0 0
Tidy the data in the data.frame below. For reference, the column names are in the format Site_Year.
set.seed(7)
whale_counts <- data.frame(whale = c('Badger', 'Bamboo', 'Humphrey', 'Kumiko',
'Ester', 'Moby Dick'),
A_2009 = round(runif(n = 6, min = 0, max = 20), 0),
A_2010 = round(runif(n = 6, min = 0, max = 20), 0),
A_2011 = round(runif(n = 6, min = 0, max = 20), 0),
B_2009 = round(runif(n = 6, min = 0, max = 20), 0),
B_2010 = round(runif(n = 6, min = 0, max = 20), 0),
B_2011 = round(runif(n = 6, min = 0, max = 20), 0)
)
whale_counts
## whale A_2009 A_2010 A_2011 B_2009 B_2010 B_2011
## 1 Badger 20 7 15 20 20 14
## 2 Bamboo 8 19 2 6 1 5
## 3 Humphrey 2 3 9 13 13 4
## 4 Kumiko 1 9 2 6 10 4
## 5 Ester 5 3 11 20 19 8
## 6 Moby Dick 16 5 0 18 7 17
Data mashups!
You are often going to collect some data that you then want to join up data from multiple tables/data.frames.
We’re going to go back to using our mammals data.
mammals
## site taxon density counts
## 1 1 Suncus etruscus 6.2 31
## 2 1 Sorex cinereus 5.2 26
## 3 2 Myotis nigricans 11.0 55
## 4 3 Notiosorex crawfordi 1.2 6
## 5 3 Scuncus etruscus 9.4 47
## 6 3 Myotis nigricans 9.6 48
We also now have another data table that contains additional information about each of the our species. How can we go about matching these data up?
# Lookup table for joins. Small mammal metabolic data.
#sci_name <- c('Suncus etruscus', 'Sorex cinereus', 'Myotis nigricans' ,'Notiosorex crawfordi')
genus <- c('Suncus', 'Sorex', 'Myotis' ,'Notiosorex')
species <- c('etruscus', 'cinereus', 'nigricans', 'crawfordi')
order <- c('Soricomorpha', 'Soricomorpha', 'Chiroptera', 'Soricomorpha')
family_common <- c('Shrews', 'Shrews', 'Vesper Bats', 'Shrews')
body_temp <- c(38.7, 38.7, 99999, 37.6)
body_mass <- c(2.4, 3.5, 3.7, 4)
basal_metabolism <- c(0.08, 0.176, 0.27, 0.074)
metabolic <- data.frame(genus = genus, species = species, order = order,
family_common = family_common, body_temp = body_temp,
body_mass = body_mass,
basal_metabolism = basal_metabolism)
metabolic
## genus species order family_common body_temp body_mass
## 1 Suncus etruscus Soricomorpha Shrews 38.7 2.4
## 2 Sorex cinereus Soricomorpha Shrews 38.7 3.5
## 3 Myotis nigricans Chiroptera Vesper Bats 99999.0 3.7
## 4 Notiosorex crawfordi Soricomorpha Shrews 37.6 4.0
## basal_metabolism
## 1 0.080
## 2 0.176
## 3 0.270
## 4 0.074
Talk about unique keys.
Make unique key for lookup!
Joining A with B
left_join (matches B onto A)
right_join (matches A onto B)
inner_join (retain only matching values in A and B)
full_join (retain all values in all rows)
anti_join (get all rows in A, not found in B)
One to many
Code for one-to-many join example.
site_number <- c(1, 1, 1, 2, 2, 2, 3, 3, 3)
site <- c('North Saanich', 'North Saanich', 'North Saanich', 'Salt Spring Island', 'Salt Spring Island', 'Salt Spring Island', 'Victoria', 'Victoria', 'Victoria')
sub_site <- c('Chez Jillian', 'Horth Hill', 'Iroquois Park', 'Mt. Tuam', 'Ruckle Park', 'Mt. Maxwell', 'Mt. Doug', 'Uvic', 'Uplands')
lat <- c(48.669207, 48.686775, 48.643806, 48.735256, 48.785117, 48.797760,
48.487006, 48.461353, 48.441668)
lon <- c(-123.468452, -123.434806, -123.403392, -123.458152, -123.388680, -123.506763, -123.352139, -123.309311, -123.301797)
site_data <- data.frame(site_number = site_number, site = site,
sub_site = sub_site, lat = lat, lon = lon)
Create your starting dataframe