library(tidyverse)
library(nycflights13)

nycflights13

We will use the nycflights13 package to learn about relational data. nycflights13 contains four tibbles that are related to the flights table that you used in [data transformation]:

Schematic representation

Simple example of mutating join

Get a narrower dataset so we can see it better.

flightsA <- flights %>% 
  select(year:day, hour, tailnum, carrier)
flightsA
## # A tibble: 336,776 x 6
##     year month   day  hour tailnum carrier
##    <int> <int> <int> <dbl> <chr>   <chr>  
##  1  2013     1     1     5 N14228  UA     
##  2  2013     1     1     5 N24211  UA     
##  3  2013     1     1     5 N619AA  AA     
##  4  2013     1     1     5 N804JB  B6     
##  5  2013     1     1     6 N668DN  DL     
##  6  2013     1     1     5 N39463  UA     
##  7  2013     1     1     6 N516JB  B6     
##  8  2013     1     1     6 N829AS  EV     
##  9  2013     1     1     6 N593JB  B6     
## 10  2013     1     1     6 N3ALAA  AA     
## # ... with 336,766 more rows

Imagine you want to add the full airline name this data.

flightsA %>%
  left_join(airlines)
## Joining, by = "carrier"
## # A tibble: 336,776 x 7
##     year month   day  hour tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 N3ALAA  AA      American Airlines Inc.  
## # ... with 336,766 more rows
flightsA %>%
  full_join(airlines)
## Joining, by = "carrier"
## # A tibble: 336,776 x 7
##     year month   day  hour tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 N3ALAA  AA      American Airlines Inc.  
## # ... with 336,766 more rows
Empirical test for key
flights %>% 
  count(year, month, day, hour, flight, origin) %>% 
  filter(n > 1)
## # A tibble: 764 x 7
##     year month   day  hour flight origin     n
##    <int> <int> <int> <dbl>  <int> <chr>  <int>
##  1  2013     1     1    19     87 JFK        2
##  2  2013     1     2    18   2019 LGA        2
##  3  2013     1     2    19     87 JFK        2
##  4  2013     1     3    18   2019 LGA        2
##  5  2013     1     3    19     87 JFK        2
##  6  2013     1     4    18   2019 LGA        2
##  7  2013     1     4    19     87 JFK        2
##  8  2013     1     5    19     87 JFK        2
##  9  2013     1     6     8    517 EWR        2
## 10  2013     1     6    13    348 LGA        2
## # ... with 754 more rows
flights %>% 
  count(flight, origin, time_hour, tailnum) %>% 
  filter(n > 1)
## # A tibble: 1 x 5
##   flight origin time_hour           tailnum     n
##    <int> <chr>  <dttm>              <chr>   <int>
## 1    398 EWR    2013-12-15 07:00:00 <NA>        2
weather %>% 
  count(origin, year, month, day, hour, time_hour) %>% 
  filter(n > 1)
## # A tibble: 0 x 7
## # ... with 7 variables: origin <chr>, year <dbl>, month <dbl>, day <int>,
## #   hour <int>, time_hour <dttm>, n <int>
weather %>% 
  count(origin, time_hour) %>% 
  filter(n > 1)
## # A tibble: 0 x 3
## # ... with 3 variables: origin <chr>, time_hour <dttm>, n <int>
weather %>% 
  count(time_hour) %>% 
  filter(n > 1)
## # A tibble: 8,706 x 2
##    time_hour               n
##    <dttm>              <int>
##  1 2013-01-01 01:00:00     3
##  2 2013-01-01 02:00:00     3
##  3 2013-01-01 03:00:00     3
##  4 2013-01-01 04:00:00     3
##  5 2013-01-01 05:00:00     3
##  6 2013-01-01 06:00:00     3
##  7 2013-01-01 07:00:00     3
##  8 2013-01-01 08:00:00     3
##  9 2013-01-01 09:00:00     3
## 10 2013-01-01 10:00:00     3
## # ... with 8,696 more rows
Inner Join
x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     3, "x3"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     4, "y3"
)
  x
## # A tibble: 3 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2   
## 3     3 x3
  y
## # A tibble: 3 x 2
##     key val_y
##   <dbl> <chr>
## 1     1 y1   
## 2     2 y2   
## 3     4 y3
  inner_join(x, y, by = "key")
## # A tibble: 2 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2
  inner_join(y, x, by = "key")
## # A tibble: 2 x 3
##     key val_y val_x
##   <dbl> <chr> <chr>
## 1     1 y1    x1   
## 2     2 y2    x2
  left_join(x, y, by = "key")
## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA>
  right_join(y, x, by = "key")
## # A tibble: 3 x 3
##     key val_y val_x
##   <dbl> <chr> <chr>
## 1     1 y1    x1   
## 2     2 y2    x2   
## 3     3 <NA>  x3
  right_join(x, y, by = "key")
## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     4 <NA>  y3
  left_join(y, x, by = "key")
## # A tibble: 3 x 3
##     key val_y val_x
##   <dbl> <chr> <chr>
## 1     1 y1    x1   
## 2     2 y2    x2   
## 3     4 y3    <NA>
  full_join(x, y, by = "key")
## # A tibble: 4 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA> 
## 4     4 <NA>  y3
  full_join(y, x, by = "key")
## # A tibble: 4 x 3
##     key val_y val_x
##   <dbl> <chr> <chr>
## 1     1 y1    x1   
## 2     2 y2    x2   
## 3     4 y3    <NA> 
## 4     3 <NA>  x3
  semi_join(x, y, by="key")
## # A tibble: 2 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2
  semi_join(y, x, by="key")
## # A tibble: 2 x 2
##     key val_y
##   <dbl> <chr>
## 1     1 y1   
## 2     2 y2
  anti_join(x, y, by="key")
## # A tibble: 1 x 2
##     key val_x
##   <dbl> <chr>
## 1     3 x3
  anti_join(y, x, by="key")
## # A tibble: 1 x 2
##     key val_y
##   <dbl> <chr>
## 1     4 y3
  union_all(x, y)
## # A tibble: 6 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    <NA> 
## 2     2 x2    <NA> 
## 3     3 x3    <NA> 
## 4     1 <NA>  y1   
## 5     2 <NA>  y2   
## 6     4 <NA>  y3
  union_all(y, x)
## # A tibble: 6 x 3
##     key val_y val_x
##   <dbl> <chr> <chr>
## 1     1 y1    <NA> 
## 2     2 y2    <NA> 
## 3     4 y3    <NA> 
## 4     1 <NA>  x1   
## 5     2 <NA>  x2   
## 6     3 <NA>  x3
Left Join
```r
left_join(x, y, by = "key")
```

```
## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA>
```
Right Join
```r
right_join(x, y, by = "key")
```

```
## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     4 <NA>  y3
```
Full Join
```r
full_join(x, y, by = "key")
```

```
## # A tibble: 4 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA> 
## 4     4 <NA>  y3
```
Duplicate Keys - one table
x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     1, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2"
)
left_join(x, y, by = "key")
## # A tibble: 4 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     2 x3    y2   
## 4     1 x4    y1
Duplicate Keys - both tables
```r
x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     3, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     2, "y3",
     3, "y4"
)
left_join(x, y, by = "key")
```

```
## # A tibble: 6 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     2 x2    y3   
## 4     2 x3    y2   
## 5     2 x3    y3   
## 6     3 x4    y4
```

semi-join,anti-join

```r
semi_join(x, y, by="key")
```

```
## # A tibble: 4 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2   
## 3     2 x3   
## 4     3 x4
```

```r
anti_join(x, y, by="key")
```

```
## # A tibble: 0 x 2
## # ... with 2 variables: key <dbl>, val_x <chr>
```




```r
x <- tribble(
  ~col1, ~col2,
     1, "a",
     2, "b",
     3, "c"
)
y <- tribble(
  ~col1, ~col2,
     1, "a",
     2, "d",
     4, "e"
)
semi_join(x, y, by="col1")
```

```
## # A tibble: 2 x 2
##    col1 col2 
##   <dbl> <chr>
## 1     1 a    
## 2     2 b
```

```r
anti_join(x, y, by="col1")
```

```
## # A tibble: 1 x 2
##    col1 col2 
##   <dbl> <chr>
## 1     3 c
```

Set operations

    dplyr::intersect(x, y)
## # A tibble: 1 x 2
##    col1 col2 
##   <dbl> <chr>
## 1     1 a
    dplyr::union(x, y)
## # A tibble: 5 x 2
##    col1 col2 
##   <dbl> <chr>
## 1     3 c    
## 2     2 b    
## 3     4 e    
## 4     2 d    
## 5     1 a
    dplyr::setdiff(x, y)
## # A tibble: 2 x 2
##    col1 col2 
##   <dbl> <chr>
## 1     2 b    
## 2     3 c
   dplyr::intersect(x, y)
## # A tibble: 1 x 2
##    col1 col2 
##   <dbl> <chr>
## 1     1 a
    base::intersect(x, y)
## Warning: Length of logical index must be 1 or 3, not 0
## # A tibble: 0 x 0
    dplyr::union(x, y)
## # A tibble: 5 x 2
##    col1 col2 
##   <dbl> <chr>
## 1     3 c    
## 2     2 b    
## 3     4 e    
## 4     2 d    
## 5     1 a
    base::union(x, y)
## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1] "a" "b" "c"
## 
## [[3]]
## [1] 1 2 4
## 
## [[4]]
## [1] "a" "d" "e"
    dplyr::setdiff(x, y)
## # A tibble: 2 x 2
##    col1 col2 
##   <dbl> <chr>
## 1     2 b    
## 2     3 c
    base::setdiff(x, y)
## # A tibble: 3 x 2
##    col1 col2 
##   <dbl> <chr>
## 1     1 a    
## 2     2 b    
## 3     3 c