merge.R

Plain text source: merge.R


# -*- Mode:R; Coding:us-ascii-unix; fill-column:160 -*-

################################################################################################################################################################
##
# @file      merge.R
# @author    Mitch Richling <https://www.mitchr.me>
# @Copyright Copyright 2015 by Mitch Richling.  All rights reserved.
# @brief     Demo of The R merge (join) command.@EOL
# @Keywords  base r merge
#

################################################################################################################################################################
# First we need to have some data to work with.

daObjects <- read.table(header=T, text='
     hairs SEX  name
  brunette   M  rock
    blonde   M  paper
  brunette   F  scissors
    blonde   F  hammer
       red   M  gun
       red   F  knife
')

daPeople <- read.table(header=T, text='
      hair sex  name
  brunette   M  bill
    blonde   M  ted
    blonde   M  fred
  brunette   F  wilma
    blonde   F  betzi
      pink   F  pinkey
')

################################################################################################################################################################
# The most common case -- we expect every row in both tables to have a precise match in the output

base::merge(x=daPeople, y=daObjects, by.x=c('hair', 'sex'), by.y=c('hairs', 'SEX'))
      hair sex name.x   name.y
1   blonde   F  betzi   hammer
2   blonde   M    ted    paper
3   blonde   M   fred    paper
4 brunette   F  wilma scissors
5 brunette   M   bill     rock
################################################################################################################################################################
# It is quite common for the x-table to be the master data set, and a y-table be a set of "look-up" values related to some quality of each x-table row.  In this
# case, the y-table is frequently missing some values for the master data in the x-table, but it is important that every data row appear in the output -- even
# if the 'look-up' value is missing.

base::merge(x=daPeople, y=daObjects, by.x=c('hair', 'sex'), by.y=c('hairs', 'SEX'), all.x=TRUE)
      hair sex name.x   name.y
1   blonde   F  betzi   hammer
2   blonde   M    ted    paper
3   blonde   M   fred    paper
4 brunette   F  wilma scissors
5 brunette   M   bill     rock
6     pink   F pinkey     <NA>
################################################################################################################################################################
# Really, this is just the reverse of the previous case.

base::merge(x=daPeople, y=daObjects, by.x=c('hair', 'sex'), by.y=c('hairs', 'SEX'), all.y=TRUE)
      hair sex name.x   name.y
1   blonde   F  betzi   hammer
2   blonde   M    ted    paper
3   blonde   M   fred    paper
4 brunette   F  wilma scissors
5 brunette   M   bill     rock
6      red   F   <NA>    knife
7      red   M   <NA>      gun
################################################################################################################################################################
# Lastly we do a 'full outer join' -- in this case we want a row in the output to appear if it appears in the x-table or y-table.

base::merge(x=daPeople, y=daObjects, by.x=c('hair', 'sex'), by.y=c('hairs', 'SEX'), all=TRUE)
      hair sex name.x   name.y
1   blonde   F  betzi   hammer
2   blonde   M    ted    paper
3   blonde   M   fred    paper
4 brunette   F  wilma scissors
5 brunette   M   bill     rock
6     pink   F pinkey     <NA>
7      red   F   <NA>    knife
8      red   M   <NA>      gun

The R session information (including the OS info, R version and all packages used):

    options(width=80)
    sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 8 (jessie)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] graphics  grDevices datasets  utils     grid      stats     base     

other attached packages:
[1] RColorBrewer_1.1-2 reshape2_1.4.1     ggplot2_2.1.0      dplyr_0.4.3       
[5] data.table_1.9.6   gridExtra_2.2.1    knitr_1.13         lattice_0.20-33   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.5      assertthat_0.1   plyr_1.8.3       chron_2.3-47    
 [5] R6_2.1.2         gtable_0.2.0     DBI_0.4-1        formatR_1.4     
 [9] magrittr_1.5     evaluate_0.9     scales_0.4.0     highr_0.6       
[13] stringi_1.0-1    tools_3.3.0      stringr_1.0.0    munsell_0.4.3   
[17] parallel_3.3.0   colorspace_1.2-6 methods_3.3.0   
    Sys.time()
[1] "2016-07-09 20:06:30 CDT"