longAndWide.R

Plain text source: longAndWide.R


# -*- Mode:R; Coding:us-ascii-unix; fill-column:160 -*-

################################################################################################################################################################
##
# @file      longAndWide.R
# @author    Mitch Richling <https://www.mitchr.me>
# @Copyright Copyright 2015 by Mitch Richling.  All rights reserved.
# @brief     Transforming between long and wide data sets.@EOL
# @Keywords  r cran reshape2::melt reshape2::dcast utils::unstack utils::stack stats::aggregate tidyr::gather tidyr::spread
#
# We demonstrate two reshape2 commands:
#   * reshape2::melt -- transform from 'wide' to 'long'
#       * arg 1 is the data
#       * id.vars -- the variables to keep but not split apart on
#       * measure.vars -- source columns
#       * variable.name -- Name of the destination column that will identify the original
#       * value.name - column that the measurement came from
#   * reshape2::dcast --transform from 'long' to 'wide'
#       * Arg 1 is the data
#       * Arg 2 is a formula
#       * foo + bar ~ foobar + foobam
#         LHS are columns you want to keep as is (vertical)
#         RHS are columns who's values will be used to construct new columns (horizontal)
#       * value.var is the variable with the measurement you want in the table
#       * fun.aggregate is used when the table will have multiple value.var values for each cell.
#         This function is then used to aggregate the multiple values into one for the table

################################################################################################################################################################
# First, load the library.

library(reshape2)
library(tidyr)

################################################################################################################################################################
# This is the most trivial, and common, use case when converting from 'wide' to 'long'

someWideData <- read.table(header=T, text='
  subject sex breakfast dinner lunch
1     Dad   M     101.9  102.1  99.2
2     Mom   F      98.6   98.9  95.4
3     Son   M     120.3  120.4 110.1
')

reshape2::melt(someWideData, id.vars=c('subject', 'sex'), measure.vars=c('breakfast', 'dinner', 'lunch'), variable.name='meal', value.name='blgl')
  subject sex      meal  blgl
1     Dad   M breakfast 101.9
2     Mom   F breakfast  98.6
3     Son   M breakfast 120.3
4     Dad   M    dinner 102.1
5     Mom   F    dinner  98.9
6     Son   M    dinner 120.4
7     Dad   M     lunch  99.2
8     Mom   F     lunch  95.4
9     Son   M     lunch 110.1
tidyr::gather(someWideData, meal, blgl, breakfast:lunch)
  subject sex      meal  blgl
1     Dad   M breakfast 101.9
2     Mom   F breakfast  98.6
3     Son   M breakfast 120.3
4     Dad   M    dinner 102.1
5     Mom   F    dinner  98.9
6     Son   M    dinner 120.4
7     Dad   M     lunch  99.2
8     Mom   F     lunch  95.4
9     Son   M     lunch 110.1
################################################################################################################################################################
# If you don't have an 'x variable', then you can use 'aggregate' to make a wide table horizontal:

head(InsectSprays)
  count spray
1    10     A
2     7     A
3    20     A
4    14     A
5    14     A
6    12     A
stats::aggregate(InsectSprays$count, list(spray=InsectSprays$spray), identity)
  spray x.1 x.2 x.3 x.4 x.5 x.6 x.7 x.8 x.9 x.10 x.11 x.12
1     A  10   7  20  14  14  12  10  23  17   20   14   13
2     B  11  17  21  11  16  14  17  17  19   21    7   13
3     C   0   1   7   2   3   1   2   1   3    0    1    4
4     D   3   5  12   6   4   3   5   5   5    5    2    4
5     E   3   5   3   5   3   6   1   1   3    2    6    4
6     F  11   9  15  22  15  16  13  10  26   26   24   13
################################################################################################################################################################
# Now lets look at the most common wide to long use case:

someLongData <- read.table(header=T, text='
 subject sex        meal blgl
     Dad   M   breakfast  101.9
     Dad   M       lunch   99.2
     Dad   M      dinner  102.1
     Mom   F   breakfast   98.6
     Mom   F       lunch   95.4
     Mom   F      dinner   98.9
     Son   M   breakfast  120.3
     Son   M       lunch  110.1
     Son   M      dinner  120.4
 ')

reshape2::dcast(someLongData, sex ~ meal, value.var='blgl', fun.aggregate=mean)
  sex breakfast dinner  lunch
1   F      98.6  98.90  95.40
2   M     111.1 111.25 104.65
################################################################################################################################################################
# Another common 'wide' to 'long' case is when we have more than one variable that should be on the vertical axis of the table.  For example, suppose we want
# 'subject' & 'sex' on the vertical, 'meal' on the horizontal, and the unique measurement for that combination in the table.  Then we might do this:

reshape2::dcast(someLongData, subject + sex ~ meal, value.var='blgl')
  subject sex breakfast dinner lunch
1     Dad   M     101.9  102.1  99.2
2     Mom   F      98.6   98.9  95.4
3     Son   M     120.3  120.4 110.1
tidyr::spread(someLongData, meal, blgl)
  subject sex breakfast dinner lunch
1     Dad   M     101.9  102.1  99.2
2     Mom   F      98.6   98.9  95.4
3     Son   M     120.3  120.4 110.1
################################################################################################################################################################
# Lets do another example with a different dataset -- mtcars. Suppose we want a table with 'cyl' & 'gear' on the vertical, 'carb' on the horizontal, and mean
# 'mpg' in the table.  We can do that like so:

head(mtcars, 20)
                     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
reshape2::dcast(mtcars, cyl + gear ~ carb, value.var='mpg', fun.aggregate=mean)
  cyl gear     1     2    3     4    6   8
1   4    3 21.50   NaN  NaN   NaN  NaN NaN
2   4    4 29.10 24.75  NaN   NaN  NaN NaN
3   4    5   NaN 28.20  NaN   NaN  NaN NaN
4   6    3 19.75   NaN  NaN   NaN  NaN NaN
5   6    4   NaN   NaN  NaN 19.75  NaN NaN
6   6    5   NaN   NaN  NaN   NaN 19.7 NaN
7   8    3   NaN 17.15 16.3 12.62  NaN NaN
8   8    5   NaN   NaN  NaN 15.80  NaN  15
################################################################################################################################################################
# Lets do one more example with yet another data set -- tips. We want 'time' on vertical, and both 'sex' & 'smoker' on the horizontal, and average tip in the
# table:

head(tips)
  total_bill  tip    sex smoker day   time size
1      16.99 1.01 Female     No Sun Dinner    2
2      10.34 1.66   Male     No Sun Dinner    3
3      21.01 3.50   Male     No Sun Dinner    3
4      23.68 3.31   Male     No Sun Dinner    2
5      24.59 3.61 Female     No Sun Dinner    4
6      25.29 4.71   Male     No Sun Dinner    4
reshape2::dcast(tips, time ~ sex + smoker, value.var='tip', fun.aggregate=mean)
    time Female_No Female_Yes  Male_No Male_Yes
1 Dinner  3.044138    2.94913 3.158052 3.123191
2  Lunch  2.459600    2.89100 2.941500 2.790769
################################################################################################################################################################
# Sometimes you have a group of vectors each containing measurements for different groups -- or a data frame with columns that are each measurements for
# different groups, and you simply want to stack them into a two column, 'long' data frame.

utils::stack(list(group1=c(0,1,2,3), group2=c(4,5,6,7)))
  values    ind
1      0 group1
2      1 group1
3      2 group1
4      3 group1
5      4 group2
6      5 group2
7      6 group2
8      7 group2
utils::stack(data.frame(group1=c(0,1,2,3), group2=c(4,5,6,7)))
  values    ind
1      0 group1
2      1 group1
3      2 group1
4      3 group1
5      4 group2
6      5 group2
7      6 group2
8      7 group2
################################################################################################################################################################
# Alternately, if you have a data.frame with two columns -- one for the group and one for the measurements, then you can get a wide

a <- data.frame(vals=c(0,1,2,3,4,5,6), group=c('group1', 'group1', 'group1', 'group2', 'group2', 'group2', 'group2'))
a
  vals  group
1    0 group1
2    1 group1
3    2 group1
4    3 group2
5    4 group2
6    5 group2
7    6 group2
utils::unstack(a, vals ~ group)
$group1
[1] 0 1 2

$group2
[1] 3 4 5 6
################################################################################################################################################################
# In the case when the number of measurements in each group is equal, unstack will return a data.frame

a <- data.frame(vals=c(0,1,2,3,4,5), group=c('group1', 'group1', 'group1', 'group2', 'group2', 'group2'))
a
  vals  group
1    0 group1
2    1 group1
3    2 group1
4    3 group2
5    4 group2
6    5 group2
utils::unstack(a, vals ~ group)
  group1 group2
1      0      3
2      1      4
3      2      5

The R session information (including the OS info, R version and all packages used):

    options(width=80)
    sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 8 (jessie)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] graphics  grDevices datasets  utils     grid      stats     base     

other attached packages:
[1] RColorBrewer_1.1-2 reshape2_1.4.1     ggplot2_2.1.0      tidyr_0.5.1       
[5] dplyr_0.4.3        data.table_1.9.6   gridExtra_2.2.1    knitr_1.13        
[9] lattice_0.20-33   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.5      magrittr_1.5     munsell_0.4.3    colorspace_1.2-6
 [5] R6_2.1.2         stringr_1.0.0    highr_0.6        plyr_1.8.3      
 [9] tools_3.3.0      parallel_3.3.0   gtable_0.2.0     DBI_0.4-1       
[13] lazyeval_0.2.0   assertthat_0.1   tibble_1.1       formatR_1.4     
[17] evaluate_0.9     stringi_1.0-1    methods_3.3.0    scales_0.4.0    
[21] chron_2.3-47    
    Sys.time()
[1] "2016-07-21 22:41:45 CDT"