Basic operations

Basics

Clear the workspace

# remove all objects from the workspace (global environment)
rm(list = ls())

Installing packages

  • Installing and loading packages
# load a package. Install it if it's not on the system yet.
if(!require("gdata")) {install.packages("gdata")}
#install.packages(c("psych","gdata","reshape","xtable","gplots","nlme","foreign","ez"))
#library(ggplot2); library(gdata); library(ggthemes); library(plyr); 
#library(memisc); library(reshape); library(reshape2); library(devtools)
#library(tidyverse); library(dplyr)
#library(devtools); library(ipeds); library(xtable); library(openxlsx); library(foreign); 
#install.packages("acs"); install.packages("ggplot2");xt install.packages("maps"); install.packages("mapdata"); 
data(airquality)
head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

Checking R version

getRversion()
## [1] '4.3.1'
#packageVersion("psych") # check the version of an installed package

Saving history

# save history
#savehistory("name.Rhistory")
#loadhistory("name.Rhistory")

Show the data structure

# show the structure of the data frame
str(airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...

Formatting text output

string <- "This is a sample string"
cat(substring(string,1,10),substring(string,15,19),"\n",sep="\t")
## This is a    le st   
cat("Roots of two: ",format(2^(1:10),digit=2),"\n")
## Roots of two:     2    4    8   16   32   64  128  256  512 1024

Array and matrices

# make a matrix with array or matrix
matrix1 <- array(1:10, dim=c(2,5))
matrix1
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2    4    6    8   10
matrix2 <- matrix(1:10,2,5)
matrix2
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2    4    6    8   10
matrix3 <- 1:10; 
dim(matrix3) <- c(2,5)
matrix3
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2    4    6    8   10
class(matrix1)
## [1] "matrix" "array"
# numbers are recycled if the number of cells is larger
array(1:10, dim=c(5,10))
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]    1    6    1    6    1    6    1    6    1     6
## [2,]    2    7    2    7    2    7    2    7    2     7
## [3,]    3    8    3    8    3    8    3    8    3     8
## [4,]    4    9    4    9    4    9    4    9    4     9
## [5,]    5   10    5   10    5   10    5   10    5    10

Matrix calculus

# these operators perform element-wise (cell-by-cell) calculations
m1 <- array(1:10, dim=c(2,5))
m1 + m1
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    2    6   10   14   18
## [2,]    4    8   12   16   20
m1 * m1
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    9   25   49   81
## [2,]    4   16   36   64  100
# matrix calculus from J.Penzer's note (2006)
m4 <- array(1:3, c(4,2)) 
m5 <- array(3:8, c(2,3)) 
m4 %*% m5 
##      [,1] [,2] [,3]
## [1,]   11   17   23
## [2,]   18   28   38
## [3,]   13   21   29
## [4,]   11   17   23
m6 <- array(c(1,3,2,1),c(2,2)) 
m6 
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    1
v1 <- array(c(1,0), c(2,1)) 
solve(m6,v1) 
##      [,1]
## [1,] -0.2
## [2,]  0.6
solve(m6) # inverts m6 
##      [,1] [,2]
## [1,] -0.2  0.4
## [2,]  0.6 -0.2
# does the same as solve(m6,v1) 
solve(m6) %*% v1 
##      [,1]
## [1,] -0.2
## [2,]  0.6
  • Creating a function using the function keyword
sum_numbers <- function(a, b) {
  sum <- a + b
  return(sum)
}

# Call the function
result <- sum_numbers(3, 4)
print(result) # Output: 7
## [1] 7

Change the default options (print width, number/format of digits)

# change the default width
width.default <- getOption("width"); options(width=120)

# restoring default
options(width=width.default)

# changing the number of digits
options(digits=7)

# bias output toward fixed (non-scientific) notation
options(scipen=3)

R colors

# list of all color names
# also see ?palette
head(colors())
## [1] "white"         "aliceblue"     "antiquewhite"  "antiquewhite1"
## [5] "antiquewhite2" "antiquewhite3"
# different color combo
colors3 <- c("lightblue","lightyellow","lightgreen")
colors5 = c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99")
# pre-set color functions
# it looks better to set the alpha level to less than 1
rainbow(10)
##  [1] "#FF0000" "#FF9900" "#CCFF00" "#33FF00" "#00FF66" "#00FFFF" "#0066FF"
##  [8] "#3300FF" "#CC00FF" "#FF0099"
heat.colors(10, alpha = 1)
##  [1] "#FF0000FF" "#FF2400FF" "#FF4900FF" "#FF6D00FF" "#FF9200FF" "#FFB600FF"
##  [7] "#FFDB00FF" "#FFFF00FF" "#FFFF40FF" "#FFFFBFFF"
terrain.colors(10, alpha = 1)
##  [1] "#00A600FF" "#2DB600FF" "#63C600FF" "#A0D600FF" "#E6E600FF" "#E8C32EFF"
##  [7] "#EBB25EFF" "#EDB48EFF" "#F0C9C0FF" "#F2F2F2FF"
topo.colors(10, alpha = 1)
##  [1] "#4C00FFFF" "#0019FFFF" "#0080FFFF" "#00E5FFFF" "#00FF4DFF" "#4DFF00FF"
##  [7] "#E6FF00FF" "#FFFF00FF" "#FFDE59FF" "#FFE0B3FF"
cm.colors(10, alpha = 1)
##  [1] "#80FFFFFF" "#99FFFFFF" "#B3FFFFFF" "#CCFFFFFF" "#E6FFFFFF" "#FFE6FFFF"
##  [7] "#FFCCFFFF" "#FFB3FFFF" "#FF99FFFF" "#FF80FFFF"
# with color brewer
library(RColorBrewer)
display.brewer.all() # show all pre-set colors

colorRampPalette(brewer.pal(9,"Blues"))(15) # for more than 9 colors
##  [1] "#F7FBFF" "#E8F1FA" "#DAE8F5" "#CCDFF1" "#BAD6EB" "#A3CCE3" "#88BEDC"
##  [8] "#6BAED6" "#539ECC" "#3D8DC3" "#2A7AB9" "#1967AD" "#0B559F" "#084287"
## [15] "#08306B"
display.brewer.pal(6,"BuGn")

brewer.pal(6,"BuGn")
## [1] "#EDF8FB" "#CCECE6" "#99D8C9" "#66C2A4" "#2CA25F" "#006D2C"

Making dummy data

# make a dummy data set
x = c(rep(1,5),rep(2,3),rep(3,5))
y = c(7,5,6,8,4,5,4,6,3,5,4,6,2)
z = rnorm(13,mean=10,sd=5)
thisData = cbind(x,y,z)
thisData = as.data.frame(thisData)
thisData$x = factor(thisData$x)
levels(thisData$x) = c("A","B","C")

summary(thisData)
##  x           y           z         
##  A:5   Min.   :2   Min.   : 1.428  
##  B:3   1st Qu.:4   1st Qu.: 4.732  
##  C:5   Median :5   Median : 6.436  
##        Mean   :5   Mean   : 7.044  
##        3rd Qu.:6   3rd Qu.: 7.773  
##        Max.   :8   Max.   :15.603
library(psych)
describe(thisData)
##    vars  n mean   sd median trimmed  mad  min  max range skew kurtosis   se
## x*    1 13 2.00 0.91   2.00    2.00 1.48 1.00  3.0  2.00 0.00    -1.89 0.25
## y     2 13 5.00 1.63   5.00    5.00 1.48 2.00  8.0  6.00 0.00    -0.84 0.45
## z     3 13 7.04 4.39   6.44    6.78 2.53 1.43 15.6 14.18 0.76    -0.41 1.22
head(thisData)
##   x y         z
## 1 A 7  5.303664
## 2 A 5  8.842583
## 3 A 6  7.772980
## 4 A 8  4.732023
## 5 A 4 15.603343
## 6 B 5 15.504260
thisData[1:3,]
##   x y        z
## 1 A 7 5.303664
## 2 A 5 8.842583
## 3 A 6 7.772980

Making dummy data (with matrix())

matrix(letters[1:10],nrow=5,ncol=2)
##      [,1] [,2]
## [1,] "a"  "f" 
## [2,] "b"  "g" 
## [3,] "c"  "h" 
## [4,] "d"  "i" 
## [5,] "e"  "j"
matrix(letters[1:10],nrow=5,ncol=2,byrow=TRUE)
##      [,1] [,2]
## [1,] "a"  "b" 
## [2,] "c"  "d" 
## [3,] "e"  "f" 
## [4,] "g"  "h" 
## [5,] "i"  "j"

Making dummy data (reconstructing data from SD and mean)

# mean, SD, and n from the published research
mean = 5; SD = 0.8; n = 300
X0 <- rnorm(n)
SE = SD/sqrt(n)
thisData <- cbind(mean+SE*sqrt(length(X0))*(X0-mean(X0)/sd(X0)),"some_level")
# note: as parenthesized above, the reconstruction is only approximate;
# scaling with (X0-mean(X0))/sd(X0) -- or simply rnorm(n, mean = mean_val, sd = sd_val) --
# reproduces the target mean and SD exactly

mean = 7; SD = 0.5; n = 100
X0 <- rnorm(n)
SE = SD/sqrt(n)
thisData <- rbind(thisData,cbind(mean+SE*sqrt(length(X0))*(X0-mean(X0)/sd(X0)),"another_level"))
thisData <- data.frame(as.numeric(thisData[,1]),thisData[,-1])
colnames(thisData) <- c("value","factor")
head(thisData)
##      value     factor
## 1 5.571832 some_level
## 2 4.231714 some_level
## 3 3.551474 some_level
## 4 5.262986 some_level
## 5 4.891005 some_level
## 6 4.967727 some_level
# checking the accuracy
mean(thisData$value)
## [1] 5.500525
tapply(thisData$value,thisData$factor,mean)
## another_level    some_level 
##      6.998733      5.001122
sd(thisData$value)
## [1] 1.116399
tapply(thisData$value,thisData$factor,sd)
## another_level    some_level 
##     0.5121005     0.7585546

for loop (looping different levels)

library(gdata)

x = c(rep(1,5),rep(2,3),rep(3,5))
y = c(7,5,6,8,4,5,4,6,3,5,4,6,2)
z = rnorm(13,mean=10,sd=5)
thisData = cbind(x,y,z)
thisData = as.data.frame(thisData)
thisData$x = factor(thisData$x)
levels(thisData$x) = c("A","B","C")

for (i in 1:length(levels(thisData$x))) {
    print(levels(thisData$x)[i])
  tempData <- drop.levels(thisData[thisData$x == levels(thisData$x)[i],],reorder=FALSE)
    print(summary(tempData))
}
## [1] "A"
##  x           y           z        
##  A:5   Min.   :4   Min.   :10.78  
##        1st Qu.:5   1st Qu.:13.11  
##        Median :6   Median :13.90  
##        Mean   :6   Mean   :13.71  
##        3rd Qu.:7   3rd Qu.:14.69  
##        Max.   :8   Max.   :16.07  
## [1] "B"
##  x           y             z         
##  B:3   Min.   :4.0   Min.   :0.5999  
##        1st Qu.:4.5   1st Qu.:3.8907  
##        Median :5.0   Median :7.1815  
##        Mean   :5.0   Mean   :5.7888  
##        3rd Qu.:5.5   3rd Qu.:8.3833  
##        Max.   :6.0   Max.   :9.5850  
## [1] "C"
##  x           y           z         
##  C:5   Min.   :2   Min.   : 5.818  
##        1st Qu.:3   1st Qu.: 6.972  
##        Median :4   Median : 7.959  
##        Mean   :4   Mean   :10.485  
##        3rd Qu.:5   3rd Qu.: 9.121  
##        Max.   :6   Max.   :22.556

Save and read data from files (csv or SPSS)

library(dplyr)
library(foreign)
library(haven)

# Set the seed for reproducibility
set.seed(123)
# Generate fake data
fake_data <- data.frame(
  ID = 1:100,                                 # ID column from 1 to 100
  Age = sample(18:70, 100, replace = TRUE),    # Age column with random values between 18 and 70
  Gender = sample(c("Male", "Female"), 100, replace = TRUE),   # Gender column with random values of "Male" and "Female"
  Weight = round(rnorm(100, mean = 70, sd = 10), digits = 2),   # Weight: normally distributed with mean 70 and SD 10, rounded to 2 decimal places
  Height = round(rnorm(100, mean = 170, sd = 10), digits = 2),  # Height: normally distributed with mean 170 and SD 10, rounded to 2 decimal places
  Income = round(runif(100, min = 1000, max = 10000), digits = 2)  # Income column with random uniform distribution between 1000 and 10000 and rounded to 2 decimal places
)
write.table(fake_data, file = "fake_data.csv", sep = ",", col.names = NA, qmethod = "double")
write_sav(fake_data, "fake_data.sav")

# Define the path in Unix
# setwd('/media/MYUSB/PATH/file.txt')
# Define the path in  Windows
# setwd('F:/PATH/file.txt')

# from a text file
# note: the file was written above with sep = "," (to the working directory), so reading
# it here with sep = " " collapses each row into a single character column, as the
# summary below shows; use sep = "," (or read.csv) and a matching path to parse it correctly
ratings = read.table("data/fake_data.csv",sep=" ",header=TRUE)
summary(ratings)
##  X.ID.Age.Gender.Weight.Height.Income
##  Length:100                          
##  Class :character                    
##  Mode  :character
# from SPSS file
ratings =  read.spss("data/fake_data.sav", to.data.frame=TRUE)
summary(ratings)
##        ID              Age           Gender              Weight     
##  Min.   :  1.00   Min.   :20.00   Length:100         Min.   :45.05  
##  1st Qu.: 25.75   1st Qu.:32.00   Class :character   1st Qu.:62.91  
##  Median : 50.50   Median :44.00   Mode  :character   Median :69.81  
##  Mean   : 50.50   Mean   :45.03                      Mean   :70.58  
##  3rd Qu.: 75.25   3rd Qu.:57.25                      3rd Qu.:78.03  
##  Max.   :100.00   Max.   :70.00                      Max.   :93.75  
##      Height          Income    
##  Min.   :136.9   Min.   :1181  
##  1st Qu.:161.2   1st Qu.:3526  
##  Median :169.8   Median :5779  
##  Mean   :169.1   Mean   :5650  
##  3rd Qu.:176.7   3rd Qu.:7869  
##  Max.   :197.1   Max.   :9937

Save plots as PDF

normal <- rnorm(100,10,10)
t <- rt(100,10)*10
f <- rf(100,10,10)*10

pdf("plot.pdf", width = 8, height = 6, onefile = TRUE, pointsize = 9)
plot(normal,col="blue",pch=78) # pch values 78, 84, 70 plot the letters "N", "T", "F"
points(t,col="red",pch=84)
points(f,col="green",pch=70)
dev.off()
## quartz_off_screen 
##                 2

Output with sink()

# Output with sink() (the layout is aligned)
# change STDOUT
sink("output.txt")
i = 1:10
outer(i, i, "*")
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1    2    3    4    5    6    7    8    9    10
##  [2,]    2    4    6    8   10   12   14   16   18    20
##  [3,]    3    6    9   12   15   18   21   24   27    30
##  [4,]    4    8   12   16   20   24   28   32   36    40
##  [5,]    5   10   15   20   25   30   35   40   45    50
##  [6,]    6   12   18   24   30   36   42   48   54    60
##  [7,]    7   14   21   28   35   42   49   56   63    70
##  [8,]    8   16   24   32   40   48   56   64   72    80
##  [9,]    9   18   27   36   45   54   63   72   81    90
## [10,]   10   20   30   40   50   60   70   80   90   100
unlink("output.txt")

Output with write.table()

# printing dataframe or table
x = data.frame(a="a", b=pi)
# sep can be changed to "\t" or " "
write.table(x, file = "foo.csv", sep = ",", col.names = NA, qmethod = "double")

Data transformation for RM analyses

library(dplyr)
library(tidyr)

# Create and define the data frame
thisData <- data.frame(
    early = c(4, 10, 5, 8, 2),
    mid = c(6, 9, 4, 6, 2),
    late = c(2, 8, 3, 7, 2),
    gender = factor(c(1, 1, 1, 2, 2), levels = c(1, 2), labels = c("male", "female"))
)

# Melting the data with gather from tidyr package
moltenData <- thisData %>% 
    mutate(subject = row_number()) %>%
    gather(key = "time", value = "score", -subject, -gender)

# Recovering the original data format using pivot_wider from tidyr
originalData <- moltenData %>% 
    pivot_wider(names_from = "time", values_from = "score")

# Cross-tabulating the data
crossTabulatedData <- xtabs(score ~ subject + time, data = moltenData)

print(thisData)
##   early mid late gender
## 1     4   6    2   male
## 2    10   9    8   male
## 3     5   4    3   male
## 4     8   6    7 female
## 5     2   2    2 female
print(moltenData)
##    gender subject  time score
## 1    male       1 early     4
## 2    male       2 early    10
## 3    male       3 early     5
## 4  female       4 early     8
## 5  female       5 early     2
## 6    male       1   mid     6
## 7    male       2   mid     9
## 8    male       3   mid     4
## 9  female       4   mid     6
## 10 female       5   mid     2
## 11   male       1  late     2
## 12   male       2  late     8
## 13   male       3  late     3
## 14 female       4  late     7
## 15 female       5  late     2
print(originalData)
## # A tibble: 5 × 5
##   gender subject early   mid  late
##   <fct>    <int> <dbl> <dbl> <dbl>
## 1 male         1     4     6     2
## 2 male         2    10     9     8
## 3 male         3     5     4     3
## 4 female       4     8     6     7
## 5 female       5     2     2     2
print(crossTabulatedData)
##        time
## subject early late mid
##       1     4    2   6
##       2    10    8   9
##       3     5    3   4
##       4     8    7   6
##       5     2    2   2

Using grep/regular expressions in R

library(gdata)

# Create a dummy data set
thisData <- data.frame(
    Q1 = sample(1:5, 10, replace = TRUE),
    Q2 = sample(1:5, 10, replace = TRUE),
    midterm = sample(60:100, 10, replace = TRUE),
    final = sample(60:100, 10, replace = TRUE),
    presentation = sample(60:100, 10, replace = TRUE),
    assignment = sample(60:100, 10, replace = TRUE),
    prescore = sample(60:100, 10, replace = TRUE),
    other = sample(1:5, 10, replace = TRUE)
)
head(thisData)
##   Q1 Q2 midterm final presentation assignment prescore other
## 1  1  2      87    91           86         72       75     4
## 2  1  1      92    66           86        100       61     3
## 3  3  2      91    73           84         70       96     3
## 4  2  3      82    84           87         81       78     2
## 5  1  4      80    62           87         83       72     4
## 6  2  5      98    82           72         71       98     5
pattern <- "Q\\d+|^midterm$|^final$|^pres"
tempIndex <- grep(pattern, colnames(thisData))
newData <- drop.levels(thisData[, tempIndex, drop = FALSE])

# Summarize the new data
head(newData)
##   Q1 Q2 midterm final presentation prescore
## 1  1  2      87    91           86       75
## 2  1  1      92    66           86       61
## 3  3  2      91    73           84       96
## 4  2  3      82    84           87       78
## 5  1  4      80    62           87       72
## 6  2  5      98    82           72       98

Using tables

thisData = read.table("data/Daphnia.txt",header=TRUE)
attach(thisData)
# means of Growth.rate by Detergent
tapply(Growth.rate,Detergent,mean)
##   BrandA   BrandB   BrandC   BrandD 
## 3.884832 4.010044 3.954512 3.558231
# use list() for two-dimensional tables
# tapply(VALUE,list(ROW,COLUMN),FUNCTION)
tapply(Growth.rate,list(Daphnia,Detergent),mean)
##          BrandA   BrandB   BrandC   BrandD
## Clone1 2.732227 2.929140 3.071335 2.626797
## Clone2 3.919002 4.402931 4.772805 5.213745
## Clone3 5.003268 4.698062 4.019397 2.834151
# using an anonymous function
tapply(Growth.rate,list(Daphnia,Detergent), function(x){sqrt(var(x)/length(x))})
##           BrandA    BrandB    BrandC    BrandD
## Clone1 0.2163448 0.2319320 0.3055929 0.1905771
## Clone2 0.4702855 0.3639819 0.5773096 0.5520220
## Clone3 0.2688604 0.2683660 0.5395750 0.4260212
# a three-dimensional table
tapply(Growth.rate,list(Daphnia,Detergent,Water),mean)
## , , Tyne
## 
##          BrandA   BrandB   BrandC   BrandD
## Clone1 2.811265 2.775903 3.287529 2.597192
## Clone2 3.307634 4.191188 3.620532 4.105651
## Clone3 4.866524 4.766258 4.534902 3.365766
## 
## , , Wear
## 
##          BrandA   BrandB   BrandC   BrandD
## Clone1 2.653189 3.082377 2.855142 2.656403
## Clone2 4.530371 4.614673 5.925078 6.321838
## Clone3 5.140011 4.629867 3.503892 2.302537
# and use ftable() to flatten the table
ftable(tapply(Growth.rate,list(Daphnia,Detergent,Water),mean))
##                    Tyne     Wear
##                                 
## Clone1 BrandA  2.811265 2.653189
##        BrandB  2.775903 3.082377
##        BrandC  3.287529 2.855142
##        BrandD  2.597192 2.656403
## Clone2 BrandA  3.307634 4.530371
##        BrandB  4.191188 4.614673
##        BrandC  3.620532 5.925078
##        BrandD  4.105651 6.321838
## Clone3 BrandA  4.866524 5.140011
##        BrandB  4.766258 4.629867
##        BrandC  4.534902 3.503892
##        BrandD  3.365766 2.302537
# the fourth argument is an option for the function
tapply(Growth.rate,list(Daphnia,Detergent),mean,na.rm=TRUE)
##          BrandA   BrandB   BrandC   BrandD
## Clone1 2.732227 2.929140 3.071335 2.626797
## Clone2 3.919002 4.402931 4.772805 5.213745
## Clone3 5.003268 4.698062 4.019397 2.834151
as.data.frame.table(tapply(Growth.rate,list(Daphnia,Detergent),mean))
##      Var1   Var2     Freq
## 1  Clone1 BrandA 2.732227
## 2  Clone2 BrandA 3.919002
## 3  Clone3 BrandA 5.003268
## 4  Clone1 BrandB 2.929140
## 5  Clone2 BrandB 4.402931
## 6  Clone3 BrandB 4.698062
## 7  Clone1 BrandC 3.071335
## 8  Clone2 BrandC 4.772805
## 9  Clone3 BrandC 4.019397
## 10 Clone1 BrandD 2.626797
## 11 Clone2 BrandD 5.213745
## 12 Clone3 BrandD 2.834151
detach(thisData)

Tables (frequency tables)

# Assuming the previous setup of random number generation and assigning genders
cells <- rnbinom(10000, size=0.63, prob=0.63/1.83)
gender <- rep(c("male", "female"), c(5000, 5000))

# Generate and output frequency table for cells
freq_cells <- table(cells)
freq_cells
## cells
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 5161 2105 1149  661  356  188  138  101   42   33   24   19   11    1    2    4 
##   16   18   19   20 
##    2    1    1    1
# Generate and output cross tabulation of cells by gender
freq_cells_gender <- table(cells, gender)
freq_cells_gender
##      gender
## cells female male
##    0    2601 2560
##    1    1055 1050
##    2     551  598
##    3     319  342
##    4     180  176
##    5     101   87
##    6      62   76
##    7      54   47
##    8      22   20
##    9      17   16
##    10     12   12
##    11      9   10
##    12      8    3
##    13      0    1
##    14      2    0
##    15      4    0
##    16      1    1
##    18      1    0
##    19      0    1
##    20      1    0

Probability table

# Creating a matrix of counts
counts <- matrix(c(2, 2, 4, 3, 1, 4, 2, 9, 1, 5, 3, 3), nrow = 4)

# Creating row-wise probability table
rowwise_table <- prop.table(counts, 1)
print("Row-wise probability table:")
## [1] "Row-wise probability table:"
print(rowwise_table)
##           [,1]      [,2]      [,3]
## [1,] 0.5000000 0.2500000 0.2500000
## [2,] 0.1818182 0.3636364 0.4545455
## [3,] 0.4444444 0.2222222 0.3333333
## [4,] 0.2000000 0.6000000 0.2000000
# Creating column-wise probability table
colwise_table <- prop.table(counts, 2)
print("Column-wise probability table:")
## [1] "Column-wise probability table:"
print(colwise_table)
##           [,1]   [,2]       [,3]
## [1,] 0.1818182 0.0625 0.08333333
## [2,] 0.1818182 0.2500 0.41666667
## [3,] 0.3636364 0.1250 0.25000000
## [4,] 0.2727273 0.5625 0.25000000
# Creating table-wise probability table
tablewise_table <- prop.table(counts)
print("Table-wise probability table:")
## [1] "Table-wise probability table:"
print(tablewise_table)
##            [,1]       [,2]       [,3]
## [1,] 0.05128205 0.02564103 0.02564103
## [2,] 0.05128205 0.10256410 0.12820513
## [3,] 0.10256410 0.05128205 0.07692308
## [4,] 0.07692308 0.23076923 0.07692308

Permutations and combinations

# Load the gtools package
library(gtools)

# Define the vector of elements
x <- c("A", "B", "C", "D", "E", "F")

# Define the size of the target group (r)
r <- 4

# Calculate and print permutations of elements
perm <- permutations(n = length(x), r = r, v = x, repeats.allowed = FALSE)
head(perm) # There are 360 permutations of 6 elements taken 4 at a time
##      [,1] [,2] [,3] [,4]
## [1,] "A"  "B"  "C"  "D" 
## [2,] "A"  "B"  "C"  "E" 
## [3,] "A"  "B"  "C"  "F" 
## [4,] "A"  "B"  "D"  "C" 
## [5,] "A"  "B"  "D"  "E" 
## [6,] "A"  "B"  "D"  "F"
# Calculate and print combinations of elements
comb <- combinations(n = length(x), r = r, v = x)
print(comb) # There are 15 combinations of 6 elements taken 4 at a time
##       [,1] [,2] [,3] [,4]
##  [1,] "A"  "B"  "C"  "D" 
##  [2,] "A"  "B"  "C"  "E" 
##  [3,] "A"  "B"  "C"  "F" 
##  [4,] "A"  "B"  "D"  "E" 
##  [5,] "A"  "B"  "D"  "F" 
##  [6,] "A"  "B"  "E"  "F" 
##  [7,] "A"  "C"  "D"  "E" 
##  [8,] "A"  "C"  "D"  "F" 
##  [9,] "A"  "C"  "E"  "F" 
## [10,] "A"  "D"  "E"  "F" 
## [11,] "B"  "C"  "D"  "E" 
## [12,] "B"  "C"  "D"  "F" 
## [13,] "B"  "C"  "E"  "F" 
## [14,] "B"  "D"  "E"  "F" 
## [15,] "C"  "D"  "E"  "F"

Doing Statistics with R

Basic math calculations (W1)

Calculate sum, sumOfSq, sigma, s, and SS

practiceCalculation = function (x) {
    sumX = sum(x)                           # sum of vector x
    sumOfSq = sum(x^2)                      # sum of squares of vector x
    meanX = mean(x)                         # mean of vector x
    SS = sum((meanX-x)^2)                   # sum of squares
    n = length(x)                           # number of observations
    sd = sqrt(SS / n)                       # population standard deviation
    sampleSd = sqrt(SS / (n-1))             # sample standard deviation
    seOfMean = sd / sqrt(n)                 # standard error of the mean (uses the population SD; describe() below uses the sample SD)
    return(list(sum=sumX, sumOfSq=sumOfSq, SS=SS, sd=sd, sampleSd=sampleSd, seOfMean=seOfMean))
}

x1 = c(25.0, 5.0, 76.0, 43.54, 43.34, 154, 64.34, 54.6, 76, 3)
calcResults = practiceCalculation(x1)
calcResults
## $sum
## [1] 544.82
## 
## $sumOfSq
## [1] 46821.88
## 
## $SS
## [1] 17139
## 
## $sd
## [1] 41.39927
## 
## $sampleSd
## [1] 43.63867
## 
## $seOfMean
## [1] 13.0916
# Built-in functions and the psych package can further describe the data
installationStatus <- require(psych)
if (!installationStatus) {
    install.packages("psych")
    library(psych)
}

describe(x1)
##    vars  n  mean    sd median trimmed   mad min max range skew kurtosis   se
## X1    1 10 54.48 43.64  49.07   48.48 37.81   3 154   151 0.87      0.1 13.8
sum(x1)
## [1] 544.82
sd(x1)
## [1] 43.63867

Random variables

x = c(1,1,1,3,4,5)

# Create a histogram for continuous-like data.
hist(x, main="Histogram of x", xlab="Value", ylab="Frequency", col="blue")

# Create a barplot for discrete data.
# Count the number of occurrences of each unique value
counts <- table(x)
barplot(counts, main="Barplot of x", xlab="Value", ylab="Frequency", col="red")

# Create a histogram with densities instead of frequencies
hist(x, freq=FALSE, main="Histogram of x with Density", xlab="Value", ylab="Density", col="green")

Measures of central tendency

x = c(rep(letters[1:5],3),rep(letters[6:10],4),rep(letters[11],8))

# Calculate mode
getMode <- function(v) {
    vTable <- table(v)
    vMode <- names(vTable)[vTable == max(vTable)]
    if(length(vMode) == 0) {
        return(NA)    # In case of no mode
    }
    return(vMode)
}

# Display mode
mode_x <- getMode(x)
# Create a numeric vector for summary
x_numeric <- c(1,3,3,5)

# Calculate mean
mean_x <- mean(x_numeric)

# Calculate sum of squares (SS)
ss_x <- sum((x_numeric - mean_x)^2)

# Calculate variance (sigma^2 = SS/n, the population variance)
variance_x <- ss_x / length(x_numeric)

# Calculate standard deviation (sigma)
sd_x <- sd(x_numeric)

# Use psych package for descriptive statistics
library(psych)
psych_desc <- describe(x_numeric)

# Output results
list(mode = mode_x, mean = mean_x, ss = ss_x, variance = variance_x, sd = sd_x, psych_desc = psych_desc)
## $mode
## [1] "k"
## 
## $mean
## [1] 3
## 
## $ss
## [1] 8
## 
## $variance
## [1] 2
## 
## $sd
## [1] 1.632993
## 
## $psych_desc
##    vars n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 4    3 1.63      3       3 1.48   1   5     4    0    -1.88 0.82

Built-in distributions

Distribution names and their R abbreviations (used with the d/p/q/r prefixes):

Beta            beta     Log-normal           lnorm
Binomial        binom    Negative binomial    nbinom
Cauchy          cauchy   Normal               norm
Chi-square      chisq    Poisson              pois
Exponential     exp      Student t            t
Fisher F        f        Uniform              unif
Gamma           gamma    Tukey (stud. range)  tukey
Geometric       geom     Weibull              weibull
Hypergeometric  hyper    Wilcoxon             wilcox
Logistic        logis    -                    -

name    description                       usage
dname() density or probability function   dnorm(0) == 1/sqrt(2*pi); dnorm(1) == exp(-1/2)/sqrt(2*pi) == 1/sqrt(2*pi*exp(1))
pname() cumulative distribution function  pnorm(-1.63) = 0.05155 (the area under the standard normal curve to the left of -1.63)
qname() quantile function                 qnorm(0.9) = 1.2815 (1.2815 is the 90th percentile of the standard normal distribution)
rname() random deviates                   rnorm(100, mean=50, sd=10) (generates 100 random deviates from a normal distribution with the given parameters)

Distribution p (probability) function q (quantile) function d (density) function r (random) function
Beta pbeta qbeta dbeta rbeta
Binomial pbinom qbinom dbinom rbinom
Cauchy pcauchy qcauchy dcauchy rcauchy
Chi-Square pchisq qchisq dchisq rchisq
Exponential pexp qexp dexp rexp
F pf qf df rf
Gamma pgamma qgamma dgamma rgamma
Geometric pgeom qgeom dgeom rgeom
Hypergeometric phyper qhyper dhyper rhyper
Logistic plogis qlogis dlogis rlogis
Log Normal plnorm qlnorm dlnorm rlnorm
Negative Binomial pnbinom qnbinom dnbinom rnbinom
Normal pnorm qnorm dnorm rnorm
Poisson ppois qpois dpois rpois
Student t pt qt dt rt
Studentized Range ptukey qtukey dtukey rtukey
Uniform punif qunif dunif runif
Weibull pweibull qweibull dweibull rweibull
Wilcoxon Rank Sum Statistic pwilcox qwilcox dwilcox rwilcox
Wilcoxon Signed Rank Statistic psignrank qsignrank dsignrank rsignrank

Probability distributions in R

# http://www.statmethods.net/advgraphs/probability.html
pnorm(-2.58)
## [1] 0.004940016
pnorm(-1.63)
## [1] 0.05155075
pnorm(1.63)
## [1] 0.9484493
pnorm(2.58)
## [1] 0.99506
qnorm(0.005)
## [1] -2.575829
qnorm(0.025)
## [1] -1.959964
qnorm(0.975)
## [1] 1.959964
qnorm(0.995)
## [1] 2.575829
# mu = 100, sigma = 15, select one individual who is larger than 118
mu = 100; sigma = 15; n = 1
se = sigma/sqrt(n)
z = (118-100)/se
1 - pnorm(z)
## [1] 0.1150697
# mu = 100, sigma = 15, select one individual between 79 and 94
mu = 100; sigma = 15; n = 1
se = sigma/sqrt(n)
z_low = (79-100)/se
z_high = (94-100)/se
pnorm(z_high) - pnorm(z_low)
## [1] 0.2638216
# mu = 100, sigma = 15, select 4 samples whose mean is larger than 118
mu = 100; sigma = 15; n = 4
se = sigma/sqrt(n)
# z-score for sample means
z = (118-100)/se
1 - pnorm(z)
## [1] 0.008197536
# mu = 100, sigma = 15, conf interval of 36 samples with 95% confidence
mu = 100; sigma = 15; n = 36
se = sigma/sqrt(n)
z_low = qnorm(0.025)
z_high = qnorm(0.975)
# x = z*se + mu
z_low*se + mu
## [1] 95.10009
z_high*se + mu
## [1] 104.8999

One-sample t-test (Week 2)

Assumptions of t-test

  • Variable type: The response variable is continuous, and the predictor variable is categorical.
  • Independence of sampling: Samples are independently drawn from the population.
  • Normality: The population from which samples are drawn is normally distributed.

Demonstrating t-distributions

# From Quick-R http://www.statmethods.net
# Display the Student's t distributions with various degrees of freedom and compare to the normal distribution

x = seq(-4, 4, length=100)
hx = dnorm(x)
degf = c(1, 3, 8, 30)
colors = c("red", "blue", "darkgreen", "gold")
labels = c("df=1", "df=3", "df=8", "df=30")

plot(x, hx, type="l", lty=2, xlab="x value", ylab="Density", 
     main="Comparison of t Distributions")
for (i in 1:4){
    lines(x, dt(x, degf[i]), lwd=2, col=colors[i])
}
legend("topright", inset=.05, title="Distributions", c(labels, "normal"), 
       lwd=2, lty=c(1, 1, 1, 1, 2), col=c(colors, "black"))

Hypothesis test of \(\mu\)

# Given: mu = 50, sigma = 10, n = 10, x_bar = 58
mu = 50; sigma = 10; n = 10; x_bar = 58
se = sigma / sqrt(n)
z_low = qnorm(0.025)
z_high = qnorm(0.975)
# Confidence interval: x_bar ± z*se
lower = x_bar + z_low * se
upper = x_bar + z_high * se
c(lower, upper)
## [1] 51.80205 64.19795
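
The same quantities give the z statistic and a two-tailed p-value for the test of H0: mu = 50; a minimal sketch reusing mu, x_bar, and se from the block above:

z = (x_bar - mu) / se
z
# two-tailed p-value
2 * (1 - pnorm(abs(z)))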

Biased and unbiased estimations of variance

# Function to calculate sample variance (unbiased estimator)
s2 = function(x) { sum((x - mean(x))^2) / (length(x) - 1) }
# Function to calculate population variance (biased estimator)
variance = function(x) { sum((x - mean(x))^2) / length(x) }

x = c(2, 4, 6, 8)
library(gtools)
sample_size = 2
perm = permutations(length(x), sample_size, x, repeats.allowed=TRUE)

# Calculating variance for all permutations
thisData = cbind(perm, apply(perm, 1, s2), apply(perm, 1, variance))
colnames(thisData) = c("X_1", "X_2", "sample_var", "population_var")
thisData = as.data.frame(thisData)
thisData
##    X_1 X_2 sample_var population_var
## 1    2   2          0              0
## 2    2   4          2              1
## 3    2   6          8              4
## 4    2   8         18              9
## 5    4   2          2              1
## 6    4   4          0              0
## 7    4   6          2              1
## 8    4   8          8              4
## 9    6   2          8              4
## 10   6   4          2              1
## 11   6   6          0              0
## 12   6   8          2              1
## 13   8   2         18              9
## 14   8   4          8              4
## 15   8   6          2              1
## 16   8   8          0              0
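
Averaging each estimator over the 16 equally likely samples shows why dividing by n-1 is called unbiased: the sample variances average to the population variance of x (SS/N = 20/4 = 5), while the n-divided estimator averages to only 2.5.

# mean of each estimator across all equally likely samples of size 2
colMeans(thisData[, c("sample_var", "population_var")])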

Critical values of t

dfs = c(5, 10, 20, 40, 80, 1000)

alpha_0.05 = qt(0.975, dfs)  # Two-tailed test, alpha = 0.05
alpha_0.01 = qt(0.995, dfs)  # Two-tailed test, alpha = 0.01

thisData = data.frame(df = dfs, alpha_0.05 = alpha_0.05, alpha_0.01 = alpha_0.01)
thisData
##     df alpha_0.05 alpha_0.01
## 1    5   2.570582   4.032143
## 2   10   2.228139   3.169273
## 3   20   2.085963   2.845340
## 4   40   2.021075   2.704459
## 5   80   1.990063   2.638691
## 6 1000   1.962339   2.580755

Estimating \(\mu\) with sample variance

# Given: mu = 50, n = 100, x_bar = 54, s = 19
mu = 50; n = 100; x_bar = 54; s = 19
se = s / sqrt(n)
t_value = qt(0.975, n - 1)
# Confidence interval: x_bar ± t*se
lower = x_bar - t_value * se
upper = x_bar + t_value * se
c(lower, upper)
## [1] 50.22999 57.77001

Correlation and Regression (Week 3)

Positive and negative correlations

set.seed(123)
par(mfrow=c(1,2))

# Positive relationship
x = rnorm(100, mean=100, sd=10)
y_pos = jitter(x, factor=10, amount=10)
plot(x, y_pos, main="Positive Relationship", xlab="X", ylab="Y")
abline(lm(y_pos ~ x), col="red")

# Negative relationship
y_neg = jitter(-x, factor=10, amount=10)
plot(x, y_neg, main="Negative Relationship", xlab="X", ylab="Y")
abline(lm(y_neg ~ x), col="red")

Demonstration of correlation \(r\) (hand calculation)

# Data
x = seq(1, 6)
y = c(0.440, 0.515, 0.550, 0.575, 0.605, 0.615)

# Calculations
cov_xy = sum((x - mean(x)) * (y - mean(y))) / (length(x) - 1)
s_x = sd(x)
s_y = sd(y)
r = cov_xy / (s_x * s_y)

# Output
cov_xy
## [1] 0.117
s_x
## [1] 1.870829
s_y
## [1] 0.06511528
r
## [1] 0.9604371

Demonstration of correlation \(r\) (built-in function)

x = seq(1, 6)
y = c(0.440, 0.515, 0.550, 0.575, 0.605, 0.615)

# Correlation test
cor.test(x, y)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 6.8973, df = 4, p-value = 0.002317
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6750314 0.9958104
## sample estimates:
##       cor 
## 0.9604371
# Descriptive statistics
library(psych)
describe(cbind(x, y))
##   vars n mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## x    1 6 3.50 1.87   3.50    3.50 2.22 1.00 6.00  5.00  0.00    -1.80 0.76
## y    2 6 0.55 0.07   0.56    0.55 0.07 0.44 0.62  0.17 -0.55    -1.37 0.03

Demonstration of regression (built-in function)

x = seq(1, 6)
y = c(0.440, 0.515, 0.550, 0.575, 0.605, 0.615)

# Linear model
fit = lm(y ~ x)
summary(fit)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##         1         2         3         4         5         6 
## -0.026429  0.015143  0.016714  0.008286  0.004857 -0.018571 
## 
## Coefficients:
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 0.433000   0.018875  22.941 0.0000214 ***
## x           0.033429   0.004847   6.897   0.00232 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02027 on 4 degrees of freedom
## Multiple R-squared:  0.9224, Adjusted R-squared:  0.903 
## F-statistic: 47.57 on 1 and 4 DF,  p-value: 0.002317
# Descriptive statistics
library(psych)
describe(cbind(x, y))
##   vars n mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## x    1 6 3.50 1.87   3.50    3.50 2.22 1.00 6.00  5.00  0.00    -1.80 0.76
## y    2 6 0.55 0.07   0.56    0.55 0.07 0.44 0.62  0.17 -0.55    -1.37 0.03

Producing correlation matrix

# Generating data
set.seed(123)
thisData = data.frame(matrix(nrow=6, ncol=7))
for (i in 1:7) {
    x = sort(rnorm(1000, mean=100, sd=i))
    thisData[[i]] <- sample(x, 6)
}

# Correlation matrix
cor_matrix = cor(thisData)
cor_matrix
##             X1         X2         X3         X4         X5          X6
## X1  1.00000000  0.3346496 -0.1396935 -0.2662075 -0.1443629 -0.01556647
## X2  0.33464956  1.0000000 -0.5594597  0.3145060  0.4879924  0.61960513
## X3 -0.13969355 -0.5594597  1.0000000 -0.3192769  0.3309353 -0.30697962
## X4 -0.26620746  0.3145060 -0.3192769  1.0000000  0.3812170 -0.30828010
## X5 -0.14436287  0.4879924  0.3309353  0.3812170  1.0000000  0.30524693
## X6 -0.01556647  0.6196051 -0.3069796 -0.3082801  0.3052469  1.00000000
## X7 -0.17853580 -0.2479134  0.2468131 -0.8615435 -0.1956732  0.56850026
##            X7
## X1 -0.1785358
## X2 -0.2479134
## X3  0.2468131
## X4 -0.8615435
## X5 -0.1956732
## X6  0.5685003
## X7  1.0000000
# Computing p-value for one correlation
cor.test(thisData[[1]], thisData[[2]])
## 
##  Pearson's product-moment correlation
## 
## data:  thisData[[1]] and thisData[[2]]
## t = 0.71025, df = 4, p-value = 0.5168
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6547285  0.9014007
## sample estimates:
##       cor 
## 0.3346496
# Function to compute p-values for correlation matrix
cor.prob <- function(X, dfr = nrow(X) - 2) {
  R <- cor(X)
  above <- row(R) < col(R)
  r2 <- R[above]^2
  Fstat <- r2 * dfr / (1 - r2)
  R[above] <- 1 - pf(Fstat, 1, dfr)
  R[!above] <- NA
  diag(R) <- NA
  R
}

# Correlation p-values
cor_pvalues = cor.prob(thisData)
cor_pvalues
##    X1        X2        X3        X4        X5        X6         X7
## X1 NA 0.5167644 0.7918227 0.6101214 0.7849600 0.9766522 0.73504171
## X2 NA        NA 0.2483645 0.5437955 0.3261158 0.1895288 0.63574843
## X3 NA        NA        NA 0.5373579 0.5217187 0.5539949 0.63729788
## X4 NA        NA        NA        NA 0.4558750 0.5522288 0.02742819
## X5 NA        NA        NA        NA        NA 0.5563504 0.71023618
## X6 NA        NA        NA        NA        NA        NA 0.23911713
## X7 NA        NA        NA        NA        NA        NA         NA

Comparing Two Means (t-test) (Week 4)

Assumptions of t-test

  • Variable type: The response variable is continuous, and the predictor variable is categorical.
  • Independence of sampling: Samples are independently drawn from the populations.
  • Normality: Populations from which samples are drawn are normally distributed.
  • Homogeneity of variance: The variances of the populations are equal (quick checks of these assumptions are sketched below).
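
With samples this small such tests have little power, but shapiro.test() and var.test() give a quick look at the normality and equal-variance assumptions; a minimal sketch using the two-group data from later in this section:

x = c(7, 5, 6, 8, 4, 5, 4, 6)
y = c(3, 5, 4, 6, 2)

# normality of each sample
shapiro.test(x)
shapiro.test(y)
# F-test for equality of the two variances
var.test(x, y)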

Dependent t-test (hand calculation)

# Data
Subject = c("A", "B", "C", "D", "E", "F")
Unprimed = c(550, 1030, 760, 600, 940, 820)
Primed = c(510, 980, 770, 600, 870, 760)

# Calculations
Difference = Unprimed - Primed
mean_diff = mean(Difference)
sd_diff = sd(Difference)
n = length(Difference)
se = sd_diff / sqrt(n)
t_value = (mean_diff - 0) / se
df = n - 1
p_value = 2 * pt(-abs(t_value), df)

# Output
mean_diff
## [1] 35
sd_diff
## [1] 32.71085
t_value
## [1] 2.620908
df
## [1] 5
p_value
## [1] 0.0470462

Dependent t-test (built-in function)

x = c(550, 1030, 760, 600, 940, 820)
y = c(510, 980, 770, 600, 870, 760)

# Paired t-test
t.test(x, y, paired=TRUE)
## 
##  Paired t-test
## 
## data:  x and y
## t = 2.6209, df = 5, p-value = 0.04705
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##   0.6720635 69.3279365
## sample estimates:
## mean difference 
##              35
# Descriptive statistics
library(psych)
describe(cbind(x, y))
##   vars n   mean     sd median trimmed    mad min  max range  skew kurtosis
## x    1 6 783.33 187.26    790  783.33 252.04 550 1030   480  0.00    -1.86
## y    2 6 748.33 171.98    765  748.33 200.15 510  980   470 -0.08    -1.70
##      se
## x 76.45
## y 70.21

Problems with repeated-measures (within-subjects) research designs

  • Order effects: Subjects may improve or become fatigued over time.
    • Solution: Counterbalancing (half the subjects receive condition A first, the other half condition B first), or intermixing trials.
  • Carryover effects: The effect of the first treatment may affect the second treatment.
  • Difficulty in matching: It may be challenging to match subjects across mutually exclusive categories.

Variance Sum Law

  • The variance of the sum or difference of two independent variables is equal to the sum of their variances.

  • For independent samples t-test:

    \[ \text{var}(\bar{x}_1 - \bar{x}_2) = \frac{s_1^2}{n_1} + \frac{s_2^2}{n_2} \]

  • Standard error of the difference:

    \[ \text{SE}(\bar{x}_1 - \bar{x}_2) = \sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}} \]
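
A quick numerical check of the variance sum law and the standard error of a difference; this is only an illustrative simulation:

set.seed(1)
a = rnorm(10000, mean = 0, sd = 3)   # var(a) is about 9
b = rnorm(10000, mean = 0, sd = 4)   # var(b) is about 16
var(a - b)                           # close to 9 + 16 = 25 for independent a and b

# standard error of the difference between two independent sample means
x1 = c(550, 1030, 760, 600, 940, 820)
x2 = c(510, 980, 770, 600, 870, 760)
sqrt(var(x1)/length(x1) + var(x2)/length(x2))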

Comparison between independent and dependent t-tests

x = c(550, 1030, 760, 600, 940, 820)
y = c(510, 980, 770, 600, 870, 760)

# Independent t-test
t.test(x, y, paired=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 0.3372, df = 9.9284, p-value = 0.743
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -196.5012  266.5012
## sample estimates:
## mean of x mean of y 
##  783.3333  748.3333
# Dependent t-test
t.test(x, y, paired=TRUE)
## 
##  Paired t-test
## 
## data:  x and y
## t = 2.6209, df = 5, p-value = 0.04705
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##   0.6720635 69.3279365
## sample estimates:
## mean difference 
##              35
# Descriptive statistics
library(psych)
describe(cbind(x, y))
##   vars n   mean     sd median trimmed    mad min  max range  skew kurtosis
## x    1 6 783.33 187.26    790  783.33 252.04 550 1030   480  0.00    -1.86
## y    2 6 748.33 171.98    765  748.33 200.15 510  980   470 -0.08    -1.70
##      se
## x 76.45
## y 70.21

Independent t-test equals point-biserial correlation

# Data
x = c(rep(0, 6), rep(1, 6))
y = c(55, 73, 76, 66, 72, 81, 71, 93, 77, 89, 101, 97)

# Linear model
model = lm(y ~ x)
summary(model)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.000  -6.125   2.000   6.375  13.000 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   70.500      4.273  16.499  1.4e-08 ***
## x             17.500      6.043   2.896   0.0159 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.47 on 10 degrees of freedom
## Multiple R-squared:  0.4561, Adjusted R-squared:  0.4017 
## F-statistic: 8.387 on 1 and 10 DF,  p-value: 0.01594
# Independent t-test
t.test(y ~ x, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  y by x
## t = -2.896, df = 10, p-value = 0.01594
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -30.964425  -4.035575
## sample estimates:
## mean in group 0 mean in group 1 
##            70.5            88.0

Graphics for t-test

# If the original 'das.txt' file is available, it can be read in directly:
# data = read.table("data/das.txt", header=TRUE)

# Here we simulate data similar to 'das.txt' instead
set.seed(123)
data = data.frame(y = rnorm(100))

par(mfrow=c(2,2))
plot(data$y, main="Scatter Plot of Y", ylab="Y", xlab="Index")
boxplot(data$y, main="Boxplot of Y", ylab="Y")
hist(data$y, main="Histogram of Y", xlab="Y", breaks=10)

# Introduce an outlier
data$y2 = data$y
data$y2[52] = 21.75
plot(data$y2, main="Scatter Plot with Outlier", ylab="Y", xlab="Index")

Test for normality (Shapiro-Wilk test)

# Generating non-normal data
x = exp(rnorm(30))
# Shapiro-Wilk test
shapiro.test(x)
## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.57719, p-value = 3.888e-08

One-sample t-test

y = c(1, -2, 2, 1, -4, 5, -8, -5, 2, 5, 6)

# Test whether mean of y equals 2
t.test(y, mu=2)
## 
##  One Sample t-test
## 
## data:  y
## t = -1.2678, df = 10, p-value = 0.2336
## alternative hypothesis: true mean is not equal to 2
## 95 percent confidence interval:
##  -2.762942  3.308396
## sample estimates:
## mean of x 
## 0.2727273

Independent two-group t-test (numeric variable and grouping factor)

# Data
x = factor(c(rep(1, 8), rep(2, 5)), labels = c("A", "B"))
y = c(7, 5, 6, 8, 4, 5, 4, 6, 3, 5, 4, 6, 2)

# Independent t-test
t.test(y ~ x)
## 
##  Welch Two Sample t-test
## 
## data:  y by x
## t = 1.8792, df = 7.8456, p-value = 0.09775
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -0.3759343  3.6259343
## sample estimates:
## mean in group A mean in group B 
##           5.625           4.000

Independent two-group t-test (two numeric vectors)

x = c(7, 5, 6, 8, 4, 5, 4, 6)
y = c(3, 5, 4, 6, 2)

# Independent t-test
t.test(x, y)
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 1.8792, df = 7.8456, p-value = 0.09775
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3759343  3.6259343
## sample estimates:
## mean of x mean of y 
##     5.625     4.000

Paired/Dependent t-test (two numeric vectors)

x = c(4, 5, 6, 4, 5, 6, 6, 4)
y = c(2, 1, 4, 5, 6, 9, 2, 2)

# Paired t-test
t.test(x, y, paired=TRUE)
## 
##  Paired t-test
## 
## data:  x and y
## t = 1.2567, df = 7, p-value = 0.2492
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.9917538  3.2417538
## sample estimates:
## mean difference 
##           1.125

More on Correlation (Week 5)

Assumptions of Correlations

  • Variable type: Both variables are continuous.
  • Normality: Both variables are approximately normally distributed.
  • Linearity: The relationship between the variables is linear.
  • Homoscedasticity: The variance of the error terms is constant at all levels.

Comparing two correlations

# Function to compare two correlation coefficients
compcorr = function(n1, r1, n2, r2){
    # Fisher's Z-transform
    zf1 = 0.5 * log((1 + r1) / (1 - r1))
    zf2 = 0.5 * log((1 + r2) / (1 - r2))
    
    # Difference between Z-transformed correlations
    dz = (zf1 - zf2) / sqrt(1 / (n1 - 3) + 1 / (n2 - 3))
    
    # p-value (two-tailed)
    p_value = 2 * (1 - pnorm(abs(dz)))
    return(list(diff = dz, p_value = p_value))
}

# Example data
r_male = -0.506; n_male = 52
r_female = -0.381; n_female = 51

compcorr(n_male, r_male, n_female, r_female)
## $diff
## [1] -0.7687093
## 
## $p_value
## [1] 0.4420659

Partial and semi-partial correlations

# Partial correlation function
partialCor = function(x, y, z){
    r_xy = cor(x, y)
    r_xz = cor(x, z)
    r_yz = cor(y, z)
    numerator = r_xy - r_xz * r_yz
    denominator = sqrt((1 - r_xz^2) * (1 - r_yz^2))
    partial_corr = numerator / denominator
    return(partial_corr)
}

# Semi-partial correlation function
semipartialCor = function(x, y, z){
    r_xy = cor(x, y)
    r_xz = cor(x, z)
    numerator = r_xy - cor(z, y) * r_xz
    denominator = sqrt(1 - r_xz^2)
    semipartial_corr = numerator / denominator
    return(semipartial_corr)
}

# Example data
child = letters[1:8]
shoe_size = c(5.2, 4.7, 7.0, 5.8, 7.2, 6.9, 7.7, 8.0)
reading_level = c(1.7, 1.5, 2.7, 3.1, 3.9, 4.5, 5.1, 7.4)
age = c(5, 6, 7, 8, 9, 10, 11, 12)

# Correlation matrix
library(Hmisc)
rcorr_matrix = rcorr(cbind(reading_level, age, shoe_size))
rcorr_matrix$r  # Correlation coefficients
##               reading_level       age shoe_size
## reading_level     1.0000000 0.9603441 0.8699218
## age               0.9603441 1.0000000 0.8719177
## shoe_size         0.8699218 0.8719177 1.0000000
# Partial correlation between reading_level and shoe_size controlling for age
partial_correlation = partialCor(reading_level, shoe_size, age)
partial_correlation
## [1] 0.2386455
# Semi-partial correlation
semipartial_correlation = semipartialCor(reading_level, shoe_size, age)
semipartial_correlation
## [1] 0.1168533

Multiple Regressions (Week 6)

Assumptions of Multiple Regression

  • Variable type: Outcome variable is continuous, predictors are continuous or dichotomous.
  • Non-zero variance: Predictors have variation in their values.
  • Linearity: Relationship between predictors and outcome is linear.
  • Independence: Observations are independent.
  • No multicollinearity: Predictors are not highly correlated.
  • Homoscedasticity: Constant variance of errors.
  • Independent errors: Errors are uncorrelated.
  • Normally-distributed errors: Errors are normally distributed.

Computing Sum of Squares (SS)

# The same data can be read from 'mtcars.txt' if available, or taken from R's built-in mtcars dataset:
# thisData = read.table("data/mtcars.txt", header=TRUE)
data(mtcars)
thisData = mtcars

# Total Sum of Squares (SST)
SST_manual = sum((thisData$disp - mean(thisData$disp))^2)
SST_formula = deviance(lm(disp ~ 1, data=thisData))
SST_manual
## [1] 476184.8
SST_formula
## [1] 476184.8
# Sum of Squares Error (SSE)
SSE = deviance(lm(disp ~ cyl, data=thisData))
SSE
## [1] 88730.7
# Linear model with intercept
fit1 = lm(disp ~ cyl, data=thisData)
summary(fit1)
## 
## Call:
## lm(formula = disp ~ cyl, data = thisData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -73.985 -45.233   3.565  26.688 127.818 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -156.609     35.181  -4.452 0.000109 ***
## cyl           62.599      5.469  11.445  1.8e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54.38 on 30 degrees of freedom
## Multiple R-squared:  0.8137, Adjusted R-squared:  0.8075 
## F-statistic:   131 on 1 and 30 DF,  p-value: 1.803e-12
# Linear model without intercept
fit2 = lm(disp ~ cyl - 1, data=thisData)
summary(fit2)
## 
## Call:
## lm(formula = disp ~ cyl - 1, data = thisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90.07 -67.47 -36.06  26.34 158.57 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## cyl   39.179      1.895   20.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.94 on 31 degrees of freedom
## Multiple R-squared:  0.9324, Adjusted R-squared:  0.9302 
## F-statistic: 427.6 on 1 and 31 DF,  p-value: < 2.2e-16

Plots for multiple regression

# If the original 'ozone.data.txt' file is available, it can be read in directly:
# ozone.pollution = read.table("data/ozone.data.txt", header=TRUE)

# Simulated ozone pollution data
set.seed(123)
ozone.pollution = data.frame(
  ozone = rnorm(100, mean=50, sd=10),
  rad = rnorm(100, mean=200, sd=50),
  temp = rnorm(100, mean=70, sd=10),
  wind = rnorm(100, mean=10, sd=2)
)

attach(ozone.pollution)

# Scatterplot matrix
pairs(ozone.pollution, panel=panel.smooth)

# Generalized additive model (GAM)
library(mgcv)
model = gam(ozone ~ s(rad) + s(temp) + s(wind))

# Plot GAM
par(mfrow=c(2,2))
plot(model)

# Decision tree
library(tree)
## Warning: package 'tree' was built under R version 4.3.3
tree_model = tree(ozone ~ ., data=ozone.pollution)
plot(tree_model)
text(tree_model, pretty=0)

detach(ozone.pollution)

Beta coefficients (standardized slopes)

# Given correlations
r_1y = 0.4; r_2y = 0.3; r_12 = 0.2

# Calculating standardized beta coefficients
B_1 = (r_1y - r_2y * r_12) / (1 - r_12^2)
B_2 = (r_2y - r_1y * r_12) / (1 - r_12^2)
B_1
## [1] 0.3541667
B_2
## [1] 0.2291667
# Calculating R and R^2
R = sqrt(B_1 * r_1y + B_2 * r_2y)
R
## [1] 0.458712
R^2
## [1] 0.2104167

Complementarity and Suppression

Complementarity

# Predictors positively correlated with Y but negatively correlated with each other
r_1y = 0.4; r_2y = 0.3; r_12 = -0.2
B_1 = (r_1y - r_2y * r_12) / (1 - r_12^2)
B_2 = (r_2y - r_1y * r_12) / (1 - r_12^2)
B_1
## [1] 0.4791667
B_2
## [1] 0.3958333

Suppression

# Predictor 2 is not correlated with Y but correlated with Predictor 1
r_1y = 0.4; r_2y = 0; r_12 = 0.5
B_1 = (r_1y - r_2y * r_12) / (1 - r_12^2)
B_2 = (r_2y - r_1y * r_12) / (1 - r_12^2)
B_1
## [1] 0.5333333
B_2
## [1] -0.2666667

Dummy Data (mtcars)

# Loading mtcars dataset
data(mtcars)
thisData = mtcars

Data visualization

attach(thisData)
par(mfrow=c(2,2))

# Model with one predictor
plot(hp, disp, main="Model with One Predictor", ylab="Displacement (cu.in.)", xlab="Gross horsepower")
abline(lm(disp ~ hp), col="red")

# Model with two predictors (3D scatter plot)
library(scatterplot3d)
s3d = scatterplot3d(hp, wt, disp, pch=16, highlight.3d=TRUE, type="h",
                    main="Model with Two Predictors", ylab="Weight (lb/1000)",
                    xlab="Gross horsepower", zlab="Displacement (cu.in.)")
fit = lm(disp ~ hp + wt)
s3d$plane3d(fit)

# Diagnostic plots
plot(fit)

detach(thisData)

Methods of multiple regression

  • Hierarchical: Known predictors are entered into the model first, followed by new predictors (see the sketch below).
  • Forced entry: All variables are entered into the model simultaneously.
  • Stepwise: Variables are entered into the model based on statistical criteria.
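
The stepwise approach is worked through below; for the hierarchical and forced-entry strategies, a minimal sketch with the mtcars data (the particular predictors are chosen only for illustration):

data(mtcars)
thisData = mtcars

# Forced entry: all predictors of interest entered at once
fit_forced = lm(disp ~ hp + wt + cyl, data = thisData)

# Hierarchical: enter the known predictors first, then add the new block
# and test whether it improves the fit (F-test for the change in R^2)
fit_known = lm(disp ~ hp + wt, data = thisData)
fit_full  = update(fit_known, . ~ . + cyl + gear)
anova(fit_known, fit_full)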

Example: Stepwise Regression

data(mtcars)
thisData = mtcars

# Initial model
fit_initial = lm(disp ~ mpg + cyl + hp + drat + wt + qsec + gear, data=thisData)

# Stepwise regression (both directions)
library(MASS)
stepwise_model = stepAIC(fit_initial, direction="both")
## Start:  AIC=242.45
## disp ~ mpg + cyl + hp + drat + wt + qsec + gear
## 
##        Df Sum of Sq   RSS    AIC
## - drat  1     142.5 38024 240.57
## - cyl   1    1678.2 39560 241.84
## <none>              37881 242.45
## - qsec  1    2645.1 40527 242.61
## - mpg   1    2674.4 40556 242.63
## - hp    1    4247.6 42129 243.85
## - gear  1    5179.1 43061 244.55
## - wt    1   23343.6 61225 255.81
## 
## Step:  AIC=240.57
## disp ~ mpg + cyl + hp + wt + qsec + gear
## 
##        Df Sum of Sq   RSS    AIC
## - cyl   1    2197.3 40221 240.37
## <none>              38024 240.57
## - qsec  1    2507.7 40532 240.61
## - mpg   1    2565.0 40589 240.66
## - hp    1    4189.2 42213 241.91
## + drat  1     142.5 37881 242.45
## - gear  1    6027.8 44052 243.28
## - wt    1   23416.5 61441 253.92
## 
## Step:  AIC=240.37
## disp ~ mpg + hp + wt + qsec + gear
## 
##        Df Sum of Sq   RSS    AIC
## - mpg   1    2108.9 42330 240.00
## <none>              40221 240.37
## + cyl   1    2197.3 38024 240.57
## + drat  1     661.6 39560 241.84
## - hp    1    7854.7 48076 244.07
## - qsec  1    8990.9 49212 244.82
## - gear  1   19274.5 59496 250.89
## - wt    1   31014.4 71236 256.66
## 
## Step:  AIC=240
## disp ~ hp + wt + qsec + gear
## 
##        Df Sum of Sq   RSS    AIC
## <none>              42330 240.00
## + mpg   1      2109 40221 240.37
## + cyl   1      1741 40589 240.66
## + drat  1       349 41981 241.74
## - hp    1      6275 48605 242.42
## - qsec  1      7434 49764 243.18
## - gear  1     17301 59632 248.97
## - wt    1     35075 77405 257.31
stepwise_model$anova  # Display the steps
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## disp ~ mpg + cyl + hp + drat + wt + qsec + gear
## 
## Final Model:
## disp ~ hp + wt + qsec + gear
## 
## 
##     Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                            24   37881.49 242.4474
## 2 - drat  1  142.4784        25   38023.97 240.5676
## 3  - cyl  1 2197.3139        26   40221.29 240.3653
## 4  - mpg  1 2108.8563        27   42330.14 240.0006

Some diagnostic tests for multiple regression

  • Outliers: Cases with standardized residuals beyond ±2 or ±3.
  • Cook’s distance: Values larger than 1 may indicate influential observations.
  • Leverage values: High leverage points may unduly influence the model.
  • Mahalanobis distance: Values greater than the critical chi-square value may indicate multivariate outliers.
  • Variance Inflation Factor (VIF): VIF > 10 indicates multicollinearity concerns. (These checks are sketched in the code below.)
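
A minimal sketch of these diagnostics, reusing the final stepwise model from above (the car package, assumed to be installed, provides vif()):

data(mtcars)
fit = lm(disp ~ hp + wt + qsec + gear, data = mtcars)

# outliers: standardized residuals beyond +/-2
which(abs(rstandard(fit)) > 2)
# influential cases: Cook's distance larger than 1
which(cooks.distance(fit) > 1)
# leverage values
head(hatvalues(fit))
# Mahalanobis distance of the predictors against a chi-square critical value
X = mtcars[, c("hp", "wt", "qsec", "gear")]
md = mahalanobis(X, colMeans(X), cov(X))
which(md > qchisq(0.999, df = ncol(X)))
# variance inflation factors (VIF > 10 suggests multicollinearity)
library(car)
vif(fit)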

Effect Size (Week 7)

Unstandardized (absolute) effect size

  • Use unstandardized effect size when units of measurement are meaningful.
  • Examples include regression coefficients or mean differences.

r-family measures

  • Simple regression: Use Pearson’s \(r\) or \(r^2\).
  • Multiple regression: Use \(R\) or \(R^2\).
  • t-test: \(r = \sqrt{\frac{t^2}{t^2 + df}}\)
  • ANOVA: Use \(\eta^2\) or \(\omega^2\).
  • Interpretation:
    • Small effect: \(r = 0.1\)
    • Medium effect: \(r = 0.3\)
    • Large effect: \(r = 0.5\)

d-family measures

  • Cohen’s d: Standardized mean difference.
  • Interpretation:
    • Small effect: \(d = 0.2\)
    • Medium effect: \(d = 0.5\)
    • Large effect: \(d = 0.8\)
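
Both families of measures can be computed directly from a t-test; a minimal sketch using the independent two-group example from Week 4 (with a pooled-variance t-test):

x = c(7, 5, 6, 8, 4, 5, 4, 6)
y = c(3, 5, 4, 6, 2)
tt = t.test(x, y, var.equal = TRUE)

# r-family: r = sqrt(t^2 / (t^2 + df))
unname(sqrt(tt$statistic^2 / (tt$statistic^2 + tt$parameter)))

# d-family: Cohen's d from the pooled standard deviation
n1 = length(x); n2 = length(y)
sp = sqrt(((n1 - 1) * var(x) + (n2 - 1) * var(y)) / (n1 + n2 - 2))
(mean(x) - mean(y)) / sp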

Alpha and Beta Errors

  • Alpha (\(\alpha\)): Probability of Type I error (false positive).
  • Beta (\(\beta\)): Probability of Type II error (false negative).
  • Power: \(1 - \beta\); probability of correctly rejecting a false null hypothesis.

Increasing Power

  • Increase sample size.
  • Increase effect size.
  • Increase alpha level (trade-off with Type I error).
  • Use one-tailed tests when appropriate.
  • Reduce variability (e.g., control experimental conditions).

Power Analysis

# Power analysis for two-sample t-test
library(pwr)

# Parameters
d = 0.4  # Effect size
alpha = 0.05
power = 0.8

# Calculating sample size for independent samples
pwr.t.test(d = d, sig.level = alpha, power = power, type = "two.sample", alternative = "two.sided")
## 
##      Two-sample t test power calculation 
## 
##               n = 99.08032
##               d = 0.4
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
# Calculating sample size for paired samples
pwr.t.test(d = d, sig.level = alpha, power = power, type = "paired", alternative = "two.sided")
## 
##      Paired t test power calculation 
## 
##               n = 51.00945
##               d = 0.4
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number of *pairs*

One-way ANOVA (Week 8)

Levene’s Test (Equality of Variances)

# Levene's test function (median-based, i.e., the Brown-Forsythe variant)
levene.test = function(y, group) {
    meds = tapply(y, group, median, na.rm = TRUE)  # group medians
    resp = abs(y - meds[group])                    # absolute deviations from the group median
    table = anova(lm(resp ~ group))                # one-way ANOVA on the deviations
    row.names(table)[2] = " "
    cat("Levene's Test for Homogeneity of Variance\n\n")
    table[, c(1, 4, 5)]                            # report Df, F value, and Pr(>F)
}

# Example data
group = factor(c(rep(1, 5), rep(2, 3), rep(3, 5)))
score = c(7, 5, 6, 8, 4, 5, 4, 6, 3, 5, 4, 6, 2)
thisData = data.frame(group = group, score = score)

# Performing one-way ANOVA
anova_result = aov(score ~ group, data = thisData)
summary(anova_result)
##             Df Sum Sq Mean Sq F value Pr(>F)
## group        2     10     5.0   2.273  0.154
## Residuals   10     22     2.2
# Performing Levene's test
levene.test(thisData$score, thisData$group)
## Levene's Test for Homogeneity of Variance
##       Df F value Pr(>F)
## group  2  0.5237 0.6077
##       10

Planned Comparisons and Post-hoc Tests (Week 9)

Two-way Repeated Measures ANOVA (W11)

Common Errors

Don’t forget to check the data after you transform its format (with melt() or apply()), and fix factor variables with as.factor() if necessary (see the sketch after the code block below). Also set the contrasts to sum-to-zero so that the results are comparable with other statistical software such as SPSS.

# Set contrasts to sum-to-zero
options(contrasts = c("contr.sum", "contr.poly"))
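
A minimal sketch of those first two checks, using a small hypothetical data frame in long format:

# Hypothetical long-format data frame (as produced by melt()); names are illustrative
df_long <- data.frame(subject = rep(1:3, each = 2),
                      condition = rep(c("a", "b"), times = 3),
                      value = rnorm(6))
str(df_long)                                    # check column types after reshaping
df_long$subject <- as.factor(df_long$subject)   # subject IDs should be factors, not numbers
df_long$condition <- as.factor(df_long$condition)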

Data Format for Repeated Measures

The data format for repeated measures in R is different from that for SPSS. In R (and most other statistical software), the repeated measure should be one of the factors. In other words, each line represents one data point.

# Example data in R format
gender <- c("male", "male", "male", "female", "female", "male", "male", "male", "female", "female", "male", "male", "male", "female", "female")
variable <- c(rep("early", 5), rep("mid", 5), rep("late", 5))
value <- c(4, 10, 5, 8, 2, 6, 9, 4, 6, 2, 2, 8, 3, 7, 2)
data_r <- data.frame(gender, variable, value)
data_r
##    gender variable value
## 1    male    early     4
## 2    male    early    10
## 3    male    early     5
## 4  female    early     8
## 5  female    early     2
## 6    male      mid     6
## 7    male      mid     9
## 8    male      mid     4
## 9  female      mid     6
## 10 female      mid     2
## 11   male     late     2
## 12   male     late     8
## 13   male     late     3
## 14 female     late     7
## 15 female     late     2

In SPSS, the repeated measure is represented by multiple data points in one row; in other words, the within-subject factors are spread across the columns of each row.

# Example data in SPSS format
early <- c(4, 10, 5, 8, 2)
mid <- c(6, 9, 4, 6, 2)
late <- c(2, 8, 3, 7, 2)
gender <- c("male", "male", "male", "female", "female")
data_spss <- data.frame(early, mid, late, gender)
data_spss
##   early mid late gender
## 1     4   6    2   male
## 2    10   9    8   male
## 3     5   4    3   male
## 4     8   6    7 female
## 5     2   2    2 female

Even in SPSS, a data format similar to R’s is required for more complex analyses (e.g., designs with more than two within-subject factors). In such cases dummy coding is used, so the data format becomes almost identical to R’s. Use melt() in the reshape package to transform data from the SPSS (wide) format into the long format used by R, as sketched below.
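
A minimal sketch of that conversion, applied to the data_spss object defined above (this assumes the reshape package is installed):

library(reshape)
data_spss$subject <- factor(seq_len(nrow(data_spss)))           # add a subject identifier
data_long <- melt(data_spss, id.vars = c("subject", "gender"))  # one row per observation
head(data_long)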

Calculating Sum of Squares (Hand Calculation)

SS_total = SS_bet_sub + SS_within_sub

# Make a dummy data set
thisData <- data.frame(
  early = c(4, 10, 5, 8, 2),
  mid = c(6, 9, 4, 6, 2),
  late = c(2, 8, 3, 7, 2)
)

# SS from each score (total variability)
# sum((score - grand_mean)^2)
grand_mean <- mean(as.matrix(thisData))
SS_total <- sum((as.matrix(thisData) - grand_mean)^2)
SS_total
## [1] 106.4
# SS from row means (within-subject variability)
# sum((score - row_mean)^2)
row_mean <- rowMeans(thisData)
SS_within <- sum((as.matrix(thisData) - row_mean)^2)
SS_within
## [1] 14
# Between-subject variability
# sum(n * (row_mean - grand_mean)^2)
n <- ncol(thisData)
SS_bet_sub <- sum(n * (row_mean - grand_mean)^2)
SS_bet_sub
## [1] 92.4
# Column means (RM variability)
# sum(n * (col_mean - grand_mean)^2)
n <- nrow(thisData)
col_mean <- colMeans(thisData)
SS_rm <- sum(n * (col_mean - grand_mean)^2)
SS_rm
## [1] 5.2

Field Textbook Ch.13 Repeated-measure ANOVA (Demo Using R)

# Load necessary libraries
library(foreign)
library(reshape)
library(psych)
library(nlme)
library(car)
library(ez)
library(gplots)
# Load the data (see https://discoverspss.com/pages/data)
thisData <- read.spss("data/Bushtucker.sav", to.data.frame = TRUE)
summary(thisData)
##      stick           testicle         eye          witchetty   
##  Min.   : 5.000   Min.   :2.00   Min.   :1.000   Min.   :1.00  
##  1st Qu.: 6.750   1st Qu.:2.75   1st Qu.:1.750   1st Qu.:4.25  
##  Median : 8.000   Median :4.50   Median :4.000   Median :6.50  
##  Mean   : 8.125   Mean   :4.25   Mean   :4.125   Mean   :5.75  
##  3rd Qu.: 9.250   3rd Qu.:5.25   3rd Qu.:6.250   3rd Qu.:8.00  
##  Max.   :12.000   Max.   :7.00   Max.   :8.000   Max.   :9.00
describe(thisData)  # p.474
## thisData 
## 
##  4  Variables      8  Observations
## --------------------------------------------------------------------------------
## stick 
##        n  missing distinct     Info     Mean      Gmd 
##        8        0        7    0.988    8.125    2.679 
##                                                     
## Value          5     6     7     8     9    10    12
## Frequency      1     1     1     2     1     1     1
## Proportion 0.125 0.125 0.125 0.250 0.125 0.125 0.125
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## testicle 
##        n  missing distinct     Info     Mean      Gmd 
##        8        0        6    0.976     4.25    2.214 
##                                               
## Value          2     3     4     5     6     7
## Frequency      2     1     1     2     1     1
## Proportion 0.250 0.125 0.125 0.250 0.125 0.125
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## eye 
##        n  missing distinct     Info     Mean      Gmd 
##        8        0        7    0.988    4.125    3.321 
##                                                     
## Value          1     2     3     5     6     7     8
## Frequency      2     1     1     1     1     1     1
## Proportion 0.250 0.125 0.125 0.125 0.125 0.125 0.125
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## witchetty 
##        n  missing distinct     Info     Mean      Gmd 
##        8        0        7    0.988     5.75    3.429 
##                                                     
## Value          1     2     5     6     7     8     9
## Frequency      1     1     1     1     1     2     1
## Proportion 0.125 0.125 0.125 0.125 0.125 0.250 0.125
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
# Change to the sum-to-zero convention (cf. Type II and Type III sums of squares)
# options(contrasts = c("contr.sum", "contr.poly"))

Using Anova() in the car Package

# Fit the model using lm()
fit1 <- lm(as.matrix(thisData) ~ 1)

# Prepare the factors
factors.thisData <- colnames(thisData)

# Perform ANOVA
#fit1.aov <- Anova(fit1, idata = data.frame(factors.thisData), idesign = ~factors.thisData, type = "III")
#summary(fit1.aov)

Reformatting the Data Structure

# Add subject identifier
# thisData$subject <- as.factor(rownames(thisData))

# Melt the data
# thisData.molten <- melt(thisData, id.vars = "subject")
# head(thisData.molten)

Using lme() and aov()

Note: aov() and lme() do not provide Mauchly’s test of sphericity or Tukey’s HSD.

# Fit the model using lme()
#fit2 <- lme(value ~ variable, random = ~1 | subject / variable, data = thisData.molten)
#summary(fit2)
# Fit the model using aov()
#fit3 <- aov(value ~ variable + Error(subject / variable), data = thisData.molten)
#summary(fit3)

Using ezANOVA()

# Fit the model using ezANOVA
#fit4 <- ezANOVA(data = thisData.molten, dv = .(value), wid = .(subject), within = .(variable))
#fit4

New Data (pp.483-)

# Load the data
thisData <- read.spss("data/Attitude.sav", to.data.frame = TRUE)
describe(thisData)
## thisData 
## 
##  9  Variables      20  Observations
## --------------------------------------------------------------------------------
## beerpos 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       20        0       14    0.994    21.05    15.25     1.00     6.40 
##      .25      .50      .75      .90      .95 
##    12.75    18.50    31.00    40.00    40.15 
##                                                                            
## Value         1    7    8    9   14   15   17   20   22   26   30   34   40
## Frequency     2    1    1    1    1    3    1    1    1    1    2    2    2
## Proportion 0.10 0.05 0.05 0.05 0.05 0.15 0.05 0.05 0.05 0.05 0.10 0.10 0.10
##                
## Value        43
## Frequency     1
## Proportion 0.05
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## beerneg 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       20        0       17    0.996     4.45    20.11   -18.05   -17.10 
##      .25      .50      .75      .90      .95 
##    -9.50     0.00    20.25    27.30    30.00 
##                                                                            
## Value       -19  -18  -17  -12  -11   -9   -8   -6    6   12   15   17   20
## Frequency     1    1    1    1    1    1    1    3    1    1    1    1    1
## Proportion 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.15 0.05 0.05 0.05 0.05 0.05
##                               
## Value        21   23   27   30
## Frequency     1    1    1    2
## Proportion 0.05 0.05 0.05 0.10
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## beerneut 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       20        0       18    0.998       10    11.82    -5.25    -0.50 
##      .25      .50      .75      .90      .95 
##     4.00     8.00    16.00    26.10    27.05 
##                                                                            
## Value       -10   -5    0    3    4    5    6    7    8    9   12   13   15
## Frequency     1    1    1    1    2    1    1    1    2    1    1    1    1
## Proportion 0.05 0.05 0.05 0.05 0.10 0.05 0.05 0.05 0.10 0.05 0.05 0.05 0.05
##                                    
## Value        19   21   26   27   28
## Frequency     1    1    1    1    1
## Proportion 0.05 0.05 0.05 0.05 0.05
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## winepos 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       20        0       14    0.994    25.35    7.774    14.80    16.80 
##      .25      .50      .75      .90      .95 
##    22.25    25.00    29.25    34.00    34.20 
##                                                                            
## Value        11   15   17   20   23   24   26   27   28   29   30   32   34
## Frequency     1    1    1    2    2    3    1    1    2    1    1    1    2
## Proportion 0.05 0.05 0.05 0.10 0.10 0.15 0.05 0.05 0.10 0.05 0.05 0.05 0.10
##                
## Value        38
## Frequency     1
## Proportion 0.05
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## wineneg 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       20        0       16    0.992      -12    7.179   -22.05   -18.40 
##      .25      .50      .75      .90      .95 
##   -15.25   -13.50    -6.75    -3.80    -2.00 
##                                                                            
## Value       -23  -22  -18  -17  -16  -15  -14  -13  -12  -10   -9   -7   -6
## Frequency     1    1    1    1    1    4    1    1    1    1    1    1    1
## Proportion 0.05 0.05 0.05 0.05 0.05 0.20 0.05 0.05 0.05 0.05 0.05 0.05 0.05
##                          
## Value        -5   -4   -2
## Frequency     1    1    2
## Proportion 0.05 0.05 0.10
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## wineneut 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       20        0       13    0.993    11.65    7.289     3.80     4.00 
##      .25      .50      .75      .90      .95 
##     6.00    12.50    16.50    19.10    20.05 
##                                                                            
## Value         0    4    6    7   12   13   14   15   16   18   19   20   21
## Frequency     1    2    3    2    2    1    2    1    1    1    2    1    1
## Proportion 0.05 0.10 0.15 0.10 0.10 0.05 0.10 0.05 0.05 0.05 0.10 0.05 0.05
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## waterpos 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       20        0       15    0.996     17.4    8.158     8.85     9.00 
##      .25      .50      .75      .90      .95 
##    12.00    17.00    21.00    27.20    29.20 
##                                                                            
## Value         6    9   10   12   13   15   16   17   19   20   21   23   27
## Frequency     1    2    1    2    1    1    1    2    2    1    2    1    1
## Proportion 0.05 0.10 0.05 0.10 0.05 0.05 0.05 0.10 0.10 0.05 0.10 0.05 0.05
##                     
## Value        29   33
## Frequency     1    1
## Proportion 0.05 0.05
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## waterneg 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       20        0       14    0.994     -9.2    7.937   -19.05   -17.20 
##      .25      .50      .75      .90      .95 
##   -14.50   -10.00    -4.00    -1.00    -0.70 
##                                                                            
## Value       -20  -19  -17  -16  -14  -12  -11  -10   -9   -6   -4   -2   -1
## Frequency     1    1    2    1    1    1    1    3    1    2    2    1    2
## Proportion 0.05 0.05 0.10 0.05 0.05 0.05 0.05 0.15 0.05 0.10 0.10 0.05 0.10
##                
## Value         5
## Frequency     1
## Proportion 0.05
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## waterneut 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       20        0       13    0.993     2.35    7.574    -13.0     -5.8 
##      .25      .50      .75      .90      .95 
##      0.0      2.5      8.0     10.0     10.1 
##                                                                            
## Value       -13   -5   -2    0    1    2    3    4    5    8    9   10   12
## Frequency     2    1    1    2    1    3    1    2    1    2    1    2    1
## Proportion 0.10 0.05 0.05 0.10 0.05 0.15 0.05 0.10 0.05 0.10 0.05 0.10 0.05
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
# Add subject identifier
thisData$subject <- as.factor(rownames(thisData))

# Melt the data
thisData.molten <- melt(thisData, id.vars = "subject")
# Create factors for 'drink' and 'image'
thisData.molten$drink <- factor(ifelse(grepl("beer", thisData.molten$variable), "beer",
                               ifelse(grepl("wine", thisData.molten$variable), "wine", "water")),
                               levels = c("beer", "wine", "water"))

thisData.molten$image <- factor(ifelse(grepl("pos", thisData.molten$variable), "sexy",
                               ifelse(grepl("neg", thisData.molten$variable), "corpse", "person in armchair")),
                               levels = c("sexy", "corpse", "person in armchair"))
# Fit the model using ezANOVA
fit5 <- ezANOVA(data = thisData.molten, dv = .(value), wid = .(subject), within = .(drink, image))
fit5
## $ANOVA
##        Effect DFn DFd          F            p p<.05       ges
## 2       drink   2  38   5.105981 1.086293e-02     * 0.1158687
## 3       image   2  38 122.564825 2.680197e-17     * 0.5753191
## 4 drink:image   4  76  17.154922 4.589040e-10     * 0.1411741
## 
## $`Mauchly's Test for Sphericity`
##        Effect         W              p p<.05
## 2       drink 0.2672411 0.000006952302     *
## 3       image 0.6621013 0.024452301563     *
## 4 drink:image 0.5950440 0.435658665787      
## 
## $`Sphericity Corrections`
##        Effect       GGe        p[GG] p[GG]<.05       HFe        p[HF] p[HF]<.05
## 2       drink 0.5771143 2.976868e-02         * 0.5907442 2.881391e-02         *
## 3       image 0.7474407 1.757286e-13         * 0.7968420 3.142804e-14         *
## 4 drink:image 0.7983979 1.900249e-08         * 0.9785878 6.809640e-10         *

Plotting

# Define colors
color <- c("lightyellow", "lightblue", "lightgreen")

# Calculate means and standard deviations
thisData.mean <- tapply(thisData.molten$value, thisData.molten$drink, mean)
thisData.sd <- tapply(thisData.molten$value, thisData.molten$drink, sd)
thisData.n <- tapply(thisData.molten$value, thisData.molten$drink, length)
# Create bar plot with confidence intervals
library(gplots)
barplot2(thisData.mean,
         ylim = c(0, 20),
         beside = TRUE,
         main = "Field Ch.13 (p.494)",
         ylab = "Mean Attitude",
         xlab = "Drink",
         col = color,
         ci.l = thisData.mean - ((thisData.sd / sqrt(thisData.n)) * qt(0.975, df = (thisData.n - 1))),
         ci.u = thisData.mean + ((thisData.sd / sqrt(thisData.n)) * qt(0.975, df = (thisData.n - 1))),
         plot.ci = TRUE)

legend("topright", levels(thisData.molten$drink), fill = color)


Repeated-measure ANOVA (One-way; Built-in Function)

# Create a dummy data set
thisData <- data.frame(
  early = c(4, 10, 5, 8, 2),
  mid = c(6, 9, 4, 6, 2),
  late = c(2, 8, 3, 7, 2)
)

library(reshape)
# Add subject identifier
thisData$subject <- as.factor(rownames(thisData))

# Melt the data
thisData.molten <- melt(thisData, id.vars = "subject")
head(thisData.molten)
##   subject variable value
## 1       1    early     4
## 2       2    early    10
## 3       3    early     5
## 4       4    early     8
## 5       5    early     2
## 6       1      mid     6
# Conduct one-way RM ANOVA
fit <- aov(value ~ variable + Error(subject / variable), data = thisData.molten)
summary(fit)
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4   92.4    23.1               
## 
## Error: subject:variable
##           Df Sum Sq Mean Sq F value Pr(>F)
## variable   2    5.2     2.6   2.364  0.156
## Residuals  8    8.8     1.1

Repeated-measure ANOVA (One-way; With Follow-up Tests)

# Read data from local disk
thisData <- read.table("data/Pitt_Shoaf3.txt", header = TRUE)

# Aggregate the data (i.e., collapsing the 'overlap' factor)
aggregatedData <- aggregate(thisData$rt, list(thisData$subj, thisData$position), mean)
colnames(aggregatedData) <- c("subj", "position", "rt")

library(psych)
by(aggregatedData$rt, list(aggregatedData$position), describe)
## : early
## dd[x, ] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       97        0       93        1    832.8      169    615.9    636.5 
##      .25      .50      .75      .90      .95 
##    714.5    825.0    930.0   1044.8   1072.3 
## 
## lowest : 540.5  560.5  577.5  609.5  615.5 , highest: 1075.5 1107   1113.5 1125   1134  
## ------------------------------------------------------------ 
## : late
## dd[x, ] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       97        0       93        1      814    207.8    565.3    588.7 
##      .25      .50      .75      .90      .95 
##    665.0    812.0    941.5   1048.0   1094.7 
## 
## lowest : 244    437.5  539.5  540    556.5 , highest: 1105.5 1126.5 1200.5 1238.5 1267  
## ------------------------------------------------------------ 
## : mid
## dd[x, ] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       97        0       93        1      799    176.1    583.5    595.6 
##      .25      .50      .75      .90      .95 
##    692.5    810.5    898.5    994.3   1060.2 
## 
## lowest : 398    462    479.5  501.5  583.5 , highest: 1069   1080   1087   1090.5 1143.5
# Conduct one-way RM ANOVA with aov()
fit <- aov(rt ~ position + Error(subj / position), data = aggregatedData)
summary(fit)
## 
## Error: subj
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Residuals 96 5093672   53059               
## 
## Error: subj:position
##            Df  Sum Sq Mean Sq F value Pr(>F)
## position    2   55829   27914    2.15  0.119
## Residuals 192 2492861   12984
# Conduct one-way RM ANOVA with lme()
library(nlme)
fit2 <- lme(rt ~ position, random = ~1 | subj, data = aggregatedData)
summary(fit2)
## Linear mixed-effects model fit by REML
##   Data: aggregatedData 
##        AIC      BIC    logLik
##   3706.147 3724.462 -1848.074
## 
## Random effects:
##  Formula: ~1 | subj
##         (Intercept) Residual
## StdDev:     115.579 113.9458
## 
## Fixed effects:  rt ~ position 
##                Value Std.Error  DF  t-value p-value
## (Intercept) 815.2629 13.503110 192 60.37594  0.0000
## position1    17.5670  9.446408 192  1.85965  0.0645
## position2    -1.2784  9.446408 192 -0.13533  0.8925
##  Correlation: 
##           (Intr) postn1
## position1  0.0         
## position2  0.0   -0.5  
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -2.59527641 -0.56553153 -0.05408611  0.57800835  3.01507827 
## 
## Number of Observations: 291
## Number of Groups: 97
anova(fit2)
##             numDF denDF  F-value p-value
## (Intercept)     1   192 3645.254  <.0001
## position        2   192    2.150  0.1193
# Conduct one-way RM ANOVA with multivariate approach
crossTabulatedData <- xtabs(rt ~ subj + position, data = thisData) / 2  # xtabs() sums rt over the two 'overlap' conditions, so divide by 2 to get cell means
crossTabulatedData
##       position
## subj    early   late    mid
##   AALA 1107.0  724.5  773.0
##   AAMS  702.5  751.5  853.0
##   ADAN 1064.5  609.0  959.0
##   AING  884.0  998.5  804.0
##   AITT  822.0  724.0  846.5
##   ALER 1125.0 1105.5  870.5
##   AMAM  703.0  680.0  619.0
##   AMAN  710.5  827.0  686.5
##   AOOD  908.0  893.5  805.5
##   AOPE  609.5  579.0  501.5
##   ASAO  793.0 1267.0 1087.0
##   BECK  835.5  654.0  813.5
##   BHEL  686.5  911.5  659.0
##   BHER 1005.0  802.5  791.5
##   BKIS 1021.5  862.0  695.0
##   BREK  880.5  941.0  957.0
##   BUHN  893.0  922.0  898.5
##   CLET  943.5  539.5  708.5
##   CMER  889.0  665.5  583.5
##   CNAN  794.5  567.5  709.0
##   COTS  836.5  826.0  850.5
##   DDUM  616.0  735.5  650.5
##   DIED  620.0  726.0  692.5
##   DKEL 1032.5  975.0  990.5
##   DLER  734.5  678.5  711.5
##   DSER 1044.0 1048.0  783.0
##   DYNE  809.5 1055.0  825.5
##   EEVA  651.5  569.0  685.0
##   ELES  759.5  844.5  878.5
##   ELLO  793.5  806.5  891.0
##   ESEN  770.0  828.0  978.5
##   GEOR  627.5  825.5  583.5
##   GICH  560.5  802.0  823.0
##   GNKS 1075.5  974.0  942.0
##   HEAD  895.5  894.0  957.5
##   HLOC  855.0 1028.0  834.0
##   HTON  930.0 1200.5 1022.0
##   JAIN  853.0  812.0  868.0
##   JBRY  929.5  946.0  868.0
##   JDER  981.5  839.0  945.0
##   JDYE  816.0  934.0  584.5
##   JERT  842.0  590.5  682.5
##   JGER  797.5  675.5  814.5
##   JGGS  743.0  741.0  850.5
##   JILL  540.5  640.5  634.0
##   JKIN  959.0  692.0  728.0
##   JLAR  617.0  647.5  590.0
##   JMAN  917.5  810.5  712.0
##   JOSH  883.5  901.5  968.5
##   JRON  689.0  540.0  590.5
##   JROS 1068.5 1092.0 1007.0
##   JSEL  749.0  642.5  772.5
##   JSER  816.5  866.5  849.0
##   JSOM  930.5  981.5 1090.5
##   JSON  642.5  244.0  398.0
##   JTIL  817.5  947.5  782.5
##   KDGE  714.5  910.5  904.5
##   KERD  785.0 1029.5  954.0
##   KETI 1113.5 1026.5  857.5
##   KIMB  667.0  556.5  462.0
##   KIRK  890.5  915.0 1000.0
##   KNDY  969.0  842.5  951.5
##   KOFF 1071.5  887.0  725.5
##   KSON  882.5  942.0  842.5
##   KTON 1048.0  859.5  837.0
##   LARD  798.5  665.0  728.0
##   LAUR  615.5  648.0  479.5
##   LNIN  957.0  832.5  969.5
##   LYNH  926.5 1064.0 1044.0
##   MDER  877.0  437.5  810.5
##   MINS  728.5  810.0  876.0
##   MIOT  914.0  665.0  635.5
##   MKLE  715.0  788.0  770.0
##   MMBO 1022.0  783.0 1143.5
##   MSON 1013.0 1048.0  820.0
##   MTOK  847.5  980.0 1069.0
##   NHAW  813.5  626.0  599.0
##   NIHA  778.5  750.5  748.5
##   PUNO  703.0  570.0  753.5
##   REEN  755.5  586.0  635.0
##   RLLS  825.0  801.5  754.0
##   RNKE 1046.0 1238.5  946.5
##   RUCH  836.5  899.5  865.0
##   SALE  952.0  712.5 1080.0
##   SANO  798.5 1015.0  816.0
##   SARS  680.5  647.0  644.5
##   SART  825.0 1082.5  909.0
##   SEAD 1134.0 1015.0  805.0
##   SETT 1023.0  652.5  860.0
##   SIST  668.5  613.5  617.5
##   SLER  666.0  630.0  696.5
##   SLEY  660.5  820.0  693.0
##   SSKI  844.0  941.5 1058.0
##   STEP  761.0  664.5  804.5
##   SVES  670.5  776.5  587.0
##   TITH  626.5  712.5  674.0
##   TSBY  577.5 1126.5  618.5
mlm1 <- lm(crossTabulatedData ~ 1)

position <- factor(colnames(crossTabulatedData))
library(car)
mlm1.aov <- Anova(mlm1, idata = data.frame(position), idesign = ~position, type = "III")

summary(mlm1.aov, multivariate = FALSE)
## 
## Univariate Type III Repeated-Measures ANOVA Assuming Sphericity
## 
##                Sum Sq num Df Error SS den Df F value Pr(>F)    
## (Intercept) 193414190      1  5093672     96 3645.26 <2e-16 ***
## position        55829      2  2492861    192    2.15 0.1193    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Mauchly Tests for Sphericity
## 
##          Test statistic  p-value
## position        0.92105 0.020116
## 
## 
## Greenhouse-Geisser and Huynh-Feldt Corrections
##  for Departure from Sphericity
## 
##           GG eps Pr(>F[GG])
## position 0.92683     0.1233
## 
##             HF eps Pr(>F[HF])
## position 0.9443014  0.1223673
# Pairwise t-tests
contrastD <- crossTabulatedData[, 1] - crossTabulatedData[, 2]
t.test(contrastD)
## 
##  One Sample t-test
## 
## data:  contrastD
## t = 1.0218, df = 96, p-value = 0.3095
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -17.76518  55.45591
## sample estimates:
## mean of x 
##  18.84536
contrastD <- crossTabulatedData[, 1] - crossTabulatedData[, 3]
t.test(contrastD)
## 
##  One Sample t-test
## 
## data:  contrastD
## t = 2.3142, df = 96, p-value = 0.02279
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   4.816731 62.894609
## sample estimates:
## mean of x 
##  33.85567
contrastD <- crossTabulatedData[, 2] - crossTabulatedData[, 3]
t.test(contrastD)
## 
##  One Sample t-test
## 
## data:  contrastD
## t = 0.95138, df = 96, p-value = 0.3438
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -16.30744  46.32806
## sample estimates:
## mean of x 
##  15.01031
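
Because three pairwise contrasts are tested, the p-values above should usually be adjusted for multiple comparisons. A minimal sketch with p.adjust(), using the p-values reported above:

# Adjust the three pairwise p-values (early-late, early-mid, late-mid) from the output above
p_raw <- c(0.3095, 0.02279, 0.3438)
p.adjust(p_raw, method = "bonferroni")
p.adjust(p_raw, method = "holm")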

Repeated-measure ANOVA (Two-way)

# Read data from local disk
thisData <- read.table("data/Pitt_Shoaf3.txt", header = TRUE)

# Cross-tabulate by two factors
crossTabulatedData <- xtabs(rt ~ subj + position + overlap, data = thisData)

# SPSS requires a strange data format for two-way RM ANOVA
temp.threeOverlap <- as.data.frame(crossTabulatedData[, , 1])
temp.zeroOverlap <- as.data.frame(crossTabulatedData[, , 2])
colnames(temp.threeOverlap) <- paste(colnames(temp.threeOverlap), "3", sep = "_")
colnames(temp.zeroOverlap) <- paste(colnames(temp.zeroOverlap), "0", sep = "_")

data4SPSS <- data.frame(temp.threeOverlap, temp.zeroOverlap, check.names = TRUE, check.rows = TRUE)

library(psych)
describe(aggregatedData)
## aggregatedData 
## 
##  3  Variables      291  Observations
## --------------------------------------------------------------------------------
## subj 
##        n  missing distinct 
##      291        0       97 
## 
## lowest : AALA AAMS ADAN AING AITT, highest: SSKI STEP SVES TITH TSBY
## --------------------------------------------------------------------------------
## position 
##        n  missing distinct 
##      291        0        3 
##                             
## Value      early  late   mid
## Frequency     97    97    97
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
## rt 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      291        0      260        1    815.3    184.1    578.2    616.0 
##      .25      .50      .75      .90      .95 
##    690.5    816.0    930.2   1032.5   1077.8 
## 
## lowest : 244    398    437.5  462    479.5 , highest: 1134   1143.5 1200.5 1238.5 1267  
## --------------------------------------------------------------------------------
# Conduct two-way factorial RM ANOVA with aov()
fit <- aov(rt ~ position * overlap + Error(subj / (position * overlap)), data = thisData)
summary(fit)
## 
## Error: subj
##           Df   Sum Sq Mean Sq F value Pr(>F)
## Residuals 96 10187344  106118               
## 
## Error: subj:position
##            Df  Sum Sq Mean Sq F value Pr(>F)
## position    2  111658   55829    2.15  0.119
## Residuals 192 4985721   25967               
## 
## Error: subj:overlap
##           Df  Sum Sq Mean Sq F value Pr(>F)
## overlap    1   19758   19758   1.269  0.263
## Residuals 96 1494378   15566               
## 
## Error: subj:position:overlap
##                   Df  Sum Sq Mean Sq F value    Pr(>F)    
## position:overlap   2  284424  142212   10.12 0.0000662 ***
## Residuals        192 2697804   14051                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Mixed-design ANOVA (W12)

Field Textbook Ch.14 Mixed-design ANOVA (Demo Using R)

library(foreign)
library(reshape)

# Load the data
thisData <- read.spss("data/LooksOrPersonalityGraphs.sav", to.data.frame = TRUE)

# Subset the data
thisData.subset <- subset(thisData, !is.na(gender), select = c(
  "gender", "att_high", "av_high", "ug_high",
  "att_some", "av_some", "ug_some",
  "att_none", "av_none", "ug_none"
))

# Add subject identifier
thisData.subset$subject <- as.factor(rownames(thisData.subset))

# Melt the data
newData <- melt(thisData.subset, id.vars = c("subject", "gender"))
newData$subject <- factor(newData$subject)
# Create factors for 'charisma' and 'looks'
newData$charisma <- factor(
  ifelse(grepl("_high", newData$variable), "high",
         ifelse(grepl("_some", newData$variable), "some", "dullard")),
  levels = c("high", "some", "dullard")
)

newData$looks <- factor(
  ifelse(grepl("att_", newData$variable), "attractive",
         ifelse(grepl("av_", newData$variable), "average", "ugly")),
  levels = c("attractive", "average", "ugly")
)

# Rename value column
colnames(newData)[4] <- "rate"
# Incorrect model (ignores repeated measures)
model1 <- aov(rate ~ charisma * looks, data = newData)
summary(model1)
##                 Df Sum Sq Mean Sq F value   Pr(>F)    
## charisma         2  23234   11617  124.59  < 2e-16 ***
## looks            2  20780   10390  111.43  < 2e-16 ***
## charisma:looks   4   4055    1014   10.87 7.06e-08 ***
## Residuals      171  15944      93                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Correct mixed-design ANOVA
model2 <- aov(rate ~ gender * charisma * looks + Error(subject / (charisma * looks)), data = newData)
summary(model2)
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## gender     1    0.2    0.20   0.005  0.946
## Residuals 18  760.2   42.23               
## 
## Error: subject:charisma
##                 Df Sum Sq Mean Sq F value   Pr(>F)    
## charisma         2  23234   11617  328.25  < 2e-16 ***
## gender:charisma  2   4420    2210   62.45 1.97e-12 ***
## Residuals       36   1274      35                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Error: subject:looks
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## looks         2  20780   10390  423.73  < 2e-16 ***
## gender:looks  2   3944    1972   80.43 5.23e-14 ***
## Residuals    36    883      25                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Error: subject:charisma:looks
##                       Df Sum Sq Mean Sq F value   Pr(>F)    
## charisma:looks         4   4055  1013.8   36.63  < 2e-16 ***
## gender:charisma:looks  4   2670   667.4   24.12 1.11e-12 ***
## Residuals             72   1993    27.7                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Bar Plots

library(gplots)

# Define sample size
num <- length(unique(newData$subject))

# Bar plot for gender
means_gender <- tapply(newData$rate, newData$gender, mean)
sds_gender <- tapply(newData$rate, newData$gender, sd)
barplot2(means_gender,
         beside = TRUE,
         main = "Bar plots with 95% CI (not adjusted for RM)",
         col = "lightgreen",
         ylab = "Mean Attitude",
         xlab = "Gender",
         ci.l = means_gender - ((sds_gender / sqrt(num)) * qt(0.975, df = (num - 1))),
         ci.u = means_gender + ((sds_gender / sqrt(num)) * qt(0.975, df = (num - 1))),
         plot.ci = TRUE)

# Interaction plots
interaction.plot(newData$looks, newData$gender, newData$rate, main = "Interaction plot: looks by gender", type = "b", pch = c(19, 15), col = c("orange", "blue"))

interaction.plot(newData$charisma, newData$gender, newData$rate, main = "Interaction plot: charisma by gender", type = "b", pch = c(19, 15), col = c("orange", "blue"))

interaction.plot(newData$looks, newData$charisma, newData$rate, main = "Interaction plot: looks by charisma", type = "b", pch = c(19, 15, 17), col = c("orange", "blue", "red"))


Mixed-design ANOVA (Using ez Package)

# Read data from local disk
thisData <- read.table("data/spanengRT.txt", header = TRUE)
thisData$listener <- as.factor(thisData$listener)

# Subset the data
subsetData <- subset(thisData, pair %in% c("d_r", "d_th", "r_th"))
library(gdata)
subsetData <- drop.levels(subsetData)
#library(ez)
## Conduct mixed-design ANOVA
#model.ezAnova <- ezANOVA(
#  data = thisData,
#  dv = .(MedianRT),
#  wid = .(listener),
#  within = .(pair),
#  between = .(group)
#)
#model.ezAnova

Logistic Regression (W14)

Logistic Regression Implemented with glm()

Family            Type of Regression    Default Link Function
binomial          Logistic regression   binomial("logit")
gaussian          Gaussian              gaussian("identity")
Gamma             Gamma distribution    Gamma("inverse")
inverse.gaussian  Inverse Gaussian      inverse.gaussian("1/mu^2")
poisson           Poisson               poisson("log")
quasibinomial     Quasi-binomial        quasibinomial("logit")
quasipoisson      Quasi-Poisson         quasipoisson("log")

Logistic Regression Example

library(psych)

# Read the data
thisData <- read.csv("data/DDRASSTR.txt", sep = "\t")

# Convert variables to factors
thisData$strnumbers <- factor(thisData$strnumbers)
thisData$style <- factor(thisData$style)
thisData$region <- factor(thisData$region)
thisData$banknumbers <- factor(thisData$banknumbers)
thisData$classnumbers <- factor(thisData$classnumbers)
thisData$participant.number <- factor(thisData$participant.number)

summary(thisData)
##      str            strnumbers   emphatic            gender          style  
##  Length:240         0:183      Length:240         Length:240         0:120  
##  Class :character   1: 57      Class :character   Class :character   1:120  
##  Mode  :character              Mode  :character   Mode  :character          
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   ageletters            age            region     Mall           banknumbers
##  Length:240         Length:240         1:81   Length:240         2:34       
##  Class :character   Class :character   2:80   Class :character   3:38       
##  Mode  :character   Mode  :character   3:79   Mode  :character   4:30       
##                                                                  5:80       
##                                                                  6:58       
##                                                                             
##                                                                             
##      bank              store           participant.number     job           
##  Length:240         Length:240         1      :  2        Length:240        
##  Class :character   Class :character   2      :  2        Class :character  
##  Mode  :character   Mode  :character   3      :  2        Mode  :character  
##                                        4      :  2                          
##                                        5      :  2                          
##                                        6      :  2                          
##                                        (Other):228                          
##  classnumbers    class          
##  1:73         Length:240        
##  2:88         Class :character  
##  3:79         Mode  :character  
##                                 
##                                 
##                                 
## 
describe(thisData)
## thisData 
## 
##  16  Variables      240  Observations
## --------------------------------------------------------------------------------
## str 
##        n  missing distinct 
##      240        0        2 
##                       
## Value       shtr   str
## Frequency     57   183
## Proportion 0.238 0.762
## --------------------------------------------------------------------------------
## strnumbers 
##        n  missing distinct 
##      240        0        2 
##                       
## Value          0     1
## Frequency    183    57
## Proportion 0.762 0.238
## --------------------------------------------------------------------------------
## emphatic 
##        n  missing distinct 
##      240        0        2 
##                     
## Value      less more
## Frequency   120  120
## Proportion  0.5  0.5
## --------------------------------------------------------------------------------
## gender 
##        n  missing distinct 
##      240        0        2 
##                   
## Value        m   w
## Frequency  120 120
## Proportion 0.5 0.5
## --------------------------------------------------------------------------------
## style 
##        n  missing distinct 
##      240        0        2 
##                   
## Value        0   1
## Frequency  120 120
## Proportion 0.5 0.5
## --------------------------------------------------------------------------------
## ageletters 
##        n  missing distinct 
##      240        0        3 
##                             
## Value          a     b     c
## Frequency     80    80    80
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
## age 
##        n  missing distinct 
##      240        0        3 
##                             
## Value        mid   old young
## Frequency     80    80    80
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
## region 
##        n  missing distinct 
##      240        0        3 
##                             
## Value          1     2     3
## Frequency     81    80    79
## Proportion 0.338 0.333 0.329
## --------------------------------------------------------------------------------
## Mall 
##        n  missing distinct 
##      240        0        3 
##                                            
## Value      CityCenter     Easton    Polaris
## Frequency          79         80         81
## Proportion      0.329      0.333      0.338
## --------------------------------------------------------------------------------
## banknumbers 
##        n  missing distinct 
##      240        0        5 
##                                         
## Value          2     3     4     5     6
## Frequency     34    38    30    80    58
## Proportion 0.142 0.158 0.125 0.333 0.242
## --------------------------------------------------------------------------------
## bank 
##        n  missing distinct 
##      240        0        5 
##                                         
## Value        LMC   MMC   MWC   UMC   UWC
## Frequency     30    80    34    58    38
## Proportion 0.125 0.333 0.142 0.242 0.158
## --------------------------------------------------------------------------------
## store 
##        n  missing distinct 
##      240        0       35 
## 
## lowest : aeagle    anntaylor banana    best buy  brepub   
## highest: sbarro    sears     target    udf       walmart  
## --------------------------------------------------------------------------------
## participant.number 
##        n  missing distinct 
##      240        0      120 
## 
## lowest : 1   2   3   4   5  , highest: 121 122 123 124 125
## --------------------------------------------------------------------------------
## job 
##        n  missing distinct 
##      240        0        3 
##                             
## Value          c     f     g
## Frequency    236     2     2
## Proportion 0.983 0.008 0.008
## --------------------------------------------------------------------------------
## classnumbers 
##        n  missing distinct 
##      240        0        3 
##                             
## Value          1     2     3
## Frequency     73    88    79
## Proportion 0.304 0.367 0.329
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct 
##      240        0        3 
##                             
## Value        LMC   UMC    WC
## Frequency     88    79    73
## Proportion 0.367 0.329 0.304
## --------------------------------------------------------------------------------
# Relevel 'emphatic' variable
#thisData$emphatic <- relevel(thisData$emphatic, ref = "more")

# Fit logistic regression model
#fit1 <- glm(str ~ emphatic, data = thisData, family = binomial("logit"))
#summary(fit1)

# Exponentiate coefficients to get odds ratios
#exp(coef(fit1))
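
Since the model above is left commented out, here is a minimal, self-contained sketch of the same glm() workflow on the built-in mtcars data (predicting transmission type am from weight wt; the variables are chosen purely for illustration):

# Illustrative logistic regression on built-in data
fit_demo <- glm(am ~ wt, data = mtcars, family = binomial("logit"))
summary(fit_demo)
exp(coef(fit_demo))   # exponentiated coefficients = odds ratios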

Non-parametric Tests (W13)

Wilcoxon Tests (Rank-sum / Mann-Whitney and Signed-rank)

With paired = TRUE, wilcox.test() performs the Wilcoxon signed-rank test on the paired differences (as in the example below); with paired = FALSE it performs the rank-sum (Mann-Whitney) test for independent samples.

x <- c(1.83,  0.50,  1.62,  2.48, 1.68, 1.88, 1.55, 3.06, 1.30)
y <- c(0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29)
wilcox.test(x, y, paired = TRUE, alternative = "greater")
## 
##  Wilcoxon signed rank exact test
## 
## data:  x and y
## V = 40, p-value = 0.01953
## alternative hypothesis: true location shift is greater than 0
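
For independent samples, the same function with paired = FALSE gives the rank-sum (Mann-Whitney) test; a minimal sketch reusing the vectors above purely to show the syntax:

# Rank-sum (Mann-Whitney) test, treating x and y as independent samples (illustration only)
wilcox.test(x, y, paired = FALSE, alternative = "greater")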

Binomial Test

# Binomial test example
binom.test(x = 52, n = 156, p = 0.25)
## 
##  Exact binomial test
## 
## data:  52 and 156
## number of successes = 52, number of trials = 156, p-value = 0.02043
## alternative hypothesis: true probability of success is not equal to 0.25
## 95 percent confidence interval:
##  0.2599810 0.4131553
## sample estimates:
## probability of success 
##              0.3333333

Chi-square Test

Assumptions of chi-square test:

  • All variables must be categorical.
  • Random sample data.
  • Sufficiently large expected frequencies (at least 5 expected in each cell).
  • Independence of each observation.

# Chi-square test example
x <- matrix(c(16, 4, 8, 12), nrow = 2, byrow = TRUE)
chisq.test(x, correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 6.6667, df = 1, p-value = 0.009823

Fisher’s Exact Test

TeaTasting <- matrix(c(3, 1, 1, 3), nrow = 2,
                     dimnames = list(Guess = c("Milk", "Tea"), Truth = c("Milk", "Tea")))
fisher.test(TeaTasting, alternative = "greater")
## 
##  Fisher's Exact Test for Count Data
## 
## data:  TeaTasting
## p-value = 0.2429
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  0.3135693       Inf
## sample estimates:
## odds ratio 
##   6.408309

Kruskal-Wallis Rank Sum Test

uncooperativeness <- c(6,7,10,8,8,9,8,10,6,8,7,3,7,1)
treatment <- factor(rep(1:3, c(6, 4, 4)), labels = c("Method A", "Method B", "Method C"))
uncooperativenessData <- data.frame(uncooperativeness, treatment)

kruskal.test(uncooperativeness ~ treatment, data = uncooperativenessData)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  uncooperativeness by treatment
## Kruskal-Wallis chi-squared = 4.664, df = 2, p-value = 0.0971
pairwise.t.test(uncooperativeness, treatment, p.adj = "bonferroni")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  uncooperativeness and treatment 
## 
##          Method A Method B
## Method B 1.000    -       
## Method C 0.064    0.097   
## 
## P value adjustment method: bonferroni
pairwise.t.test(uncooperativeness, treatment, p.adj = "holm")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  uncooperativeness and treatment 
## 
##          Method A Method B
## Method B 1.000    -       
## Method C 0.064    0.065   
## 
## P value adjustment method: holm
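
A fully non-parametric follow-up is also possible; a minimal sketch using pairwise Wilcoxon rank-sum tests with a Bonferroni adjustment:

# Pairwise Wilcoxon rank-sum tests as a non-parametric alternative to the pooled-SD t tests above
pairwise.wilcox.test(uncooperativeness, treatment, p.adjust.method = "bonferroni")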