# clear the workspace (remove all objects from the current session)
rm(list = ls())
# load a package, installing it first if it's not on the system yet
if(!require("gdata")) {install.packages("gdata"); library(gdata)}
#install.packages(c("psych","gdata","reshape","xtable","gplots","nlme","foreign","ez"))
#library(ggplot2); library(gdata); library(ggthemes); library(plyr);
#library(memisc); library(reshape); library(reshape2); library(devtools)
#library(tidyverse); library(dplyr)
#library(devtools); library(ipeds); library(xtable); library(openxlsx); library(foreign);
#install.packages("acs"); install.packages("ggplot2");xt install.packages("maps"); install.packages("mapdata");
data(airquality)
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
getRversion()
## [1] '4.3.1'
#packageVersion("gdata") # check an installed package's version
# save history
#savehistory("name.Rhistory")
#loadhistory("name.Rhistory")
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
string <- "This is a sample string"
cat(substring(string,1,10),substring(string,15,19),"\n",sep="\t")
## This is a le st
cat("Roots of two: ",format(2^(1:10),digit=2),"\n")
## Roots of two: 2 4 8 16 32 64 128 256 512 1024
# make a matrix with array or matrix
matrix1 <- array(1:10, dim=c(2,5))
matrix1
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 3 5 7 9
## [2,] 2 4 6 8 10
matrix2 <- matrix(1:10,2,5)
matrix2
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 3 5 7 9
## [2,] 2 4 6 8 10
matrix3 <- 1:10;
dim(matrix3) <- c(2,5)
matrix3
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 3 5 7 9
## [2,] 2 4 6 8 10
class(matrix1)
## [1] "matrix" "array"
# the data vector is recycled if the matrix has more cells than supplied values
array(1:10, dim=c(5,10))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 6 1 6 1 6 1 6 1 6
## [2,] 2 7 2 7 2 7 2 7 2 7
## [3,] 3 8 3 8 3 8 3 8 3 8
## [4,] 4 9 4 9 4 9 4 9 4 9
## [5,] 5 10 5 10 5 10 5 10 5 10
# arithmetic operators work element-wise (cell by cell)
m1 <- array(1:10, dim=c(2,5))
m1 + m1
## [,1] [,2] [,3] [,4] [,5]
## [1,] 2 6 10 14 18
## [2,] 4 8 12 16 20
m1 * m1
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 9 25 49 81
## [2,] 4 16 36 64 100
# matrix algebra examples from J. Penzer's notes (2006)
m4 <- array(1:3, c(4,2))
m5 <- array(3:8, c(2,3))
m4 %*% m5
## [,1] [,2] [,3]
## [1,] 11 17 23
## [2,] 18 28 38
## [3,] 13 21 29
## [4,] 11 17 23
m6 <- array(c(1,3,2,1),c(2,2))
m6
## [,1] [,2]
## [1,] 1 2
## [2,] 3 1
v1 <- array(c(1,0), c(2,1))
solve(m6,v1)
## [,1]
## [1,] -0.2
## [2,] 0.6
solve(m6) # inverts m6
## [,1] [,2]
## [1,] -0.2 0.4
## [2,] 0.6 -0.2
# does the same as solve(m6,v1)
solve(m6) %*% v1
## [,1]
## [1,] -0.2
## [2,] 0.6
# define a simple function with the `function` keyword
sum_numbers <- function(a, b) {
  total <- a + b
  return(total)
}
# Call the function
result <- sum_numbers(3, 4)
print(result) # Output: 7
## [1] 7
# change the default width
width.default <- getOption("width"); options(width=120)
# restoring default
options(width=width.default)
# changing the number of digits
options(digits=7)
# bias output away from scientific notation
options(scipen=3)
# list of all color names
# also see ?palette
head(colors())
## [1] "white" "aliceblue" "antiquewhite" "antiquewhite1"
## [5] "antiquewhite2" "antiquewhite3"
# different color combo
colors3 <- c("lightblue","lightyellow","lightgreen")
colors5 = c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99")
# pre-set color functions
# it looks better to set the alpha level to less than 1
rainbow(10)
## [1] "#FF0000" "#FF9900" "#CCFF00" "#33FF00" "#00FF66" "#00FFFF" "#0066FF"
## [8] "#3300FF" "#CC00FF" "#FF0099"
heat.colors(10, alpha = 1)
## [1] "#FF0000FF" "#FF2400FF" "#FF4900FF" "#FF6D00FF" "#FF9200FF" "#FFB600FF"
## [7] "#FFDB00FF" "#FFFF00FF" "#FFFF40FF" "#FFFFBFFF"
terrain.colors(10, alpha = 1)
## [1] "#00A600FF" "#2DB600FF" "#63C600FF" "#A0D600FF" "#E6E600FF" "#E8C32EFF"
## [7] "#EBB25EFF" "#EDB48EFF" "#F0C9C0FF" "#F2F2F2FF"
topo.colors(10, alpha = 1)
## [1] "#4C00FFFF" "#0019FFFF" "#0080FFFF" "#00E5FFFF" "#00FF4DFF" "#4DFF00FF"
## [7] "#E6FF00FF" "#FFFF00FF" "#FFDE59FF" "#FFE0B3FF"
cm.colors(10, alpha = 1)
## [1] "#80FFFFFF" "#99FFFFFF" "#B3FFFFFF" "#CCFFFFFF" "#E6FFFFFF" "#FFE6FFFF"
## [7] "#FFCCFFFF" "#FFB3FFFF" "#FF99FFFF" "#FF80FFFF"
# with color brewer
library(RColorBrewer)
display.brewer.all() # show all pre-set colors
colorRampPalette(brewer.pal(9,"Blues"))(15) # for more than 9 colors
## [1] "#F7FBFF" "#E8F1FA" "#DAE8F5" "#CCDFF1" "#BAD6EB" "#A3CCE3" "#88BEDC"
## [8] "#6BAED6" "#539ECC" "#3D8DC3" "#2A7AB9" "#1967AD" "#0B559F" "#084287"
## [15] "#08306B"
display.brewer.pal(6,"BuGn")
brewer.pal(6,"BuGn")
## [1] "#EDF8FB" "#CCECE6" "#99D8C9" "#66C2A4" "#2CA25F" "#006D2C"
# make a dummy data set
x = c(rep(1,5),rep(2,3),rep(3,5))
y = c(7,5,6,8,4,5,4,6,3,5,4,6,2)
z = rnorm(13,mean=10,sd=5)
thisData = cbind(x,y,z)
thisData = as.data.frame(thisData)
thisData$x = factor(thisData$x)
levels(thisData$x) = c("A","B","C")
summary(thisData)
## x y z
## A:5 Min. :2 Min. : 1.428
## B:3 1st Qu.:4 1st Qu.: 4.732
## C:5 Median :5 Median : 6.436
## Mean :5 Mean : 7.044
## 3rd Qu.:6 3rd Qu.: 7.773
## Max. :8 Max. :15.603
library(psych)
describe(thisData)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## x* 1 13 2.00 0.91 2.00 2.00 1.48 1.00 3.0 2.00 0.00 -1.89 0.25
## y 2 13 5.00 1.63 5.00 5.00 1.48 2.00 8.0 6.00 0.00 -0.84 0.45
## z 3 13 7.04 4.39 6.44 6.78 2.53 1.43 15.6 14.18 0.76 -0.41 1.22
head(thisData)
## x y z
## 1 A 7 5.303664
## 2 A 5 8.842583
## 3 A 6 7.772980
## 4 A 8 4.732023
## 5 A 4 15.603343
## 6 B 5 15.504260
thisData[1:3,]
## x y z
## 1 A 7 5.303664
## 2 A 5 8.842583
## 3 A 6 7.772980
matrix(letters[1:10],nrow=5,ncol=2)
## [,1] [,2]
## [1,] "a" "f"
## [2,] "b" "g"
## [3,] "c" "h"
## [4,] "d" "i"
## [5,] "e" "j"
matrix(letters[1:10],nrow=5,ncol=2,byrow=TRUE)
## [,1] [,2]
## [1,] "a" "b"
## [2,] "c" "d"
## [3,] "e" "f"
## [4,] "g" "h"
## [5,] "i" "j"
# reconstruct a sample from the mean, SD, and n reported in published research
# (approximately; see the exact version further below)
mean = 5; SD = 0.8; n = 300
X0 <- rnorm(n)
SE = SD/sqrt(n)
thisData <- cbind(mean+SE*sqrt(length(X0))*(X0-mean(X0)/sd(X0)),"some_level")
# or use rnorm(n, mean = mean_val, sd = sd_val)
mean = 7; SD = 0.5; n = 100
X0 <- rnorm(n)
SE = SD/sqrt(n)
thisData <- rbind(thisData,cbind(mean+SE*sqrt(length(X0))*(X0-mean(X0)/sd(X0)),"another_level"))
thisData <- data.frame(as.numeric(thisData[,1]),thisData[,-1])
colnames(thisData) <- c("value","factor")
head(thisData)
## value factor
## 1 5.571832 some_level
## 2 4.231714 some_level
## 3 3.551474 some_level
## 4 5.262986 some_level
## 5 4.891005 some_level
## 6 4.967727 some_level
# checking the accuracy
mean(thisData$value)
## [1] 5.500525
tapply(thisData$value,thisData$factor,mean)
## another_level some_level
## 6.998733 5.001122
sd(thisData$value)
## [1] 1.116399
tapply(thisData$value,thisData$factor,sd)
## another_level some_level
## 0.5121005 0.7585546
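# If the sample must reproduce the published mean and SD exactly (not just
# approximately), standardize the draws first. A minimal sketch; simulate_exact
# is only an illustrative helper.
simulate_exact <- function(n, target_mean, target_sd) {
  x0 <- rnorm(n)
  target_mean + target_sd * (x0 - mean(x0)) / sd(x0)
}
exact_sample <- simulate_exact(300, 5, 0.8)
c(mean(exact_sample), sd(exact_sample)) # exactly 5 and 0.8, up to floating-point error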
library(gdata)
x = c(rep(1,5),rep(2,3),rep(3,5))
y = c(7,5,6,8,4,5,4,6,3,5,4,6,2)
z = rnorm(13,mean=10,sd=5)
thisData = cbind(x,y,z)
thisData = as.data.frame(thisData)
thisData$x = factor(thisData$x)
levels(thisData$x) = c("A","B","C")
for (i in 1:length(levels(thisData$x))) {
print(levels(thisData$x)[i])
tempData <- drop.levels(thisData[thisData$x == levels(thisData$x)[i],],reorder=FALSE)
print(summary(tempData))
}
## [1] "A"
## x y z
## A:5 Min. :4 Min. :10.78
## 1st Qu.:5 1st Qu.:13.11
## Median :6 Median :13.90
## Mean :6 Mean :13.71
## 3rd Qu.:7 3rd Qu.:14.69
## Max. :8 Max. :16.07
## [1] "B"
## x y z
## B:3 Min. :4.0 Min. :0.5999
## 1st Qu.:4.5 1st Qu.:3.8907
## Median :5.0 Median :7.1815
## Mean :5.0 Mean :5.7888
## 3rd Qu.:5.5 3rd Qu.:8.3833
## Max. :6.0 Max. :9.5850
## [1] "C"
## x y z
## C:5 Min. :2 Min. : 5.818
## 1st Qu.:3 1st Qu.: 6.972
## Median :4 Median : 7.959
## Mean :4 Mean :10.485
## 3rd Qu.:5 3rd Qu.: 9.121
## Max. :6 Max. :22.556
library(dplyr)
library(foreign)
library(haven)
# Set the seed for reproducibility
set.seed(123)
# Generate fake data
fake_data <- data.frame(
ID = 1:100, # ID column from 1 to 100
Age = sample(18:70, 100, replace = TRUE), # Age column with random values between 18 and 70
Gender = sample(c("Male", "Female"), 100, replace = TRUE), # Gender column with random values of "Male" and "Female"
Weight = round(rnorm(100, mean = 70, sd = 10), digits = 2), # Weight column: normal with mean 70 and SD 10, rounded to 2 decimal places
Height = round(rnorm(100, mean = 170, sd = 10), digits = 2), # Height column: normal with mean 170 and SD 10, rounded to 2 decimal places
Income = round(runif(100, min = 1000, max = 10000), digits = 2) # Income column: uniform between 1000 and 10000, rounded to 2 decimal places
)
write.table(fake_data, file = "fake_data.csv", sep = ",", col.names = NA, qmethod = "double")
write_sav(fake_data, "fake_data.sav")
# Define the working directory in Unix (setwd() takes a directory, not a file)
# setwd('/media/MYUSB/PATH')
# Define the working directory in Windows
# setwd('F:/PATH')
# from a text file
# NB: sep must match the file's delimiter; the file above was written comma-separated,
# so sep=" " mis-parses it into a single character column (see the summary below).
# A correct read would be read.csv("data/fake_data.csv", row.names = 1)
ratings = read.table("data/fake_data.csv",sep=" ",header=TRUE)
summary(ratings)
## X.ID.Age.Gender.Weight.Height.Income
## Length:100
## Class :character
## Mode :character
# from SPSS file
ratings = read.spss("data/fake_data.sav", to.data.frame=TRUE)
summary(ratings)
## ID Age Gender Weight
## Min. : 1.00 Min. :20.00 Length:100 Min. :45.05
## 1st Qu.: 25.75 1st Qu.:32.00 Class :character 1st Qu.:62.91
## Median : 50.50 Median :44.00 Mode :character Median :69.81
## Mean : 50.50 Mean :45.03 Mean :70.58
## 3rd Qu.: 75.25 3rd Qu.:57.25 3rd Qu.:78.03
## Max. :100.00 Max. :70.00 Max. :93.75
## Height Income
## Min. :136.9 Min. :1181
## 1st Qu.:161.2 1st Qu.:3526
## Median :169.8 Median :5779
## Mean :169.1 Mean :5650
## 3rd Qu.:176.7 3rd Qu.:7869
## Max. :197.1 Max. :9937
normal <- rnorm(100,10,10)
t <- rt(100,10)*10
f <- rf(100,10,10)*10
pdf("plot.pdf", width = 8, height = 6, onefile = TRUE, pointsize = 9)
plot(normal,col="blue",pch=78) # pch values 78/84/70 plot the ASCII characters N, T, F
points(t,col="red",pch=84)
points(f,col="green",pch=70)
dev.off()
## quartz_off_screen
## 2
# Output with sink() (the layout is aligned)
# change STDOUT
sink("output.txt")
i = 1:10
outer(i, i, "*")
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 2 4 6 8 10 12 14 16 18 20
## [3,] 3 6 9 12 15 18 21 24 27 30
## [4,] 4 8 12 16 20 24 28 32 36 40
## [5,] 5 10 15 20 25 30 35 40 45 50
## [6,] 6 12 18 24 30 36 42 48 54 60
## [7,] 7 14 21 28 35 42 49 56 63 70
## [8,] 8 16 24 32 40 48 56 64 72 80
## [9,] 9 18 27 36 45 54 63 72 81 90
## [10,] 10 20 30 40 50 60 70 80 90 100
unlink("output.txt")
# printing dataframe or table
x = data.frame(a="a", b=pi)
# sep can be changed to "\t" or " "
write.table(x, file = "foo.csv", sep = ",", col.names = NA, qmethod = "double")
library(dplyr)
library(tidyr)
# Create and define the data frame
thisData <- data.frame(
early = c(4, 10, 5, 8, 2),
mid = c(6, 9, 4, 6, 2),
late = c(2, 8, 3, 7, 2),
gender = factor(c(1, 1, 1, 2, 2), levels = c(1, 2), labels = c("male", "female"))
)
# Melting the data with gather from tidyr package
moltenData <- thisData %>%
mutate(subject = row_number()) %>%
gather(key = "time", value = "score", -subject, -gender)
# Recovering the original data format using pivot_wider from tidyr
originalData <- moltenData %>%
pivot_wider(names_from = "time", values_from = "score")
# Cross-tabulating the data
crossTabulatedData <- xtabs(score ~ subject + time, data = moltenData)
print(thisData)
## early mid late gender
## 1 4 6 2 male
## 2 10 9 8 male
## 3 5 4 3 male
## 4 8 6 7 female
## 5 2 2 2 female
print(moltenData)
## gender subject time score
## 1 male 1 early 4
## 2 male 2 early 10
## 3 male 3 early 5
## 4 female 4 early 8
## 5 female 5 early 2
## 6 male 1 mid 6
## 7 male 2 mid 9
## 8 male 3 mid 4
## 9 female 4 mid 6
## 10 female 5 mid 2
## 11 male 1 late 2
## 12 male 2 late 8
## 13 male 3 late 3
## 14 female 4 late 7
## 15 female 5 late 2
print(originalData)
## # A tibble: 5 × 5
## gender subject early mid late
## <fct> <int> <dbl> <dbl> <dbl>
## 1 male 1 4 6 2
## 2 male 2 10 9 8
## 3 male 3 5 4 3
## 4 female 4 8 6 7
## 5 female 5 2 2 2
print(crossTabulatedData)
## time
## subject early late mid
## 1 4 2 6
## 2 10 8 9
## 3 5 3 4
## 4 8 7 6
## 5 2 2 2
library(gdata)
# Create a dummy data set
thisData <- data.frame(
Q1 = sample(1:5, 10, replace = TRUE),
Q2 = sample(1:5, 10, replace = TRUE),
midterm = sample(60:100, 10, replace = TRUE),
final = sample(60:100, 10, replace = TRUE),
presentation = sample(60:100, 10, replace = TRUE),
assignment = sample(60:100, 10, replace = TRUE),
prescore = sample(60:100, 10, replace = TRUE),
other = sample(1:5, 10, replace = TRUE)
)
head(thisData)
## Q1 Q2 midterm final presentation assignment prescore other
## 1 1 2 87 91 86 72 75 4
## 2 1 1 92 66 86 100 61 3
## 3 3 2 91 73 84 70 96 3
## 4 2 3 82 84 87 81 78 2
## 5 1 4 80 62 87 83 72 4
## 6 2 5 98 82 72 71 98 5
# keep only the columns whose names match the pattern
# (note: "^pres" matches both presentation and prescore)
pattern <- "Q\\d+|^midterm$|^final$|^pres"
tempIndex <- grep(pattern, colnames(thisData))
newData <- drop.levels(thisData[, tempIndex, drop = FALSE])
# Summarize the new data
head(newData)
## Q1 Q2 midterm final presentation prescore
## 1 1 2 87 91 86 75
## 2 1 1 92 66 86 61
## 3 3 2 91 73 84 96
## 4 2 3 82 84 87 78
## 5 1 4 80 62 87 72
## 6 2 5 98 82 72 98
thisData = read.table("data/Daphnia.txt",header=TRUE)
attach(thisData)
# means of Growth.rate by Detergent
tapply(Growth.rate,Detergent,mean)
## BrandA BrandB BrandC BrandD
## 3.884832 4.010044 3.954512 3.558231
# use list() for two-dimensional tables
# tapply(VALUE,list(ROW,COLUMN),FUNCTION)
tapply(Growth.rate,list(Daphnia,Detergent),mean)
## BrandA BrandB BrandC BrandD
## Clone1 2.732227 2.929140 3.071335 2.626797
## Clone2 3.919002 4.402931 4.772805 5.213745
## Clone3 5.003268 4.698062 4.019397 2.834151
# using an anonymous function
tapply(Growth.rate,list(Daphnia,Detergent), function(x){sqrt(var(x)/length(x))})
## BrandA BrandB BrandC BrandD
## Clone1 0.2163448 0.2319320 0.3055929 0.1905771
## Clone2 0.4702855 0.3639819 0.5773096 0.5520220
## Clone3 0.2688604 0.2683660 0.5395750 0.4260212
# a three-dimensional table
tapply(Growth.rate,list(Daphnia,Detergent,Water),mean)
## , , Tyne
##
## BrandA BrandB BrandC BrandD
## Clone1 2.811265 2.775903 3.287529 2.597192
## Clone2 3.307634 4.191188 3.620532 4.105651
## Clone3 4.866524 4.766258 4.534902 3.365766
##
## , , Wear
##
## BrandA BrandB BrandC BrandD
## Clone1 2.653189 3.082377 2.855142 2.656403
## Clone2 4.530371 4.614673 5.925078 6.321838
## Clone3 5.140011 4.629867 3.503892 2.302537
# and use ftable() to flatten the table
ftable(tapply(Growth.rate,list(Daphnia,Detergent,Water),mean))
## Tyne Wear
##
## Clone1 BrandA 2.811265 2.653189
## BrandB 2.775903 3.082377
## BrandC 3.287529 2.855142
## BrandD 2.597192 2.656403
## Clone2 BrandA 3.307634 4.530371
## BrandB 4.191188 4.614673
## BrandC 3.620532 5.925078
## BrandD 4.105651 6.321838
## Clone3 BrandA 4.866524 5.140011
## BrandB 4.766258 4.629867
## BrandC 4.534902 3.503892
## BrandD 3.365766 2.302537
# the fourth argument is an option for the function
tapply(Growth.rate,list(Daphnia,Detergent),mean,na.rm=TRUE)
## BrandA BrandB BrandC BrandD
## Clone1 2.732227 2.929140 3.071335 2.626797
## Clone2 3.919002 4.402931 4.772805 5.213745
## Clone3 5.003268 4.698062 4.019397 2.834151
as.data.frame.table(tapply(Growth.rate,list(Daphnia,Detergent),mean))
## Var1 Var2 Freq
## 1 Clone1 BrandA 2.732227
## 2 Clone2 BrandA 3.919002
## 3 Clone3 BrandA 5.003268
## 4 Clone1 BrandB 2.929140
## 5 Clone2 BrandB 4.402931
## 6 Clone3 BrandB 4.698062
## 7 Clone1 BrandC 3.071335
## 8 Clone2 BrandC 4.772805
## 9 Clone3 BrandC 4.019397
## 10 Clone1 BrandD 2.626797
## 11 Clone2 BrandD 5.213745
## 12 Clone3 BrandD 2.834151
detach(thisData)
# Assuming the previous setup of random number generation and assigning genders
cells <- rnbinom(10000, size=0.63, prob=0.63/1.83)
gender <- rep(c("male", "female"), c(5000, 5000))
# Generate and output frequency table for cells
freq_cells <- table(cells)
freq_cells
## cells
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 5161 2105 1149 661 356 188 138 101 42 33 24 19 11 1 2 4
## 16 18 19 20
## 2 1 1 1
# Generate and output cross tabulation of cells by gender
freq_cells_gender <- table(cells, gender)
freq_cells_gender
## gender
## cells female male
## 0 2601 2560
## 1 1055 1050
## 2 551 598
## 3 319 342
## 4 180 176
## 5 101 87
## 6 62 76
## 7 54 47
## 8 22 20
## 9 17 16
## 10 12 12
## 11 9 10
## 12 8 3
## 13 0 1
## 14 2 0
## 15 4 0
## 16 1 1
## 18 1 0
## 19 0 1
## 20 1 0
# Creating a matrix of counts
counts <- matrix(c(2, 2, 4, 3, 1, 4, 2, 9, 1, 5, 3, 3), nrow = 4)
# Creating row-wise probability table
rowwise_table <- prop.table(counts, 1)
print("Row-wise probability table:")
## [1] "Row-wise probability table:"
print(rowwise_table)
## [,1] [,2] [,3]
## [1,] 0.5000000 0.2500000 0.2500000
## [2,] 0.1818182 0.3636364 0.4545455
## [3,] 0.4444444 0.2222222 0.3333333
## [4,] 0.2000000 0.6000000 0.2000000
# Creating column-wise probability table
colwise_table <- prop.table(counts, 2)
print("Column-wise probability table:")
## [1] "Column-wise probability table:"
print(colwise_table)
## [,1] [,2] [,3]
## [1,] 0.1818182 0.0625 0.08333333
## [2,] 0.1818182 0.2500 0.41666667
## [3,] 0.3636364 0.1250 0.25000000
## [4,] 0.2727273 0.5625 0.25000000
# Creating table-wise probability table
tablewise_table <- prop.table(counts)
print("Table-wise probability table:")
## [1] "Table-wise probability table:"
print(tablewise_table)
## [,1] [,2] [,3]
## [1,] 0.05128205 0.02564103 0.02564103
## [2,] 0.05128205 0.10256410 0.12820513
## [3,] 0.10256410 0.05128205 0.07692308
## [4,] 0.07692308 0.23076923 0.07692308
# Load the gtools package
library(gtools)
# Define the vector of elements
x <- c("A", "B", "C", "D", "E", "F")
# Define the size of the target group (r)
r <- 4
# Calculate and print permutations of elements
perm <- permutations(n = length(x), r = r, v = x, repeats.allowed = FALSE)
head(perm) # There are 360 permutations of 6 elements taken 4 at a time
## [,1] [,2] [,3] [,4]
## [1,] "A" "B" "C" "D"
## [2,] "A" "B" "C" "E"
## [3,] "A" "B" "C" "F"
## [4,] "A" "B" "D" "C"
## [5,] "A" "B" "D" "E"
## [6,] "A" "B" "D" "F"
# Calculate and print combinations of elements
comb <- combinations(n = length(x), r = r, v = x)
print(comb) # There are 15 combinations of 6 elements taken 4 at a time
## [,1] [,2] [,3] [,4]
## [1,] "A" "B" "C" "D"
## [2,] "A" "B" "C" "E"
## [3,] "A" "B" "C" "F"
## [4,] "A" "B" "D" "E"
## [5,] "A" "B" "D" "F"
## [6,] "A" "B" "E" "F"
## [7,] "A" "C" "D" "E"
## [8,] "A" "C" "D" "F"
## [9,] "A" "C" "E" "F"
## [10,] "A" "D" "E" "F"
## [11,] "B" "C" "D" "E"
## [12,] "B" "C" "D" "F"
## [13,] "B" "C" "E" "F"
## [14,] "B" "D" "E" "F"
## [15,] "C" "D" "E" "F"
practiceCalculation = function (x) {
sumX = sum(x) # sum of vector x
sumOfSq = sum(x^2) # sum of squares of vector x
meanX = mean(x) # mean of vector x
SS = sum((meanX-x)^2) # sum of squared deviations from the mean
n = length(x) # number of observations
sd = sqrt(SS / n) # population standard deviation
sampleSd = sqrt(SS / (n-1)) # sample standard deviation
seOfMean = sd / sqrt(n) # standard error of the mean (uses the population SD; psych::describe uses the sample SD, hence its slightly larger se below)
return(list(sum=sumX, sumOfSq=sumOfSq, SS=SS, sd=sd, sampleSd=sampleSd, seOfMean=seOfMean))
}
x1 = c(25.0, 5.0, 76.0, 43.54, 43.34, 154, 64.34, 54.6, 76, 3)
calcResults = practiceCalculation(x1)
calcResults
## $sum
## [1] 544.82
##
## $sumOfSq
## [1] 46821.88
##
## $SS
## [1] 17139
##
## $sd
## [1] 41.39927
##
## $sampleSd
## [1] 43.63867
##
## $seOfMean
## [1] 13.0916
# Built-in functions and the psych package can further describe the data
installationStatus <- require(psych)
if (!installationStatus) {
install.packages("psych")
library(psych)
}
describe(x1)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10 54.48 43.64 49.07 48.48 37.81 3 154 151 0.87 0.1 13.8
sum(x1)
## [1] 544.82
sd(x1)
## [1] 43.63867
x = c(1,1,1,3,4,5)
# Create a histogram for continuous-like data.
hist(x, main="Histogram of x", xlab="Value", ylab="Frequency", col="blue")
# Create a barplot for discrete data.
# Count the number of occurrences of each unique value
counts <- table(x)
barplot(counts, main="Barplot of x", xlab="Value", ylab="Frequency", col="red")
# Create a histogram with densities instead of frequencies
hist(x, freq=FALSE, main="Histogram of x with Density", xlab="Value", ylab="Density", col="green")
x = c(rep(letters[1:5],3),rep(letters[6:10],4),rep(letters[11],8))
# Calculate the mode (this helper assumes a numeric vector: as.numeric() turns
# character values into NA, which is why the call below warns and returns NA)
getMode <- function(v) {
  vTable <- table(v)
  vMode <- as.numeric(names(vTable[vTable == max(vTable)]))
  if(any(is.na(vMode))) {
    return(NA) # no numeric mode recoverable
  }
  return(vMode)
}
# Display mode
mode_x <- getMode(x)
## Warning in getMode(x): NAs introduced by coercion
# Create a numeric vector for summary
x_numeric <- c(1,3,3,5)
# Calculate mean
mean_x <- mean(x_numeric)
# Calculate sum of squares (SS)
ss_x <- sum((x_numeric - mean_x)^2)
# Calculate the population variance (sigma^2 = SS / n)
variance_x <- ss_x / length(x_numeric)
# Calculate the sample standard deviation (s, from sd())
sd_x <- sd(x_numeric)
# Use psych package for descriptive statistics
library(psych)
psych_desc <- describe(x_numeric)
# Output results
list(mode = mode_x, mean = mean_x, ss = ss_x, variance = variance_x, sd = sd_x, psych_desc = psych_desc)
## $mode
## [1] NA
##
## $mean
## [1] 3
##
## $ss
## [1] 8
##
## $variance
## [1] 2
##
## $sd
## [1] 1.632993
##
## $psych_desc
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 4 3 1.63 3 3 1.48 1 5 4 0 -1.88 0.82
Distribution | R name | Distribution | R name |
---|---|---|---|
Beta | beta | Log-normal | lnorm |
Binomial | binom | Negative Binomial | nbinom |
Cauchy | cauchy | Normal | norm |
Chi-square | chisq | Poisson | pois |
Exponential | exp | Student t | t |
Fisher F | f | Uniform | unif |
Gamma | gamma | Tukey (Studentized range) | tukey |
Geometric | geom | Weibull | weibull |
Hypergeometric | hyper | Wilcoxon | wilcox |
Logistic | logis | - | - |
name | description | usage |
---|---|---|
dname() | density (or probability mass) function | dnorm(0) = 1/sqrt(2*pi); dnorm(1) = exp(-1/2)/sqrt(2*pi) = 1/sqrt(2*pi*exp(1)) |
pname() | cumulative distribution function | pnorm(-1.63) = 0.05155 (the area under the standard normal curve to the left of -1.63) |
qname() | quantile function | qnorm(0.9) = 1.2816 (1.2816 is the 90th percentile of the standard normal distribution) |
rname() | random deviates | rnorm(100, mean=50, sd=10) generates 100 random deviates from a normal distribution with the given parameters |
Distribution | p (probability) function | q (quantile) function | d (density) function | r (random) function |
---|---|---|---|---|
Beta | pbeta | qbeta | dbeta | rbeta |
Binomial | pbinom | qbinom | dbinom | rbinom |
Cauchy | pcauchy | qcauchy | dcauchy | rcauchy |
Chi-Square | pchisq | qchisq | dchisq | rchisq |
Exponential | pexp | qexp | dexp | rexp |
F | pf | qf | df | rf |
Gamma | pgamma | qgamma | dgamma | rgamma |
Geometric | pgeom | qgeom | dgeom | rgeom |
Hypergeometric | phyper | qhyper | dhyper | rhyper |
Logistic | plogis | qlogis | dlogis | rlogis |
Log Normal | plnorm | qlnorm | dlnorm | rlnorm |
Negative Binomial | pnbinom | qnbinom | dnbinom | rnbinom |
Normal | pnorm | qnorm | dnorm | rnorm |
Poisson | ppois | qpois | dpois | rpois |
Student t | pt | qt | dt | rt |
Studentized Range | ptukey | qtukey | dtukey | rtukey |
Uniform | punif | qunif | dunif | runif |
Weibull | pweibull | qweibull | dweibull | rweibull |
Wilcoxon Rank Sum Statistic | pwilcox | qwilcox | dwilcox | rwilcox |
Wilcoxon Signed Rank Statistic | psignrank | qsignrank | dsignrank | rsignrank |
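# A small illustration of the four prefixes for one distribution (the Normal);
# dnorm/pnorm/qnorm return fixed values, rnorm is random
dnorm(0)     # density at 0: 1/sqrt(2*pi), about 0.3989
pnorm(1.96)  # P(Z <= 1.96), about 0.975
qnorm(0.975) # 97.5th percentile, about 1.96
rnorm(3)     # three random deviates (values differ from run to run)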
# http://www.statmethods.net/advgraphs/probability.html
pnorm(-2.58)
## [1] 0.004940016
pnorm(-1.63)
## [1] 0.05155075
pnorm(1.63)
## [1] 0.9484493
pnorm(2.58)
## [1] 0.99506
qnorm(0.005)
## [1] -2.575829
qnorm(0.025)
## [1] -1.959964
qnorm(0.975)
## [1] 1.959964
qnorm(0.995)
## [1] 2.575829
# mu = 100, sigma = 15: P(one individual scores above 118)
mu = 100; sigma = 15; n = 1
se = sigma/sqrt(n)
z = (118-100)/se
1 - pnorm(z)
## [1] 0.1150697
# mu = 100, sigma = 15: P(one individual scores between 79 and 94)
mu = 100; sigma = 15; n = 1
se = sigma/sqrt(n)
z_low = (79-100)/se
z_high = (94-100)/se
pnorm(z_high) - pnorm(z_low)
## [1] 0.2638216
# mu = 100, sigma = 15: P(the mean of a sample of n = 4 exceeds 118)
mu = 100; sigma = 15; n = 4
se = sigma/sqrt(n)
# z-score for sample means
z = (118-100)/se
1 - pnorm(z)
## [1] 0.008197536
# mu = 100, sigma = 15: 95% confidence interval for the mean of a sample of n = 36
mu = 100; sigma = 15; n = 36
se = sigma/sqrt(n)
z_low = qnorm(0.025)
z_high = qnorm(0.975)
# x = z*se + mu
z_low*se + mu
## [1] 95.10009
z_high*se + mu
## [1] 104.8999
# From Quick-R http://www.statmethods.net
# Display the Student's t distributions with various degrees of freedom and compare to the normal distribution
x = seq(-4, 4, length=100)
hx = dnorm(x)
degf = c(1, 3, 8, 30)
colors = c("red", "blue", "darkgreen", "gold")
labels = c("df=1", "df=3", "df=8", "df=30")
plot(x, hx, type="l", lty=2, xlab="x value", ylab="Density",
main="Comparison of t Distributions")
for (i in 1:4){
lines(x, dt(x, degf[i]), lwd=2, col=colors[i])
}
legend("topright", inset=.05, title="Distributions", c(labels, "normal"),
lwd=2, lty=c(1, 1, 1, 1, 2), col=c(colors, "black"))
# Given: mu = 50, sigma = 10, n = 10, x_bar = 58
mu = 50; sigma = 10; n = 10; x_bar = 58
se = sigma / sqrt(n)
z_low = qnorm(0.025)
z_high = qnorm(0.975)
# Confidence interval: x_bar ± z*se
lower = x_bar + z_low * se
upper = x_bar + z_high * se
c(lower, upper)
## [1] 51.80205 64.19795
# Function to calculate sample variance (unbiased estimator)
s2 = function(x) { sum((x - mean(x))^2) / (length(x) - 1) }
# Function to calculate population variance (biased estimator)
variance = function(x) { sum((x - mean(x))^2) / length(x) }
x = c(2, 4, 6, 8)
library(gtools)
sample_size = 2
perm = permutations(length(x), sample_size, x, repeats.allowed=TRUE)
# Calculating variance for all permutations
thisData = cbind(perm, apply(perm, 1, s2), apply(perm, 1, variance))
colnames(thisData) = c("X_1", "X_2", "sample_var", "population_var")
thisData = as.data.frame(thisData)
thisData
## X_1 X_2 sample_var population_var
## 1 2 2 0 0
## 2 2 4 2 1
## 3 2 6 8 4
## 4 2 8 18 9
## 5 4 2 2 1
## 6 4 4 0 0
## 7 4 6 2 1
## 8 4 8 8 4
## 9 6 2 8 4
## 10 6 4 2 1
## 11 6 6 0 0
## 12 6 8 2 1
## 13 8 2 18 9
## 14 8 4 8 4
## 15 8 6 2 1
## 16 8 8 0 0
dfs = c(5, 10, 20, 40, 80, 1000)
alpha_0.05 = qt(0.975, dfs) # Two-tailed test, alpha = 0.05
alpha_0.01 = qt(0.995, dfs) # Two-tailed test, alpha = 0.01
thisData = data.frame(df = dfs, alpha_0.05 = alpha_0.05, alpha_0.01 = alpha_0.01)
thisData
## df alpha_0.05 alpha_0.01
## 1 5 2.570582 4.032143
## 2 10 2.228139 3.169273
## 3 20 2.085963 2.845340
## 4 40 2.021075 2.704459
## 5 80 1.990063 2.638691
## 6 1000 1.962339 2.580755
# Given: mu = 50, n = 100, x_bar = 54, s = 19
mu = 50; n = 100; x_bar = 54; s = 19
se = s / sqrt(n)
t_value = qt(0.975, n - 1)
# Confidence interval: x_bar ± t*se
lower = x_bar - t_value * se
upper = x_bar + t_value * se
c(lower, upper)
## [1] 50.22999 57.77001
set.seed(123)
par(mfrow=c(1,2))
# Positive relationship
x = rnorm(100, mean=100, sd=10)
y_pos = jitter(x, factor=10, amount=10)
plot(x, y_pos, main="Positive Relationship", xlab="X", ylab="Y")
abline(lm(y_pos ~ x), col="red")
# Negative relationship
y_neg = jitter(-x, factor=10, amount=10)
plot(x, y_neg, main="Negative Relationship", xlab="X", ylab="Y")
abline(lm(y_neg ~ x), col="red")
# Data
x = seq(1, 6)
y = c(0.440, 0.515, 0.550, 0.575, 0.605, 0.615)
# Calculations
cov_xy = sum((x - mean(x)) * (y - mean(y))) / (length(x) - 1)
s_x = sd(x)
s_y = sd(y)
r = cov_xy / (s_x * s_y)
# Output
cov_xy
## [1] 0.117
s_x
## [1] 1.870829
s_y
## [1] 0.06511528
r
## [1] 0.9604371
x = seq(1, 6)
y = c(0.440, 0.515, 0.550, 0.575, 0.605, 0.615)
# Correlation test
cor.test(x, y)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 6.8973, df = 4, p-value = 0.002317
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6750314 0.9958104
## sample estimates:
## cor
## 0.9604371
# Descriptive statistics
library(psych)
describe(cbind(x, y))
## vars n mean sd median trimmed mad min max range skew kurtosis se
## x 1 6 3.50 1.87 3.50 3.50 2.22 1.00 6.00 5.00 0.00 -1.80 0.76
## y 2 6 0.55 0.07 0.56 0.55 0.07 0.44 0.62 0.17 -0.55 -1.37 0.03
x = seq(1, 6)
y = c(0.440, 0.515, 0.550, 0.575, 0.605, 0.615)
# Linear model
fit = lm(y ~ x)
summary(fit)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## 1 2 3 4 5 6
## -0.026429 0.015143 0.016714 0.008286 0.004857 -0.018571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.433000 0.018875 22.941 0.0000214 ***
## x 0.033429 0.004847 6.897 0.00232 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02027 on 4 degrees of freedom
## Multiple R-squared: 0.9224, Adjusted R-squared: 0.903
## F-statistic: 47.57 on 1 and 4 DF, p-value: 0.002317
# Descriptive statistics
library(psych)
describe(cbind(x, y))
## vars n mean sd median trimmed mad min max range skew kurtosis se
## x 1 6 3.50 1.87 3.50 3.50 2.22 1.00 6.00 5.00 0.00 -1.80 0.76
## y 2 6 0.55 0.07 0.56 0.55 0.07 0.44 0.62 0.17 -0.55 -1.37 0.03
# Generating data
set.seed(123)
thisData = data.frame(matrix(nrow=6, ncol=7))
for (i in 1:7) {
x = sort(rnorm(1000, mean=100, sd=i))
thisData[[i]] <- sample(x, 6)
}
# Correlation matrix
cor_matrix = cor(thisData)
cor_matrix
## X1 X2 X3 X4 X5 X6
## X1 1.00000000 0.3346496 -0.1396935 -0.2662075 -0.1443629 -0.01556647
## X2 0.33464956 1.0000000 -0.5594597 0.3145060 0.4879924 0.61960513
## X3 -0.13969355 -0.5594597 1.0000000 -0.3192769 0.3309353 -0.30697962
## X4 -0.26620746 0.3145060 -0.3192769 1.0000000 0.3812170 -0.30828010
## X5 -0.14436287 0.4879924 0.3309353 0.3812170 1.0000000 0.30524693
## X6 -0.01556647 0.6196051 -0.3069796 -0.3082801 0.3052469 1.00000000
## X7 -0.17853580 -0.2479134 0.2468131 -0.8615435 -0.1956732 0.56850026
## X7
## X1 -0.1785358
## X2 -0.2479134
## X3 0.2468131
## X4 -0.8615435
## X5 -0.1956732
## X6 0.5685003
## X7 1.0000000
# Computing p-value for one correlation
cor.test(thisData[[1]], thisData[[2]])
##
## Pearson's product-moment correlation
##
## data: thisData[[1]] and thisData[[2]]
## t = 0.71025, df = 4, p-value = 0.5168
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.6547285 0.9014007
## sample estimates:
## cor
## 0.3346496
# Function to compute p-values for a correlation matrix
# (the upper triangle holds the p-values; the lower triangle and diagonal are set to NA)
cor.prob <- function(X, dfr = nrow(X) - 2) {
  R <- cor(X)
  above <- row(R) < col(R)          # logical index of the upper triangle
  r2 <- R[above]^2
  Fstat <- r2 * dfr / (1 - r2)      # convert r^2 to an F statistic with (1, dfr) df
  R[above] <- 1 - pf(Fstat, 1, dfr) # replace the upper-triangle correlations with p-values
  R[!above] <- NA
  diag(R) <- NA
  R
}
# Correlation p-values
cor_pvalues = cor.prob(thisData)
cor_pvalues
## X1 X2 X3 X4 X5 X6 X7
## X1 NA 0.5167644 0.7918227 0.6101214 0.7849600 0.9766522 0.73504171
## X2 NA NA 0.2483645 0.5437955 0.3261158 0.1895288 0.63574843
## X3 NA NA NA 0.5373579 0.5217187 0.5539949 0.63729788
## X4 NA NA NA NA 0.4558750 0.5522288 0.02742819
## X5 NA NA NA NA NA 0.5563504 0.71023618
## X6 NA NA NA NA NA NA 0.23911713
## X7 NA NA NA NA NA NA NA
# Data
Subject = c("A", "B", "C", "D", "E", "F")
Unprimed = c(550, 1030, 760, 600, 940, 820)
Primed = c(510, 980, 770, 600, 870, 760)
# Calculations
Difference = Unprimed - Primed
mean_diff = mean(Difference)
sd_diff = sd(Difference)
n = length(Difference)
se = sd_diff / sqrt(n)
t_value = (mean_diff - 0) / se
df = n - 1
p_value = 2 * pt(-abs(t_value), df)
# Output
mean_diff
## [1] 35
sd_diff
## [1] 32.71085
t_value
## [1] 2.620908
df
## [1] 5
p_value
## [1] 0.0470462
x = c(550, 1030, 760, 600, 940, 820)
y = c(510, 980, 770, 600, 870, 760)
# Paired t-test
t.test(x, y, paired=TRUE)
##
## Paired t-test
##
## data: x and y
## t = 2.6209, df = 5, p-value = 0.04705
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 0.6720635 69.3279365
## sample estimates:
## mean difference
## 35
# Descriptive statistics
library(psych)
describe(cbind(x, y))
## vars n mean sd median trimmed mad min max range skew kurtosis
## x 1 6 783.33 187.26 790 783.33 252.04 550 1030 480 0.00 -1.86
## y 2 6 748.33 171.98 765 748.33 200.15 510 980 470 -0.08 -1.70
## se
## x 76.45
## y 70.21
The variance of the sum or difference of two independent variables is equal to the sum of their variances.
For an independent-samples t-test:
\[ \text{var}(\bar{x}_1 - \bar{x}_2) = \frac{s_1^2}{n_1} + \frac{s_2^2}{n_2} \]
Standard error of the difference:
\[ \text{SE}(\bar{x}_1 - \bar{x}_2) = \sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}} \]
x = c(550, 1030, 760, 600, 940, 820)
y = c(510, 980, 770, 600, 870, 760)
# Independent t-test
t.test(x, y, paired=FALSE)
##
## Welch Two Sample t-test
##
## data: x and y
## t = 0.3372, df = 9.9284, p-value = 0.743
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -196.5012 266.5012
## sample estimates:
## mean of x mean of y
## 783.3333 748.3333
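# Checking the formulas above by hand: this gives the same statistic as the Welch t
se_diff <- sqrt(var(x)/length(x) + var(y)/length(y))
se_diff
(mean(x) - mean(y)) / se_diff # about 0.337, matching t above (Welch only adjusts the df)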
# Dependent t-test
t.test(x, y, paired=TRUE)
##
## Paired t-test
##
## data: x and y
## t = 2.6209, df = 5, p-value = 0.04705
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 0.6720635 69.3279365
## sample estimates:
## mean difference
## 35
# Descriptive statistics
library(psych)
describe(cbind(x, y))
## vars n mean sd median trimmed mad min max range skew kurtosis
## x 1 6 783.33 187.26 790 783.33 252.04 550 1030 480 0.00 -1.86
## y 2 6 748.33 171.98 765 748.33 200.15 510 980 470 -0.08 -1.70
## se
## x 76.45
## y 70.21
# Data
x = c(rep(0, 6), rep(1, 6))
y = c(55, 73, 76, 66, 72, 81, 71, 93, 77, 89, 101, 97)
# Linear model
model = lm(y ~ x)
summary(model)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.000 -6.125 2.000 6.375 13.000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.500 4.273 16.499 1.4e-08 ***
## x 17.500 6.043 2.896 0.0159 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.47 on 10 degrees of freedom
## Multiple R-squared: 0.4561, Adjusted R-squared: 0.4017
## F-statistic: 8.387 on 1 and 10 DF, p-value: 0.01594
# Independent t-test
t.test(y ~ x, var.equal=TRUE)
##
## Two Sample t-test
##
## data: y by x
## t = -2.896, df = 10, p-value = 0.01594
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -30.964425 -4.035575
## sample estimates:
## mean in group 0 mean in group 1
## 70.5 88.0
# Assuming 'das.txt' is available in the working directory
data = read.table("data/das.txt", header=TRUE)
# Simulating data similar to 'das.txt'
set.seed(123)
data = data.frame(y = rnorm(100))
par(mfrow=c(2,2))
plot(data$y, main="Scatter Plot of Y", ylab="Y", xlab="Index")
boxplot(data$y, main="Boxplot of Y", ylab="Y")
hist(data$y, main="Histogram of Y", xlab="Y", breaks=10)
# Introduce an outlier
data$y2 = data$y
data$y2[52] = 21.75
plot(data$y2, main="Scatter Plot with Outlier", ylab="Y", xlab="Index")
# Generating non-normal data
x = exp(rnorm(30))
# Shapiro-Wilk test
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.57719, p-value = 3.888e-08
y = c(1, -2, 2, 1, -4, 5, -8, -5, 2, 5, 6)
# Test whether mean of y equals 2
t.test(y, mu=2)
##
## One Sample t-test
##
## data: y
## t = -1.2678, df = 10, p-value = 0.2336
## alternative hypothesis: true mean is not equal to 2
## 95 percent confidence interval:
## -2.762942 3.308396
## sample estimates:
## mean of x
## 0.2727273
# Data
x = factor(c(rep(1, 8), rep(2, 5)), labels = c("A", "B"))
y = c(7, 5, 6, 8, 4, 5, 4, 6, 3, 5, 4, 6, 2)
# Independent t-test
t.test(y ~ x)
##
## Welch Two Sample t-test
##
## data: y by x
## t = 1.8792, df = 7.8456, p-value = 0.09775
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
## -0.3759343 3.6259343
## sample estimates:
## mean in group A mean in group B
## 5.625 4.000
x = c(7, 5, 6, 8, 4, 5, 4, 6)
y = c(3, 5, 4, 6, 2)
# Independent t-test
t.test(x, y)
##
## Welch Two Sample t-test
##
## data: x and y
## t = 1.8792, df = 7.8456, p-value = 0.09775
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3759343 3.6259343
## sample estimates:
## mean of x mean of y
## 5.625 4.000
x = c(4, 5, 6, 4, 5, 6, 6, 4)
y = c(2, 1, 4, 5, 6, 9, 2, 2)
# Paired t-test
t.test(x, y, paired=TRUE)
##
## Paired t-test
##
## data: x and y
## t = 1.2567, df = 7, p-value = 0.2492
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.9917538 3.2417538
## sample estimates:
## mean difference
## 1.125
# Function to compare two correlation coefficients
compcorr = function(n1, r1, n2, r2){
# Fisher's Z-transform
zf1 = 0.5 * log((1 + r1) / (1 - r1))
zf2 = 0.5 * log((1 + r2) / (1 - r2))
# Difference between Z-transformed correlations
dz = (zf1 - zf2) / sqrt(1 / (n1 - 3) + 1 / (n2 - 3))
# p-value (two-tailed)
p_value = 2 * (1 - pnorm(abs(dz)))
return(list(diff = dz, p_value = p_value))
}
# Example data
r_male = -0.506; n_male = 52
r_female = -0.381; n_female = 51
compcorr(n_male, r_male, n_female, r_female)
## $diff
## [1] -0.7687093
##
## $p_value
## [1] 0.4420659
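# A related sketch: 95% CI for a single correlation via the same Fisher z-transform
r = -0.506; n = 52
z = 0.5 * log((1 + r) / (1 - r))       # Fisher z
se = 1 / sqrt(n - 3)                   # standard error of z
tanh(z + c(-1, 1) * qnorm(0.975) * se) # back-transform the CI limits to the r scale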
# Partial correlation function
partialCor = function(x, y, z){
r_xy = cor(x, y)
r_xz = cor(x, z)
r_yz = cor(y, z)
numerator = r_xy - r_xz * r_yz
denominator = sqrt((1 - r_xz^2) * (1 - r_yz^2))
partial_corr = numerator / denominator
return(partial_corr)
}
# Semi-partial correlation function
semipartialCor = function(x, y, z){
r_xy = cor(x, y)
r_xz = cor(x, z)
numerator = r_xy - cor(z, y) * r_xz
denominator = sqrt(1 - r_xz^2)
semipartial_corr = numerator / denominator
return(semipartial_corr)
}
# Example data
child = letters[1:8]
shoe_size = c(5.2, 4.7, 7.0, 5.8, 7.2, 6.9, 7.7, 8.0)
reading_level = c(1.7, 1.5, 2.7, 3.1, 3.9, 4.5, 5.1, 7.4)
age = c(5, 6, 7, 8, 9, 10, 11, 12)
# Correlation matrix
library(Hmisc)
rcorr_matrix = rcorr(cbind(reading_level, age, shoe_size))
rcorr_matrix$r # Correlation coefficients
## reading_level age shoe_size
## reading_level 1.0000000 0.9603441 0.8699218
## age 0.9603441 1.0000000 0.8719177
## shoe_size 0.8699218 0.8719177 1.0000000
# Partial correlation between reading_level and shoe_size controlling for age
partial_correlation = partialCor(reading_level, shoe_size, age)
partial_correlation
## [1] 0.2386455
# Semi-partial correlation
semipartial_correlation = semipartialCor(reading_level, shoe_size, age)
semipartial_correlation
## [1] 0.1168533
# Assuming 'mtcars.txt' is available in the working directory
thisData = read.table("data/mtcars.txt", header=TRUE)
data(mtcars)
thisData = mtcars
# Total Sum of Squares (SST)
SST_manual = sum((thisData$disp - mean(thisData$disp))^2)
SST_formula = deviance(lm(disp ~ 1, data=thisData))
SST_manual
## [1] 476184.8
SST_formula
## [1] 476184.8
# Sum of Squares Error (SSE)
SSE = deviance(lm(disp ~ cyl, data=thisData))
SSE
## [1] 88730.7
# Linear model with intercept
fit1 = lm(disp ~ cyl, data=thisData)
summary(fit1)
##
## Call:
## lm(formula = disp ~ cyl, data = thisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73.985 -45.233 3.565 26.688 127.818
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -156.609 35.181 -4.452 0.000109 ***
## cyl 62.599 5.469 11.445 1.8e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.38 on 30 degrees of freedom
## Multiple R-squared: 0.8137, Adjusted R-squared: 0.8075
## F-statistic: 131 on 1 and 30 DF, p-value: 1.803e-12
# Linear model without intercept
fit2 = lm(disp ~ cyl - 1, data=thisData)
summary(fit2)
##
## Call:
## lm(formula = disp ~ cyl - 1, data = thisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.07 -67.47 -36.06 26.34 158.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## cyl 39.179 1.895 20.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.94 on 31 degrees of freedom
## Multiple R-squared: 0.9324, Adjusted R-squared: 0.9302
## F-statistic: 427.6 on 1 and 31 DF, p-value: < 2.2e-16
# Assuming 'ozone.data.txt' is available
ozone.pollution = read.table("data/ozone.data.txt", header=TRUE)
# Simulated ozone pollution data
set.seed(123)
ozone.pollution = data.frame(
ozone = rnorm(100, mean=50, sd=10),
rad = rnorm(100, mean=200, sd=50),
temp = rnorm(100, mean=70, sd=10),
wind = rnorm(100, mean=10, sd=2)
)
attach(ozone.pollution)
# Scatterplot matrix
pairs(ozone.pollution, panel=panel.smooth)
# Generalized additive model (GAM)
library(mgcv)
model = gam(ozone ~ s(rad) + s(temp) + s(wind))
# Plot GAM
par(mfrow=c(2,2))
plot(model)
# Decision tree
library(tree)
## Warning: package 'tree' was built under R version 4.3.3
tree_model = tree(ozone ~ ., data=ozone.pollution)
plot(tree_model)
text(tree_model, pretty=0)
detach(ozone.pollution)
# Given correlations
r_1y = 0.4; r_2y = 0.3; r_12 = 0.2
# Calculating standardized beta coefficients
B_1 = (r_1y - r_2y * r_12) / (1 - r_12^2)
B_2 = (r_2y - r_1y * r_12) / (1 - r_12^2)
B_1
## [1] 0.3541667
B_2
## [1] 0.2291667
# Calculating R and R^2
R = sqrt(B_1 * r_1y + B_2 * r_2y)
R
## [1] 0.458712
R^2
## [1] 0.2104167
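These calculations follow the standard two-predictor formulas in standardized form (the same formulas are reused in the scenarios below):
\[ \beta_1 = \frac{r_{1y} - r_{2y} r_{12}}{1 - r_{12}^2}, \qquad \beta_2 = \frac{r_{2y} - r_{1y} r_{12}}{1 - r_{12}^2}, \qquad R^2 = \beta_1 r_{1y} + \beta_2 r_{2y} \]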
# Predictors positively correlated with Y but negatively correlated with each other
r_1y = 0.4; r_2y = 0.3; r_12 = -0.2
B_1 = (r_1y - r_2y * r_12) / (1 - r_12^2)
B_2 = (r_2y - r_1y * r_12) / (1 - r_12^2)
B_1
## [1] 0.4791667
B_2
## [1] 0.3958333
# Predictor 2 is not correlated with Y but correlated with Predictor 1
r_1y = 0.4; r_2y = 0; r_12 = 0.5
B_1 = (r_1y - r_2y * r_12) / (1 - r_12^2)
B_2 = (r_2y - r_1y * r_12) / (1 - r_12^2)
B_1
## [1] 0.5333333
B_2
## [1] -0.2666667
# Loading mtcars dataset
data(mtcars)
thisData = mtcars
attach(thisData)
par(mfrow=c(2,2))
# Model with one predictor
plot(hp, disp, main="Model with One Predictor", ylab="Displacement (cu.in.)", xlab="Gross horsepower")
abline(lm(disp ~ hp), col="red")
# Model with two predictors (3D scatter plot)
library(scatterplot3d)
s3d = scatterplot3d(hp, wt, disp, pch=16, highlight.3d=TRUE, type="h",
main="Model with Two Predictors", ylab="Weight (lb/1000)",
xlab="Gross horsepower", zlab="Displacement (cu.in.)")
fit = lm(disp ~ hp + wt)
s3d$plane3d(fit)
# Diagnostic plots
plot(fit)
detach(thisData)
data(mtcars)
thisData = mtcars
# Initial model
fit_initial = lm(disp ~ mpg + cyl + hp + drat + wt + qsec + gear, data=thisData)
# Stepwise regression (both directions)
library(MASS)
stepwise_model = stepAIC(fit_initial, direction="both")
## Start: AIC=242.45
## disp ~ mpg + cyl + hp + drat + wt + qsec + gear
##
## Df Sum of Sq RSS AIC
## - drat 1 142.5 38024 240.57
## - cyl 1 1678.2 39560 241.84
## <none> 37881 242.45
## - qsec 1 2645.1 40527 242.61
## - mpg 1 2674.4 40556 242.63
## - hp 1 4247.6 42129 243.85
## - gear 1 5179.1 43061 244.55
## - wt 1 23343.6 61225 255.81
##
## Step: AIC=240.57
## disp ~ mpg + cyl + hp + wt + qsec + gear
##
## Df Sum of Sq RSS AIC
## - cyl 1 2197.3 40221 240.37
## <none> 38024 240.57
## - qsec 1 2507.7 40532 240.61
## - mpg 1 2565.0 40589 240.66
## - hp 1 4189.2 42213 241.91
## + drat 1 142.5 37881 242.45
## - gear 1 6027.8 44052 243.28
## - wt 1 23416.5 61441 253.92
##
## Step: AIC=240.37
## disp ~ mpg + hp + wt + qsec + gear
##
## Df Sum of Sq RSS AIC
## - mpg 1 2108.9 42330 240.00
## <none> 40221 240.37
## + cyl 1 2197.3 38024 240.57
## + drat 1 661.6 39560 241.84
## - hp 1 7854.7 48076 244.07
## - qsec 1 8990.9 49212 244.82
## - gear 1 19274.5 59496 250.89
## - wt 1 31014.4 71236 256.66
##
## Step: AIC=240
## disp ~ hp + wt + qsec + gear
##
## Df Sum of Sq RSS AIC
## <none> 42330 240.00
## + mpg 1 2109 40221 240.37
## + cyl 1 1741 40589 240.66
## + drat 1 349 41981 241.74
## - hp 1 6275 48605 242.42
## - qsec 1 7434 49764 243.18
## - gear 1 17301 59632 248.97
## - wt 1 35075 77405 257.31
stepwise_model$anova # Display the steps
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## disp ~ mpg + cyl + hp + drat + wt + qsec + gear
##
## Final Model:
## disp ~ hp + wt + qsec + gear
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 24 37881.49 242.4474
## 2 - drat 1 142.4784 25 38023.97 240.5676
## 3 - cyl 1 2197.3139 26 40221.29 240.3653
## 4 - mpg 1 2108.8563 27 42330.14 240.0006
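# The object returned by stepAIC() is an ordinary lm fit, so the usual extractors
# apply (a quick sketch)
formula(stepwise_model) # disp ~ hp + wt + qsec + gear
# summary(stepwise_model) gives the coefficients and R-squared of the final model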
# Power analysis for two-sample t-test
library(pwr)
# Parameters
d = 0.4 # Effect size
alpha = 0.05
power = 0.8
# Calculating sample size for independent samples
pwr.t.test(d = d, sig.level = alpha, power = power, type = "two.sample", alternative = "two.sided")
##
## Two-sample t test power calculation
##
## n = 99.08032
## d = 0.4
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Calculating sample size for paired samples
pwr.t.test(d = d, sig.level = alpha, power = power, type = "paired", alternative = "two.sided")
##
## Paired t test power calculation
##
## n = 51.00945
## d = 0.4
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number of *pairs*
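# With n fixed instead of power, the same call solves for the achieved power (a sketch)
pwr.t.test(n = 50, d = 0.4, sig.level = 0.05, type = "two.sample", alternative = "two.sided")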
# Levene's test function (median-centered)
levene.test = function(y, group) {
  meds = tapply(y, group, median, na.rm=TRUE)
  resp = abs(y - meds[group])     # absolute deviations from the group medians
  table = anova(lm(resp ~ group)) # one-way ANOVA on those deviations
  row.names(table)[2] = " "
  cat("Levene's Test for Homogeneity of Variance\n\n")
  table[, c(1, 4, 5)]
}
# Example data
group = factor(c(rep(1, 5), rep(2, 3), rep(3, 5)))
score = c(7, 5, 6, 8, 4, 5, 4, 6, 3, 5, 4, 6, 2)
thisData = data.frame(group = group, score = score)
# Performing one-way ANOVA
anova_result = aov(score ~ group, data = thisData)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 10 5.0 2.273 0.154
## Residuals 10 22 2.2
# Performing Levene's test
levene.test(thisData$score, thisData$group)
## Levene's Test for Homogeneity of Variance
## Df F value Pr(>F)
## group 2 0.5237 0.6077
## 10
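# The car package provides an equivalent, median-centered version of this test (a sketch)
library(car)
leveneTest(score ~ group, data = thisData)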
Don't forget to check the data when you transform the format (with melt() or apply()). Fix the factor variables with as.factor() if necessary. Set sum-to-zero contrasts so that the results are comparable with other statistical software such as SPSS.
# Set contrasts to sum-to-zero
options(contrasts = c("contr.sum", "contr.poly"))
The data format for repeated measures in R is different from that for SPSS. In R (and most other statistical software), the repeated measure should be one of the factors. In other words, each line represents one data point.
# Example data in R format
gender <- c("male", "male", "male", "female", "female", "male", "male", "male", "female", "female", "male", "male", "male", "female", "female")
variable <- c(rep("early", 5), rep("mid", 5), rep("late", 5))
value <- c(4, 10, 5, 8, 2, 6, 9, 4, 6, 2, 2, 8, 3, 7, 2)
data_r <- data.frame(gender, variable, value)
data_r
## gender variable value
## 1 male early 4
## 2 male early 10
## 3 male early 5
## 4 female early 8
## 5 female early 2
## 6 male mid 6
## 7 male mid 9
## 8 male mid 4
## 9 female mid 6
## 10 female mid 2
## 11 male late 2
## 12 male late 8
## 13 male late 3
## 14 female late 7
## 15 female late 2
In SPSS, the repeated measure is represented by multiple data points in one row; that is, the levels of the within-subject factor are spread across columns.
# Example data in SPSS format
early <- c(4, 10, 5, 8, 2)
mid <- c(6, 9, 4, 6, 2)
late <- c(2, 8, 3, 7, 2)
gender <- c("male", "male", "male", "female", "female")
data_spss <- data.frame(early, mid, late, gender)
data_spss
## early mid late gender
## 1 4 6 2 male
## 2 10 9 8 male
## 3 5 4 3 male
## 4 8 6 7 female
## 5 2 2 2 female
Even in SPSS, a data format similar to R's is required for more complex analyses (e.g., ones that involve more than two within-subject factors). In such cases dummy coding is used, so the data format is almost identical to R's. Use melt() in the reshape package to transform data from the SPSS format to the R format.
The total variability partitions into between-subject and within-subject parts: SS_total = SS_bet_sub + SS_within_sub
# Make a dummy data set
thisData <- data.frame(
early = c(4, 10, 5, 8, 2),
mid = c(6, 9, 4, 6, 2),
late = c(2, 8, 3, 7, 2)
)
# SS from each score (total variability)
# sum((score - grand_mean)^2)
grand_mean <- mean(as.matrix(thisData))
SS_total <- sum((as.matrix(thisData) - grand_mean)^2)
SS_total
## [1] 106.4
# SS from row means (within-subject variability)
# sum((score - row_mean)^2)
row_mean <- rowMeans(thisData)
SS_within <- sum((as.matrix(thisData) - row_mean)^2)
SS_within
## [1] 14
# Between-subject variability
# sum(n * (row_mean - grand_mean)^2)
n <- ncol(thisData)
SS_bet_sub <- sum(n * (row_mean - grand_mean)^2)
SS_bet_sub
## [1] 92.4
# Column means (RM variability)
# sum(n * (col_mean - grand_mean)^2)
n <- nrow(thisData)
col_mean <- colMeans(thisData)
SS_rm <- sum(n * (col_mean - grand_mean)^2)
SS_rm
## [1] 5.2
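# Sanity check: the pieces add up as stated above
SS_bet_sub + SS_within # 92.4 + 14 = 106.4, which equals SS_total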
# Load necessary libraries
library(foreign)
library(reshape)
library(psych)
library(nlme)
library(car)
library(ez)
library(gplots)
# Load the data (see https://discoverspss.com/pages/data)
thisData <- read.spss("data/Bushtucker.sav", to.data.frame = TRUE)
summary(thisData)
## stick testicle eye witchetty
## Min. : 5.000 Min. :2.00 Min. :1.000 Min. :1.00
## 1st Qu.: 6.750 1st Qu.:2.75 1st Qu.:1.750 1st Qu.:4.25
## Median : 8.000 Median :4.50 Median :4.000 Median :6.50
## Mean : 8.125 Mean :4.25 Mean :4.125 Mean :5.75
## 3rd Qu.: 9.250 3rd Qu.:5.25 3rd Qu.:6.250 3rd Qu.:8.00
## Max. :12.000 Max. :7.00 Max. :8.000 Max. :9.00
describe(thisData) # p.474
## thisData
##
## 4 Variables 8 Observations
## --------------------------------------------------------------------------------
## stick
## n missing distinct Info Mean Gmd
## 8 0 7 0.988 8.125 2.679
##
## Value 5 6 7 8 9 10 12
## Frequency 1 1 1 2 1 1 1
## Proportion 0.125 0.125 0.125 0.250 0.125 0.125 0.125
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## testicle
## n missing distinct Info Mean Gmd
## 8 0 6 0.976 4.25 2.214
##
## Value 2 3 4 5 6 7
## Frequency 2 1 1 2 1 1
## Proportion 0.250 0.125 0.125 0.250 0.125 0.125
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## eye
## n missing distinct Info Mean Gmd
## 8 0 7 0.988 4.125 3.321
##
## Value 1 2 3 5 6 7 8
## Frequency 2 1 1 1 1 1 1
## Proportion 0.250 0.125 0.125 0.125 0.125 0.125 0.125
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## witchetty
## n missing distinct Info Mean Gmd
## 8 0 7 0.988 5.75 3.429
##
## Value 1 2 5 6 7 8 9
## Frequency 1 1 1 1 1 2 1
## Proportion 0.125 0.125 0.125 0.125 0.125 0.250 0.125
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
# Change to the sum-to-zero convention (cf. Type II and Type III sums of squares)
# options(contrasts = c("contr.sum", "contr.poly"))
Anova() in the car package
# Fit the model using lm()
fit1 <- lm(as.matrix(thisData) ~ 1)
# Prepare the factors
factors.thisData <- colnames(thisData)
# Perform ANOVA
#fit1.aov <- Anova(fit1, idata = data.frame(factors.thisData), idesign = ~factors.thisData, type = "III")
#summary(fit1.aov)
# Add subject identifier
# thisData$subject <- as.factor(rownames(thisData))
# Melt the data
# thisData.molten <- melt(thisData, id.vars = "subject")
# head(thisData.molten)
lme() and aov()
Note: aov() and lme() do not support the Mauchly test or Tukey HSD.
# Fit the model using lme()
#fit2 <- lme(value ~ variable, random = ~1 | subject / variable, data = thisData.molten)
#summary(fit2)
# Fit the model using aov()
#fit3 <- aov(value ~ variable + Error(subject / variable), data = thisData.molten)
#summary(fit3)
ezANOVA()
# Fit the model using ezANOVA
#fit4 <- ezANOVA(data = thisData.molten, dv = .(value), wid = .(subject), within = .(variable))
#fit4
# Load the data
thisData <- read.spss("data/Attitude.sav", to.data.frame = TRUE)
describe(thisData)
## thisData
##
## 9 Variables 20 Observations
## --------------------------------------------------------------------------------
## beerpos
## n missing distinct Info Mean Gmd .05 .10
## 20 0 14 0.994 21.05 15.25 1.00 6.40
## .25 .50 .75 .90 .95
## 12.75 18.50 31.00 40.00 40.15
##
## Value 1 7 8 9 14 15 17 20 22 26 30 34 40
## Frequency 2 1 1 1 1 3 1 1 1 1 2 2 2
## Proportion 0.10 0.05 0.05 0.05 0.05 0.15 0.05 0.05 0.05 0.05 0.10 0.10 0.10
##
## Value 43
## Frequency 1
## Proportion 0.05
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## beerneg
## n missing distinct Info Mean Gmd .05 .10
## 20 0 17 0.996 4.45 20.11 -18.05 -17.10
## .25 .50 .75 .90 .95
## -9.50 0.00 20.25 27.30 30.00
##
## Value -19 -18 -17 -12 -11 -9 -8 -6 6 12 15 17 20
## Frequency 1 1 1 1 1 1 1 3 1 1 1 1 1
## Proportion 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.15 0.05 0.05 0.05 0.05 0.05
##
## Value 21 23 27 30
## Frequency 1 1 1 2
## Proportion 0.05 0.05 0.05 0.10
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## beerneut
## n missing distinct Info Mean Gmd .05 .10
## 20 0 18 0.998 10 11.82 -5.25 -0.50
## .25 .50 .75 .90 .95
## 4.00 8.00 16.00 26.10 27.05
##
## Value -10 -5 0 3 4 5 6 7 8 9 12 13 15
## Frequency 1 1 1 1 2 1 1 1 2 1 1 1 1
## Proportion 0.05 0.05 0.05 0.05 0.10 0.05 0.05 0.05 0.10 0.05 0.05 0.05 0.05
##
## Value 19 21 26 27 28
## Frequency 1 1 1 1 1
## Proportion 0.05 0.05 0.05 0.05 0.05
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## winepos
## n missing distinct Info Mean Gmd .05 .10
## 20 0 14 0.994 25.35 7.774 14.80 16.80
## .25 .50 .75 .90 .95
## 22.25 25.00 29.25 34.00 34.20
##
## Value 11 15 17 20 23 24 26 27 28 29 30 32 34
## Frequency 1 1 1 2 2 3 1 1 2 1 1 1 2
## Proportion 0.05 0.05 0.05 0.10 0.10 0.15 0.05 0.05 0.10 0.05 0.05 0.05 0.10
##
## Value 38
## Frequency 1
## Proportion 0.05
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## wineneg
## n missing distinct Info Mean Gmd .05 .10
## 20 0 16 0.992 -12 7.179 -22.05 -18.40
## .25 .50 .75 .90 .95
## -15.25 -13.50 -6.75 -3.80 -2.00
##
## Value -23 -22 -18 -17 -16 -15 -14 -13 -12 -10 -9 -7 -6
## Frequency 1 1 1 1 1 4 1 1 1 1 1 1 1
## Proportion 0.05 0.05 0.05 0.05 0.05 0.20 0.05 0.05 0.05 0.05 0.05 0.05 0.05
##
## Value -5 -4 -2
## Frequency 1 1 2
## Proportion 0.05 0.05 0.10
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## wineneut
## n missing distinct Info Mean Gmd .05 .10
## 20 0 13 0.993 11.65 7.289 3.80 4.00
## .25 .50 .75 .90 .95
## 6.00 12.50 16.50 19.10 20.05
##
## Value 0 4 6 7 12 13 14 15 16 18 19 20 21
## Frequency 1 2 3 2 2 1 2 1 1 1 2 1 1
## Proportion 0.05 0.10 0.15 0.10 0.10 0.05 0.10 0.05 0.05 0.05 0.10 0.05 0.05
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## waterpos
## n missing distinct Info Mean Gmd .05 .10
## 20 0 15 0.996 17.4 8.158 8.85 9.00
## .25 .50 .75 .90 .95
## 12.00 17.00 21.00 27.20 29.20
##
## Value 6 9 10 12 13 15 16 17 19 20 21 23 27
## Frequency 1 2 1 2 1 1 1 2 2 1 2 1 1
## Proportion 0.05 0.10 0.05 0.10 0.05 0.05 0.05 0.10 0.10 0.05 0.10 0.05 0.05
##
## Value 29 33
## Frequency 1 1
## Proportion 0.05 0.05
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## waterneg
## n missing distinct Info Mean Gmd .05 .10
## 20 0 14 0.994 -9.2 7.937 -19.05 -17.20
## .25 .50 .75 .90 .95
## -14.50 -10.00 -4.00 -1.00 -0.70
##
## Value -20 -19 -17 -16 -14 -12 -11 -10 -9 -6 -4 -2 -1
## Frequency 1 1 2 1 1 1 1 3 1 2 2 1 2
## Proportion 0.05 0.05 0.10 0.05 0.05 0.05 0.05 0.15 0.05 0.10 0.10 0.05 0.10
##
## Value 5
## Frequency 1
## Proportion 0.05
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## waterneut
## n missing distinct Info Mean Gmd .05 .10
## 20 0 13 0.993 2.35 7.574 -13.0 -5.8
## .25 .50 .75 .90 .95
## 0.0 2.5 8.0 10.0 10.1
##
## Value -13 -5 -2 0 1 2 3 4 5 8 9 10 12
## Frequency 2 1 1 2 1 3 1 2 1 2 1 2 1
## Proportion 0.10 0.05 0.05 0.10 0.05 0.15 0.05 0.10 0.05 0.10 0.05 0.10 0.05
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
# Add subject identifier
thisData$subject <- as.factor(rownames(thisData))
# Melt the data into long format (melt() is from the reshape package)
library(reshape)
thisData.molten <- melt(thisData, id.vars = "subject")
# Create factors for 'drink' and 'image'
thisData.molten$drink <- factor(ifelse(grepl("beer", thisData.molten$variable), "beer",
ifelse(grepl("wine", thisData.molten$variable), "wine", "water")),
levels = c("beer", "wine", "water"))
thisData.molten$image <- factor(ifelse(grepl("pos", thisData.molten$variable), "sexy",
ifelse(grepl("neg", thisData.molten$variable), "corpse", "person in armchair")),
levels = c("sexy", "corpse", "person in armchair"))
# Fit the two-way repeated-measures model with ezANOVA (ez package)
library(ez)
fit5 <- ezANOVA(data = thisData.molten, dv = .(value), wid = .(subject), within = .(drink, image))
fit5
## $ANOVA
## Effect DFn DFd F p p<.05 ges
## 2 drink 2 38 5.105981 1.086293e-02 * 0.1158687
## 3 image 2 38 122.564825 2.680197e-17 * 0.5753191
## 4 drink:image 4 76 17.154922 4.589040e-10 * 0.1411741
##
## $`Mauchly's Test for Sphericity`
## Effect W p p<.05
## 2 drink 0.2672411 0.000006952302 *
## 3 image 0.6621013 0.024452301563 *
## 4 drink:image 0.5950440 0.435658665787
##
## $`Sphericity Corrections`
## Effect GGe p[GG] p[GG]<.05 HFe p[HF] p[HF]<.05
## 2 drink 0.5771143 2.976868e-02 * 0.5907442 2.881391e-02 *
## 3 image 0.7474407 1.757286e-13 * 0.7968420 3.142804e-14 *
## 4 drink:image 0.7983979 1.900249e-08 * 0.9785878 6.809640e-10 *
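A possible follow-up to the significant omnibus effects (a sketch, not part of the original analysis): Bonferroni-corrected paired t-tests on per-subject means, shown here for the image factor; the same pattern works for drink.
# Sketch: average over drinks within each subject, then run paired t-tests
# on the image factor with Bonferroni correction (assumes thisData.molten above)
imageMeans <- aggregate(value ~ subject + image, data = thisData.molten, mean)
with(imageMeans, pairwise.t.test(value, image, paired = TRUE, p.adjust.method = "bonferroni"))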
# Define colors
color <- c("lightyellow", "lightblue", "lightgreen")
# Calculate means and standard deviations
thisData.mean <- tapply(thisData.molten$value, thisData.molten$drink, mean)
thisData.sd <- tapply(thisData.molten$value, thisData.molten$drink, sd)
thisData.n <- tapply(thisData.molten$value, thisData.molten$drink, length)
# Create bar plot with confidence intervals
library(gplots)
barplot2(thisData.mean,
ylim = c(0, 20),
beside = TRUE,
main = "Field Ch.13 (p.494)",
ylab = "Mean Attitude",
xlab = "Drink",
col = color,
ci.l = thisData.mean - ((thisData.sd / sqrt(thisData.n)) * qt(0.975, df = (thisData.n - 1))),
ci.u = thisData.mean + ((thisData.sd / sqrt(thisData.n)) * qt(0.975, df = (thisData.n - 1))),
plot.ci = TRUE)
legend("topright", levels(thisData.molten$drink), fill = color)
# Create a dummy data set
thisData <- data.frame(
early = c(4, 10, 5, 8, 2),
mid = c(6, 9, 4, 6, 2),
late = c(2, 8, 3, 7, 2)
)
library(reshape)
# Add subject identifier
thisData$subject <- as.factor(rownames(thisData))
# Melt the data
thisData.molten <- melt(thisData, id.vars = "subject")
head(thisData.molten)
## subject variable value
## 1 1 early 4
## 2 2 early 10
## 3 3 early 5
## 4 4 early 8
## 5 5 early 2
## 6 1 mid 6
# Conduct one-way RM ANOVA
fit <- aov(value ~ variable + Error(subject / variable), data = thisData.molten)
summary(fit)
##
## Error: subject
## Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 4 92.4 23.1
##
## Error: subject:variable
## Df Sum Sq Mean Sq F value Pr(>F)
## variable 2 5.2 2.6 2.364 0.156
## Residuals 8 8.8 1.1
# Read data from local disk
thisData <- read.table("data/Pitt_Shoaf3.txt", header = TRUE)
# Aggregate the data (i.e., collapsing the 'overlap' factor)
aggregatedData <- aggregate(thisData$rt, list(thisData$subj, thisData$position), mean)
colnames(aggregatedData) <- c("subj", "position", "rt")
library(Hmisc)  # describe() here is Hmisc's version, which produces the summaries below
by(aggregatedData$rt, list(aggregatedData$position), describe)
## : early
## dd[x, ]
## n missing distinct Info Mean Gmd .05 .10
## 97 0 93 1 832.8 169 615.9 636.5
## .25 .50 .75 .90 .95
## 714.5 825.0 930.0 1044.8 1072.3
##
## lowest : 540.5 560.5 577.5 609.5 615.5 , highest: 1075.5 1107 1113.5 1125 1134
## ------------------------------------------------------------
## : late
## dd[x, ]
## n missing distinct Info Mean Gmd .05 .10
## 97 0 93 1 814 207.8 565.3 588.7
## .25 .50 .75 .90 .95
## 665.0 812.0 941.5 1048.0 1094.7
##
## lowest : 244 437.5 539.5 540 556.5 , highest: 1105.5 1126.5 1200.5 1238.5 1267
## ------------------------------------------------------------
## : mid
## dd[x, ]
## n missing distinct Info Mean Gmd .05 .10
## 97 0 93 1 799 176.1 583.5 595.6
## .25 .50 .75 .90 .95
## 692.5 810.5 898.5 994.3 1060.2
##
## lowest : 398 462 479.5 501.5 583.5 , highest: 1069 1080 1087 1090.5 1143.5
# Conduct one-way RM ANOVA with aov()
fit <- aov(rt ~ position + Error(subj / position), data = aggregatedData)
summary(fit)
##
## Error: subj
## Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 96 5093672 53059
##
## Error: subj:position
## Df Sum Sq Mean Sq F value Pr(>F)
## position 2 55829 27914 2.15 0.119
## Residuals 192 2492861 12984
# Conduct one-way RM ANOVA with lme()
library(nlme)
fit2 <- lme(rt ~ position, random = ~1 | subj, data = aggregatedData)
summary(fit2)
## Linear mixed-effects model fit by REML
## Data: aggregatedData
## AIC BIC logLik
## 3706.147 3724.462 -1848.074
##
## Random effects:
## Formula: ~1 | subj
## (Intercept) Residual
## StdDev: 115.579 113.9458
##
## Fixed effects: rt ~ position
## Value Std.Error DF t-value p-value
## (Intercept) 815.2629 13.503110 192 60.37594 0.0000
## position1 17.5670 9.446408 192 1.85965 0.0645
## position2 -1.2784 9.446408 192 -0.13533 0.8925
## Correlation:
## (Intr) postn1
## position1 0.0
## position2 0.0 -0.5
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -2.59527641 -0.56553153 -0.05408611 0.57800835 3.01507827
##
## Number of Observations: 291
## Number of Groups: 97
anova(fit2)
## numDF denDF F-value p-value
## (Intercept) 1 192 3645.254 <.0001
## position 2 192 2.150 0.1193
# Conduct one-way RM ANOVA with multivariate approach
crossTabulatedData <- xtabs(rt ~ subj + position, data = thisData) / 2  # xtabs sums the two 'overlap' trials per cell, so divide by 2 to get the mean RT
crossTabulatedData
## position
## subj early late mid
## AALA 1107.0 724.5 773.0
## AAMS 702.5 751.5 853.0
## ADAN 1064.5 609.0 959.0
## AING 884.0 998.5 804.0
## AITT 822.0 724.0 846.5
## ALER 1125.0 1105.5 870.5
## AMAM 703.0 680.0 619.0
## AMAN 710.5 827.0 686.5
## AOOD 908.0 893.5 805.5
## AOPE 609.5 579.0 501.5
## ASAO 793.0 1267.0 1087.0
## BECK 835.5 654.0 813.5
## BHEL 686.5 911.5 659.0
## BHER 1005.0 802.5 791.5
## BKIS 1021.5 862.0 695.0
## BREK 880.5 941.0 957.0
## BUHN 893.0 922.0 898.5
## CLET 943.5 539.5 708.5
## CMER 889.0 665.5 583.5
## CNAN 794.5 567.5 709.0
## COTS 836.5 826.0 850.5
## DDUM 616.0 735.5 650.5
## DIED 620.0 726.0 692.5
## DKEL 1032.5 975.0 990.5
## DLER 734.5 678.5 711.5
## DSER 1044.0 1048.0 783.0
## DYNE 809.5 1055.0 825.5
## EEVA 651.5 569.0 685.0
## ELES 759.5 844.5 878.5
## ELLO 793.5 806.5 891.0
## ESEN 770.0 828.0 978.5
## GEOR 627.5 825.5 583.5
## GICH 560.5 802.0 823.0
## GNKS 1075.5 974.0 942.0
## HEAD 895.5 894.0 957.5
## HLOC 855.0 1028.0 834.0
## HTON 930.0 1200.5 1022.0
## JAIN 853.0 812.0 868.0
## JBRY 929.5 946.0 868.0
## JDER 981.5 839.0 945.0
## JDYE 816.0 934.0 584.5
## JERT 842.0 590.5 682.5
## JGER 797.5 675.5 814.5
## JGGS 743.0 741.0 850.5
## JILL 540.5 640.5 634.0
## JKIN 959.0 692.0 728.0
## JLAR 617.0 647.5 590.0
## JMAN 917.5 810.5 712.0
## JOSH 883.5 901.5 968.5
## JRON 689.0 540.0 590.5
## JROS 1068.5 1092.0 1007.0
## JSEL 749.0 642.5 772.5
## JSER 816.5 866.5 849.0
## JSOM 930.5 981.5 1090.5
## JSON 642.5 244.0 398.0
## JTIL 817.5 947.5 782.5
## KDGE 714.5 910.5 904.5
## KERD 785.0 1029.5 954.0
## KETI 1113.5 1026.5 857.5
## KIMB 667.0 556.5 462.0
## KIRK 890.5 915.0 1000.0
## KNDY 969.0 842.5 951.5
## KOFF 1071.5 887.0 725.5
## KSON 882.5 942.0 842.5
## KTON 1048.0 859.5 837.0
## LARD 798.5 665.0 728.0
## LAUR 615.5 648.0 479.5
## LNIN 957.0 832.5 969.5
## LYNH 926.5 1064.0 1044.0
## MDER 877.0 437.5 810.5
## MINS 728.5 810.0 876.0
## MIOT 914.0 665.0 635.5
## MKLE 715.0 788.0 770.0
## MMBO 1022.0 783.0 1143.5
## MSON 1013.0 1048.0 820.0
## MTOK 847.5 980.0 1069.0
## NHAW 813.5 626.0 599.0
## NIHA 778.5 750.5 748.5
## PUNO 703.0 570.0 753.5
## REEN 755.5 586.0 635.0
## RLLS 825.0 801.5 754.0
## RNKE 1046.0 1238.5 946.5
## RUCH 836.5 899.5 865.0
## SALE 952.0 712.5 1080.0
## SANO 798.5 1015.0 816.0
## SARS 680.5 647.0 644.5
## SART 825.0 1082.5 909.0
## SEAD 1134.0 1015.0 805.0
## SETT 1023.0 652.5 860.0
## SIST 668.5 613.5 617.5
## SLER 666.0 630.0 696.5
## SLEY 660.5 820.0 693.0
## SSKI 844.0 941.5 1058.0
## STEP 761.0 664.5 804.5
## SVES 670.5 776.5 587.0
## TITH 626.5 712.5 674.0
## TSBY 577.5 1126.5 618.5
mlm1 <- lm(crossTabulatedData ~ 1)
position <- factor(colnames(crossTabulatedData))
library(car)
mlm1.aov <- Anova(mlm1, idata = data.frame(position), idesign = ~position, type = "III")
summary(mlm1.aov, multivariate = FALSE)
##
## Univariate Type III Repeated-Measures ANOVA Assuming Sphericity
##
## Sum Sq num Df Error SS den Df F value Pr(>F)
## (Intercept) 193414190 1 5093672 96 3645.26 <2e-16 ***
## position 55829 2 2492861 192 2.15 0.1193
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Mauchly Tests for Sphericity
##
## Test statistic p-value
## position 0.92105 0.020116
##
##
## Greenhouse-Geisser and Huynh-Feldt Corrections
## for Departure from Sphericity
##
## GG eps Pr(>F[GG])
## position 0.92683 0.1233
##
## HF eps Pr(>F[HF])
## position 0.9443014 0.1223673
# Pairwise t-tests
contrastD <- crossTabulatedData[, 1] - crossTabulatedData[, 2]
t.test(contrastD)
##
## One Sample t-test
##
## data: contrastD
## t = 1.0218, df = 96, p-value = 0.3095
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -17.76518 55.45591
## sample estimates:
## mean of x
## 18.84536
contrastD <- crossTabulatedData[, 1] - crossTabulatedData[, 3]
t.test(contrastD)
##
## One Sample t-test
##
## data: contrastD
## t = 2.3142, df = 96, p-value = 0.02279
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 4.816731 62.894609
## sample estimates:
## mean of x
## 33.85567
contrastD <- crossTabulatedData[, 2] - crossTabulatedData[, 3]
t.test(contrastD)
##
## One Sample t-test
##
## data: contrastD
## t = 0.95138, df = 96, p-value = 0.3438
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -16.30744 46.32806
## sample estimates:
## mean of x
## 15.01031
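The three uncorrected p-values above can be adjusted for multiple comparisons; a minimal sketch using p.adjust() with the Holm method (the choice of correction is mine, not from the original).
# Sketch: recompute the three pairwise contrast p-values and adjust them
pvals <- sapply(list(c(1, 2), c(1, 3), c(2, 3)), function(cols) {
  t.test(crossTabulatedData[, cols[1]] - crossTabulatedData[, cols[2]])$p.value
})
p.adjust(pvals, method = "holm")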
# Read data from local disk
thisData <- read.table("data/Pitt_Shoaf3.txt", header = TRUE)
# Cross-tabulate by two factors
crossTabulatedData <- xtabs(rt ~ subj + position + overlap, data = thisData)
# SPSS requires a strange data format for two-way RM ANOVA
# as.data.frame.matrix() keeps the wide subject-by-position layout
temp.threeOverlap <- as.data.frame.matrix(crossTabulatedData[, , 1])
temp.zeroOverlap <- as.data.frame.matrix(crossTabulatedData[, , 2])
colnames(temp.threeOverlap) <- paste(colnames(temp.threeOverlap), "3", sep = "_")
colnames(temp.zeroOverlap) <- paste(colnames(temp.zeroOverlap), "0", sep = "_")
data4SPSS <- data.frame(temp.threeOverlap, temp.zeroOverlap, check.names = TRUE, check.rows = TRUE)
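To actually hand this wide-format table to SPSS, it can be written to disk; a sketch (the file name is illustrative, not from the original).
# Sketch: export the wide-format data frame; subject IDs are kept as row names
write.csv(data4SPSS, "data/Pitt_Shoaf3_wide.csv", row.names = TRUE)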
library(Hmisc)
describe(aggregatedData)
## aggregatedData
##
## 3 Variables 291 Observations
## --------------------------------------------------------------------------------
## subj
## n missing distinct
## 291 0 97
##
## lowest : AALA AAMS ADAN AING AITT, highest: SSKI STEP SVES TITH TSBY
## --------------------------------------------------------------------------------
## position
## n missing distinct
## 291 0 3
##
## Value early late mid
## Frequency 97 97 97
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
## rt
## n missing distinct Info Mean Gmd .05 .10
## 291 0 260 1 815.3 184.1 578.2 616.0
## .25 .50 .75 .90 .95
## 690.5 816.0 930.2 1032.5 1077.8
##
## lowest : 244 398 437.5 462 479.5 , highest: 1134 1143.5 1200.5 1238.5 1267
## --------------------------------------------------------------------------------
# Conduct two-way factorial RM ANOVA with aov()
fit <- aov(rt ~ position * overlap + Error(subj / (position * overlap)), data = thisData)
summary(fit)
##
## Error: subj
## Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 96 10187344 106118
##
## Error: subj:position
## Df Sum Sq Mean Sq F value Pr(>F)
## position 2 111658 55829 2.15 0.119
## Residuals 192 4985721 25967
##
## Error: subj:overlap
## Df Sum Sq Mean Sq F value Pr(>F)
## overlap 1 19758 19758 1.269 0.263
## Residuals 96 1494378 15566
##
## Error: subj:position:overlap
## Df Sum Sq Mean Sq F value Pr(>F)
## position:overlap 2 284424 142212 10.12 0.0000662 ***
## Residuals 192 2697804 14051
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
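Since the position:overlap interaction is significant, a quick visualization helps; a sketch using interaction.plot() on the raw data (assumes the columns of thisData as read above).
# Sketch: mean RT by position, with one trace per overlap condition
with(thisData, interaction.plot(factor(position), factor(overlap), rt,
                                type = "b", pch = c(19, 15), col = c("orange", "blue"),
                                main = "Interaction plot: position by overlap"))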
library(foreign)
library(reshape)
# Load the data
thisData <- read.spss("data/LooksOrPersonalityGraphs.sav", to.data.frame = TRUE)
# Subset the data
thisData.subset <- subset(thisData, !is.na(gender), select = c(
"gender", "att_high", "av_high", "ug_high",
"att_some", "av_some", "ug_some",
"att_none", "av_none", "ug_none"
))
# Add subject identifier
thisData.subset$subject <- as.factor(rownames(thisData.subset))
# Melt the data
newData <- melt(thisData.subset, id.vars = c("subject", "gender"))
newData$subject <- factor(newData$subject)
# Create factors for 'charisma' and 'looks'
newData$charisma <- factor(
ifelse(grepl("_high", newData$variable), "high",
ifelse(grepl("_some", newData$variable), "some", "dullard")),
levels = c("high", "some", "dullard")
)
newData$looks <- factor(
ifelse(grepl("att_", newData$variable), "attractive",
ifelse(grepl("av_", newData$variable), "average", "ugly")),
levels = c("attractive", "average", "ugly")
)
# Rename value column
colnames(newData)[4] <- "rate"
# Incorrect model (ignores repeated measures)
model1 <- aov(rate ~ charisma * looks, data = newData)
summary(model1)
## Df Sum Sq Mean Sq F value Pr(>F)
## charisma 2 23234 11617 124.59 < 2e-16 ***
## looks 2 20780 10390 111.43 < 2e-16 ***
## charisma:looks 4 4055 1014 10.87 7.06e-08 ***
## Residuals 171 15944 93
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Correct mixed-design ANOVA
model2 <- aov(rate ~ gender * charisma * looks + Error(subject / (charisma * looks)), data = newData)
summary(model2)
##
## Error: subject
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 0.2 0.20 0.005 0.946
## Residuals 18 760.2 42.23
##
## Error: subject:charisma
## Df Sum Sq Mean Sq F value Pr(>F)
## charisma 2 23234 11617 328.25 < 2e-16 ***
## gender:charisma 2 4420 2210 62.45 1.97e-12 ***
## Residuals 36 1274 35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Error: subject:looks
## Df Sum Sq Mean Sq F value Pr(>F)
## looks 2 20780 10390 423.73 < 2e-16 ***
## gender:looks 2 3944 1972 80.43 5.23e-14 ***
## Residuals 36 883 25
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Error: subject:charisma:looks
## Df Sum Sq Mean Sq F value Pr(>F)
## charisma:looks 4 4055 1013.8 36.63 < 2e-16 ***
## gender:charisma:looks 4 2670 667.4 24.12 1.11e-12 ***
## Residuals 72 1993 27.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(gplots)
# Define sample size
num <- length(unique(newData$subject))
# Bar plot for gender
means_gender <- tapply(newData$rate, newData$gender, mean)
sds_gender <- tapply(newData$rate, newData$gender, sd)
barplot2(means_gender,
beside = TRUE,
main = "Bar plots with 95% CI (not adjusted for RM)",
col = "lightgreen",
ylab = "Mean Attitude",
xlab = "Gender",
ci.l = means_gender - ((sds_gender / sqrt(num)) * qt(0.975, df = (num - 1))),
ci.u = means_gender + ((sds_gender / sqrt(num)) * qt(0.975, df = (num - 1))),
plot.ci = TRUE)
# Interaction plots
interaction.plot(newData$looks, newData$gender, newData$rate, main = "Interaction plot: looks by gender", type = "b", pch = c(19, 15), col = c("orange", "blue"))
interaction.plot(newData$charisma, newData$gender, newData$rate, main = "Interaction plot: charisma by gender", type = "b", pch = c(19, 15), col = c("orange", "blue"))
interaction.plot(newData$looks, newData$charisma, newData$rate, main = "Interaction plot: looks by charisma", type = "b", pch = c(19, 15, 17), col = c("orange", "blue", "red"))
# Mixed-design ANOVA with the ez package
# Read data from local disk
thisData <- read.table("data/spanengRT.txt", header = TRUE)
thisData$listener <- as.factor(thisData$listener)
# Subset the data
subsetData <- subset(thisData, pair %in% c("d_r", "d_th", "r_th"))
library(gdata)
subsetData <- drop.levels(subsetData)
#library(ez)
## Conduct mixed-design ANOVA
#model.ezAnova <- ezANOVA(
# data = thisData,
# dv = .(MedianRT),
# wid = .(listener),
# within = .(pair),
# between = .(group)
#)
#model.ezAnova
Generalized linear models are fitted with glm(); the family argument selects the error distribution and its default link function:
| Family | Type of Regression | Default Link Function |
|---|---|---|
| binomial | Logistic regression | binomial("logit") |
| gaussian | Gaussian | gaussian("identity") |
| Gamma | Gamma distribution | Gamma("inverse") |
| inverse.gaussian | Inverse Gaussian | inverse.gaussian("1/mu^2") |
| poisson | Poisson | poisson("log") |
| quasibinomial | Quasi-binomial | quasibinomial("logit") |
| quasipoisson | Quasi-Poisson | quasipoisson("log") |
library(Hmisc)
# Read the data
thisData <- read.csv("data/DDRASSTR.txt", sep = "\t")
# Convert variables to factors
thisData$strnumbers <- factor(thisData$strnumbers)
thisData$style <- factor(thisData$style)
thisData$region <- factor(thisData$region)
thisData$banknumbers <- factor(thisData$banknumbers)
thisData$classnumbers <- factor(thisData$classnumbers)
thisData$participant.number <- factor(thisData$participant.number)
summary(thisData)
## str strnumbers emphatic gender style
## Length:240 0:183 Length:240 Length:240 0:120
## Class :character 1: 57 Class :character Class :character 1:120
## Mode :character Mode :character Mode :character
##
##
##
##
## ageletters age region Mall banknumbers
## Length:240 Length:240 1:81 Length:240 2:34
## Class :character Class :character 2:80 Class :character 3:38
## Mode :character Mode :character 3:79 Mode :character 4:30
## 5:80
## 6:58
##
##
## bank store participant.number job
## Length:240 Length:240 1 : 2 Length:240
## Class :character Class :character 2 : 2 Class :character
## Mode :character Mode :character 3 : 2 Mode :character
## 4 : 2
## 5 : 2
## 6 : 2
## (Other):228
## classnumbers class
## 1:73 Length:240
## 2:88 Class :character
## 3:79 Mode :character
##
##
##
##
describe(thisData)
## thisData
##
## 16 Variables 240 Observations
## --------------------------------------------------------------------------------
## str
## n missing distinct
## 240 0 2
##
## Value shtr str
## Frequency 57 183
## Proportion 0.238 0.762
## --------------------------------------------------------------------------------
## strnumbers
## n missing distinct
## 240 0 2
##
## Value 0 1
## Frequency 183 57
## Proportion 0.762 0.238
## --------------------------------------------------------------------------------
## emphatic
## n missing distinct
## 240 0 2
##
## Value less more
## Frequency 120 120
## Proportion 0.5 0.5
## --------------------------------------------------------------------------------
## gender
## n missing distinct
## 240 0 2
##
## Value m w
## Frequency 120 120
## Proportion 0.5 0.5
## --------------------------------------------------------------------------------
## style
## n missing distinct
## 240 0 2
##
## Value 0 1
## Frequency 120 120
## Proportion 0.5 0.5
## --------------------------------------------------------------------------------
## ageletters
## n missing distinct
## 240 0 3
##
## Value a b c
## Frequency 80 80 80
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
## age
## n missing distinct
## 240 0 3
##
## Value mid old young
## Frequency 80 80 80
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
## region
## n missing distinct
## 240 0 3
##
## Value 1 2 3
## Frequency 81 80 79
## Proportion 0.338 0.333 0.329
## --------------------------------------------------------------------------------
## Mall
## n missing distinct
## 240 0 3
##
## Value CityCenter Easton Polaris
## Frequency 79 80 81
## Proportion 0.329 0.333 0.338
## --------------------------------------------------------------------------------
## banknumbers
## n missing distinct
## 240 0 5
##
## Value 2 3 4 5 6
## Frequency 34 38 30 80 58
## Proportion 0.142 0.158 0.125 0.333 0.242
## --------------------------------------------------------------------------------
## bank
## n missing distinct
## 240 0 5
##
## Value LMC MMC MWC UMC UWC
## Frequency 30 80 34 58 38
## Proportion 0.125 0.333 0.142 0.242 0.158
## --------------------------------------------------------------------------------
## store
## n missing distinct
## 240 0 35
##
## lowest : aeagle anntaylor banana best buy brepub
## highest: sbarro sears target udf walmart
## --------------------------------------------------------------------------------
## participant.number
## n missing distinct
## 240 0 120
##
## lowest : 1 2 3 4 5 , highest: 121 122 123 124 125
## --------------------------------------------------------------------------------
## job
## n missing distinct
## 240 0 3
##
## Value c f g
## Frequency 236 2 2
## Proportion 0.983 0.008 0.008
## --------------------------------------------------------------------------------
## classnumbers
## n missing distinct
## 240 0 3
##
## Value 1 2 3
## Frequency 73 88 79
## Proportion 0.304 0.367 0.329
## --------------------------------------------------------------------------------
## class
## n missing distinct
## 240 0 3
##
## Value LMC UMC WC
## Frequency 88 79 73
## Proportion 0.367 0.329 0.304
## --------------------------------------------------------------------------------
# Relevel 'emphatic' variable
#thisData$emphatic <- relevel(thisData$emphatic, ref = "more")
# Fit logistic regression model
#fit1 <- glm(str ~ emphatic, data = thisData, family = binomial("logit"))
#summary(fit1)
# Exponentiate coefficients to get odds ratios
#exp(coef(fit1))
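A runnable sketch of the commented-out model above, assuming thisData still holds the DDRASSTR data summarized earlier; the factor conversions are mine.
# Sketch: make the character columns factors, set "more" emphatic as the reference
# level, and fit the logistic regression; glm() models P(str) because "str" is the
# second factor level after "shtr"
thisData$str <- factor(thisData$str)
thisData$emphatic <- relevel(factor(thisData$emphatic), ref = "more")
fit1 <- glm(str ~ emphatic, data = thisData, family = binomial("logit"))
summary(fit1)
exp(coef(fit1))  # odds ratio for producing "str" rather than "shtr"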
x <- c(1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30)
y <- c(0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29)
wilcox.test(x, y, paired = TRUE, alternative = "greater")
##
## Wilcoxon signed rank exact test
##
## data: x and y
## V = 40, p-value = 0.01953
## alternative hypothesis: true location shift is greater than 0
# Binomial test example
binom.test(x = 52, n = 156, p = 0.25)
##
## Exact binomial test
##
## data: 52 and 156
## number of successes = 52, number of trials = 156, p-value = 0.02043
## alternative hypothesis: true probability of success is not equal to 0.25
## 95 percent confidence interval:
## 0.2599810 0.4131553
## sample estimates:
## probability of success
## 0.3333333
Assumptions of the chi-square test: observations are independent, each observation falls into exactly one cell, and expected cell counts are reasonably large (a common rule of thumb is at least 5 per cell); with small expected counts, Fisher's exact test shown below is preferred.
# Chi-square test example
x <- matrix(c(16, 4, 8, 12), nrow = 2, byrow = TRUE)
chisq.test(x, correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 6.6667, df = 1, p-value = 0.009823
TeaTasting <- matrix(c(3, 1, 1, 3), nrow = 2,
dimnames = list(Guess = c("Milk", "Tea"), Truth = c("Milk", "Tea")))
fisher.test(TeaTasting, alternative = "greater")
##
## Fisher's Exact Test for Count Data
##
## data: TeaTasting
## p-value = 0.2429
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 0.3135693 Inf
## sample estimates:
## odds ratio
## 6.408309
uncooperativeness <- c(6,7,10,8,8,9,8,10,6,8,7,3,7,1)
treatment <- factor(rep(1:3, c(6, 4, 4)), labels = c("Method A", "Method B", "Method C"))
uncooperativenessData <- data.frame(uncooperativeness, treatment)
kruskal.test(uncooperativeness ~ treatment, data = uncooperativenessData)
##
## Kruskal-Wallis rank sum test
##
## data: uncooperativeness by treatment
## Kruskal-Wallis chi-squared = 4.664, df = 2, p-value = 0.0971
pairwise.t.test(uncooperativeness, treatment, p.adj = "bonferroni")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: uncooperativeness and treatment
##
## Method A Method B
## Method B 1.000 -
## Method C 0.064 0.097
##
## P value adjustment method: bonferroni
pairwise.t.test(uncooperativeness, treatment, p.adj = "holm")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: uncooperativeness and treatment
##
## Method A Method B
## Method B 1.000 -
## Method C 0.064 0.065
##
## P value adjustment method: holm
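As a nonparametric alternative to the pairwise t-tests above (a sketch, not in the original), pairwise Wilcoxon rank-sum tests can be used with the same correction methods.
# Sketch: pairwise Wilcoxon tests with Holm correction; the tied scores mean
# exact p-values are unavailable, so warnings are expected
pairwise.wilcox.test(uncooperativeness, treatment, p.adjust.method = "holm")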