Das Beispiel aus sheet_manova.html wird um eine zweite kategoriale erklärende Variable erweitert. Bei der Erweiterung des Datensatzes handelt es sich um generierte Daten, nur zu Demonstrationszwecken. Die Messwiederholungsstruktur wird nicht berücksichtigt.
Verhältnis von erklärter Varianz zu Residualvarianz wird ausgedrückt
in der Matrix \(HE^{-1}\). Sie entsteht
durch die Multiplikation der Model-SSCP mit der Inversen der
Residual-SSCP (entspricht Division). (SSCP: Sum of Squares and Cross
Products Matrix). \(HE^{-1} = H \cdot
E^{-1}\) In der Diagonale stehen die Eigenwerte, die konzeptuell
ähnlich den F-Ratios aufgefasst werden können. Die verschiedenen Tests,
Pillai-Bartlett, Hotellings \(T^2\), Wilks' Lambda und Roys largest root
können in R’s manova()
angefordert werden durch den
Parameter
test = c("Pillai", "Wilks", "Hotelling-Lawley", "Roy")
# Load the tidyverse meta-package (ggplot2, dplyr, readr, tidyr, ...)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.6 ✔ dplyr 1.0.8
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Read the tab-separated OCD example data: group (chr), actions, thoughts (dbl)
d2 <- read_delim("https://md.psych.bio.uni-goettingen.de/mv/data/div/ocd_data.dat", delim = "\t")
## Rows: 30 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): group
## dbl (2): actions, thoughts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# mark the original observations as time point 2 (followup)
d2$t <- 2
# we double the data and create a second (simulated) data collection for time point 1
d1 <- d2
d1$t <- 1
# fixed seed so the simulated noise is reproducible
set.seed(123)
d1$actions <- d1$actions + round(rnorm(30) * 2) - 1
d1$thoughts <- d1$thoughts + round(rnorm(30) * 3) -2
# stack both time points into one long data frame (60 rows)
dd <- rbind(d1, d2)
# we set factors (group gets short labels, t gets meaningful labels)
dd$group <- dd$group %>%
factor(levels = c("No Treatment Control", "BT", "CBT"),
labels = c("NT", "BT", "CBT"))
dd$t <- dd$t %>% factor(levels = c('1', '2'), labels=c('post', 'followup'))
# we create a ggplot object (actions vs thoughts, colored by time point)
baseplot <- ggplot(dd, aes(x = actions, y = thoughts, group=t, color=t))
# we add points
(scatterplot <- baseplot + geom_point())
# we add regression lines
(lineplot <- scatterplot + geom_smooth(method = "lm"))
## `geom_smooth()` using formula 'y ~ x'
# we do it groupwise (one facet per treatment group)
(groupplot <- lineplot + facet_wrap(~ group))
## `geom_smooth()` using formula 'y ~ x'
# we leave standard reference coding, see below for other possibilities for contrasts
# reference groups are "NT" for group and "post" for t
# we build the outcome matrix (two reaction variables bound column-wise)
outcome <- cbind(dd$actions, dd$thoughts)
# we fit a MANOVA model with group as single predictor
m.1 <- manova(outcome ~ group, data = dd)
summary(m.1)
## Df Pillai approx F num Df den Df Pr(>F)
## group 2 0.093441 1.3968 4 114 0.2396
## Residuals 57
# same summary, but also test the intercept (grand mean vector != 0)
summary(m.1, intercept = TRUE)
## Df Pillai approx F num Df den Df Pr(>F)
## (Intercept) 1 0.96580 790.69 2 56 <2e-16 ***
## group 2 0.09344 1.40 4 114 0.2396
## Residuals 57
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# univariate ANOVA follow-ups, one per response column
summary.aov(m.1)
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 12.033 6.0167 1.5663 0.2177
## Residuals 57 218.950 3.8412
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 26.23 13.1167 1.4211 0.2499
## Residuals 57 526.10 9.2298
# univariate regression view: one lm() per response, with coefficients
summary(lm(m.1))
## Response Y1 :
##
## Call:
## lm(formula = Y1 ~ group)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.20 -1.40 -0.20 1.55 5.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.2000 0.4382 9.584 1.75e-13 ***
## groupBT -0.8000 0.6198 -1.291 0.202
## groupCBT 0.2500 0.6198 0.403 0.688
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.96 on 57 degrees of freedom
## Multiple R-squared: 0.0521, Adjusted R-squared: 0.01884
## F-statistic: 1.566 on 2 and 57 DF, p-value: 0.2177
##
##
## Response Y2 :
##
## Call:
## lm(formula = Y2 ~ group)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.35 -1.35 -0.25 1.65 8.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.3500 0.6793 21.124 <2e-16 ***
## groupBT -0.1000 0.9607 -0.104 0.917
## groupCBT -1.4500 0.9607 -1.509 0.137
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.038 on 57 degrees of freedom
## Multiple R-squared: 0.0475, Adjusted R-squared: 0.01407
## F-statistic: 1.421 on 2 and 57 DF, p-value: 0.2499
# we include t (time point) as second predictive variable (additive model, no interaction)
m.2 <- manova(outcome ~ group + t, data = dd)
summary(m.2)
## Df Pillai approx F num Df den Df Pr(>F)
## group 2 0.096809 1.4243 4 112 0.23056
## t 1 0.141671 4.5390 2 55 0.01498 *
## Residuals 56
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# same, including the intercept test
summary(m.2, intercept=TRUE)
## Df Pillai approx F num Df den Df Pr(>F)
## (Intercept) 1 0.96989 885.77 2 55 < 2e-16 ***
## group 2 0.09681 1.42 4 112 0.23056
## t 1 0.14167 4.54 2 55 0.01498 *
## Residuals 56
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# univariate ANOVA follow-ups per response
summary(aov(m.2))
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 12.033 6.0167 1.6603 0.19930
## t 1 16.017 16.0167 4.4198 0.04003 *
## Residuals 56 202.933 3.6238
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 26.23 13.1167 1.4788 0.23667
## t 1 29.40 29.4000 3.3147 0.07401 .
## Residuals 56 496.70 8.8696
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# univariate regression view with coefficients per response
summary(lm(m.2))
## Response Y1 :
##
## Call:
## lm(formula = Y1 ~ group + t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6833 -0.9667 0.0667 1.0833 6.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6833 0.4915 7.494 5.28e-10 ***
## groupBT -0.8000 0.6020 -1.329 0.189
## groupCBT 0.2500 0.6020 0.415 0.680
## tfollowup 1.0333 0.4915 2.102 0.040 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.904 on 56 degrees of freedom
## Multiple R-squared: 0.1214, Adjusted R-squared: 0.07437
## F-statistic: 2.58 on 3 and 56 DF, p-value: 0.06254
##
##
## Response Y2 :
##
## Call:
## lm(formula = Y2 ~ group + t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.650 -1.650 -0.125 1.800 9.350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.6500 0.7690 17.751 <2e-16 ***
## groupBT -0.1000 0.9418 -0.106 0.916
## groupCBT -1.4500 0.9418 -1.540 0.129
## tfollowup 1.4000 0.7690 1.821 0.074 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.978 on 56 degrees of freedom
## Multiple R-squared: 0.1007, Adjusted R-squared: 0.05255
## F-statistic: 2.091 on 3 and 56 DF, p-value: 0.1117
# we include the group-by-time interaction term (full factorial model)
m.3 <- manova(outcome ~ group*t, data = dd)
summary(m.3)
## Df Pillai approx F num Df den Df Pr(>F)
## group 2 0.097687 1.3865 4 108 0.24346
## t 1 0.142493 4.4035 2 53 0.01701 *
## group:t 2 0.016511 0.2248 4 108 0.92405
## Residuals 54
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# same, including the intercept test
summary(m.3, intercept=TRUE)
## Df Pillai approx F num Df den Df Pr(>F)
## (Intercept) 1 0.97000 856.97 2 53 < 2e-16 ***
## group 2 0.09769 1.39 4 108 0.24346
## t 1 0.14249 4.40 2 53 0.01701 *
## group:t 2 0.01651 0.22 4 108 0.92405
## Residuals 54
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# we can use other test statistics via test = c("Pillai", "Wilks", "Hotelling-Lawley", "Roy")
summary(m.3, intercept=TRUE, test="Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## (Intercept) 1 0.03000 856.97 2 53 < 2e-16 ***
## group 2 0.90407 1.37 4 106 0.24911
## t 1 0.85751 4.40 2 53 0.01701 *
## group:t 2 0.98353 0.22 4 106 0.92624
## Residuals 54
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# univariate ANOVA follow-ups per response
summary(aov(m.3))
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 12.033 6.0167 1.6221 0.20696
## t 1 16.017 16.0167 4.3180 0.04248 *
## group:t 2 2.633 1.3167 0.3550 0.70282
## Residuals 54 200.300 3.7093
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 26.23 13.1167 1.4321 0.2477
## t 1 29.40 29.4000 3.2099 0.0788 .
## group:t 2 2.10 1.0500 0.1146 0.8919
## Residuals 54 494.60 9.1593
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# univariate regression view with coefficients per response
summary(lm(m.3))
## Response Y1 :
##
## Call:
## lm(formula = Y1 ~ group * t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.400 -1.025 0.000 1.025 5.900
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.4000 0.6090 5.583 7.89e-07 ***
## groupBT -0.3000 0.8613 -0.348 0.7290
## groupCBT 0.6000 0.8613 0.697 0.4890
## tfollowup 1.6000 0.8613 1.858 0.0687 .
## groupBT:tfollowup -1.0000 1.2181 -0.821 0.4153
## groupCBT:tfollowup -0.7000 1.2181 -0.575 0.5679
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.926 on 54 degrees of freedom
## Multiple R-squared: 0.1328, Adjusted R-squared: 0.05255
## F-statistic: 1.654 on 5 and 54 DF, p-value: 0.1615
##
##
## Response Y2 :
##
## Call:
## lm(formula = Y2 ~ group * t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7 -1.7 -0.3 1.7 9.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.700 0.957 14.315 <2e-16 ***
## groupBT -0.400 1.353 -0.296 0.769
## groupCBT -1.300 1.353 -0.961 0.341
## tfollowup 1.300 1.353 0.961 0.341
## groupBT:tfollowup 0.600 1.914 0.313 0.755
## groupCBT:tfollowup -0.300 1.914 -0.157 0.876
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.026 on 54 degrees of freedom
## Multiple R-squared: 0.1045, Adjusted R-squared: 0.02161
## F-statistic: 1.261 on 5 and 54 DF, p-value: 0.2943
# package "car" has a command "Anova()" (capital "A") which lets us choose the type of Sums of Squares (relevant for unbalanced data)
require(car)
## Lade nötiges Paket: car
## Lade nötiges Paket: carData
##
## Attache Paket: 'car'
##
## Das folgende Objekt ist maskiert 'package:dplyr':
##
## recode
##
## Das folgende Objekt ist maskiert 'package:purrr':
##
## some
# Bug fix: Anova()'s argument is `type` (lowercase). The original call used
# `Type='III'`, which was swallowed by `...` and silently ignored, so the
# default Type II tests were computed -- that is why the output below is
# labelled "Type II MANOVA Tests". With the corrected argument the header
# will read "Type III MANOVA Tests".
# NOTE(review): for meaningful Type III tests the factors should use
# sum-to-zero contrasts, e.g. options(contrasts = c("contr.sum", "contr.poly")).
car::Anova(m.3, type = 'III') # 'III' = three uppercase i
##
## Type II MANOVA Tests: Pillai test statistic
## Df test stat approx F num Df den Df Pr(>F)
## group 2 0.097687 1.3865 4 108 0.24346
## t 1 0.142493 4.4035 2 53 0.01701 *
## group:t 2 0.016511 0.2248 4 108 0.92405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# we could do a follow-up discriminant analysis
# see below for more details about DA ...
# require(MASS)
# dfa <- MASS::lda(group ~ actions + thoughts, data=dd)
# dfa
# we take a look at the contrasts (univariate model: actions by group)
out1.1 <- lm(actions ~ group, data = dd)
summary(out1.1)
##
## Call:
## lm(formula = actions ~ group, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.20 -1.40 -0.20 1.55 5.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.2000 0.4382 9.584 1.75e-13 ***
## groupBT -0.8000 0.6198 -1.291 0.202
## groupCBT 0.2500 0.6198 0.403 0.688
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.96 on 57 degrees of freedom
## Multiple R-squared: 0.0521, Adjusted R-squared: 0.01884
## F-statistic: 1.566 on 2 and 57 DF, p-value: 0.2177
# actions: additive model with group and time point
out1.2 <- lm(actions ~ group + t, data = dd)
summary(out1.2)
##
## Call:
## lm(formula = actions ~ group + t, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6833 -0.9667 0.0667 1.0833 6.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6833 0.4915 7.494 5.28e-10 ***
## groupBT -0.8000 0.6020 -1.329 0.189
## groupCBT 0.2500 0.6020 0.415 0.680
## tfollowup 1.0333 0.4915 2.102 0.040 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.904 on 56 degrees of freedom
## Multiple R-squared: 0.1214, Adjusted R-squared: 0.07437
## F-statistic: 2.58 on 3 and 56 DF, p-value: 0.06254
# actions: full factorial model with interaction
out1.3 <- lm(actions ~ group * t, data = dd)
summary(out1.3)
##
## Call:
## lm(formula = actions ~ group * t, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.400 -1.025 0.000 1.025 5.900
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.4000 0.6090 5.583 7.89e-07 ***
## groupBT -0.3000 0.8613 -0.348 0.7290
## groupCBT 0.6000 0.8613 0.697 0.4890
## tfollowup 1.6000 0.8613 1.858 0.0687 .
## groupBT:tfollowup -1.0000 1.2181 -0.821 0.4153
## groupCBT:tfollowup -0.7000 1.2181 -0.575 0.5679
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.926 on 54 degrees of freedom
## Multiple R-squared: 0.1328, Adjusted R-squared: 0.05255
## F-statistic: 1.654 on 5 and 54 DF, p-value: 0.1615
# thoughts: group only
out2.1 <- lm(thoughts ~ group, data = dd)
summary(out2.1)
##
## Call:
## lm(formula = thoughts ~ group, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.35 -1.35 -0.25 1.65 8.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.3500 0.6793 21.124 <2e-16 ***
## groupBT -0.1000 0.9607 -0.104 0.917
## groupCBT -1.4500 0.9607 -1.509 0.137
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.038 on 57 degrees of freedom
## Multiple R-squared: 0.0475, Adjusted R-squared: 0.01407
## F-statistic: 1.421 on 2 and 57 DF, p-value: 0.2499
# thoughts: additive model
out2.2 <- lm(thoughts ~ group + t, data = dd)
summary(out2.2)
##
## Call:
## lm(formula = thoughts ~ group + t, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.650 -1.650 -0.125 1.800 9.350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.6500 0.7690 17.751 <2e-16 ***
## groupBT -0.1000 0.9418 -0.106 0.916
## groupCBT -1.4500 0.9418 -1.540 0.129
## tfollowup 1.4000 0.7690 1.821 0.074 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.978 on 56 degrees of freedom
## Multiple R-squared: 0.1007, Adjusted R-squared: 0.05255
## F-statistic: 2.091 on 3 and 56 DF, p-value: 0.1117
# thoughts: full factorial model with interaction
out2.3 <- lm(thoughts ~ group * t, data = dd)
summary(out2.3)
##
## Call:
## lm(formula = thoughts ~ group * t, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7 -1.7 -0.3 1.7 9.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.700 0.957 14.315 <2e-16 ***
## groupBT -0.400 1.353 -0.296 0.769
## groupCBT -1.300 1.353 -0.961 0.341
## tfollowup 1.300 1.353 0.961 0.341
## groupBT:tfollowup 0.600 1.914 0.313 0.755
## groupCBT:tfollowup -0.300 1.914 -0.157 0.876
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.026 on 54 degrees of freedom
## Multiple R-squared: 0.1045, Adjusted R-squared: 0.02161
## F-statistic: 1.261 on 5 and 54 DF, p-value: 0.2943
# we play to understand parameters here
# we look for all combinations of explanatory variables and store one observation of each
# (row names of the unique design-matrix rows serve as index into the data)
t.o <- as.numeric(rownames(unique(model.matrix(m.2))))
# for ease of understanding we use m.2, remember:
# m.2 <- manova(outcome ~ group + t, data = dd)
# columns: observed actions, observed thoughts, fitted values, design matrix (dummies)
head(cbind(dd$actions, dd$thoughts, m.2$fitted.values, model.matrix(m.2)))
## (Intercept) groupBT groupCBT tfollowup
## 1 3 13 3.933333 12.2 1 0 1 0
## 2 4 8 3.933333 12.2 1 0 1 0
## 3 6 17 3.933333 12.2 1 0 1 0
## 4 3 14 3.933333 12.2 1 0 1 0
## 5 4 12 3.933333 12.2 1 0 1 0
## 6 5 14 3.933333 12.2 1 0 1 0
# one representative row per predictor-level combination
cbind(dd$actions, dd$thoughts, m.2$fitted.values, model.matrix(m.2))[t.o,]
## (Intercept) groupBT groupCBT tfollowup
## 1 3 13 3.933333 12.20 1 0 1 0
## 11 5 10 2.883333 13.55 1 1 0 0
## 21 1 12 3.683333 13.65 1 0 0 0
## 31 5 14 4.966667 13.60 1 0 1 1
## 41 4 14 3.916667 14.95 1 1 0 1
## 51 4 13 4.716667 15.05 1 0 0 1
# coefficients: one column per response variable
m.2$coefficients
## [,1] [,2]
## (Intercept) 3.683333 13.65
## groupBT -0.800000 -0.10
## groupCBT 0.250000 -1.45
## tfollowup 1.033333 1.40
# for reaction variable actions we get fitted values by calculating the linear combination of explanatory variables (dummies) and first column of coefficients
p.actions <- model.matrix(m.2) %*% m.2$coefficients[,1]
cbind(dd$actions, dd$thoughts, m.2$fitted.values, model.matrix(m.2), p.actions)[t.o,]
## (Intercept) groupBT groupCBT tfollowup
## 1 3 13 3.933333 12.20 1 0 1 0 3.933333
## 11 5 10 2.883333 13.55 1 1 0 0 2.883333
## 21 1 12 3.683333 13.65 1 0 0 0 3.683333
## 31 5 14 4.966667 13.60 1 0 1 1 4.966667
## 41 4 14 3.916667 14.95 1 1 0 1 3.916667
## 51 4 13 4.716667 15.05 1 0 0 1 4.716667
# for every observation with the same combination of predictive subgroup levels we get the same fitted values in the reaction variables
# we estimate a reaction plane on base of the subgroup combination of our observations
# re-read and re-prepare the data (same steps as at the top of the document)
d2 <- read_delim("https://md.psych.bio.uni-goettingen.de/mv/data/div/ocd_data.dat", delim = "\t")
## Rows: 30 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): group
## dbl (2): actions, thoughts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d2$t <- 2
# we double the data and create a second data collection
d1 <- d2
d1$t <- 1
set.seed(123)
d1$actions <- d1$actions + round(rnorm(30) * 2) - 1
d1$thoughts <- d1$thoughts + round(rnorm(30) * 3) -2
dd <- rbind(d1, d2)
# we set factors
dd$group <- dd$group %>%
factor(levels = c("No Treatment Control", "BT", "CBT"),
labels = c("NT", "BT", "CBT"))
dd$t <- dd$t %>% factor(levels = c('1', '2'), labels=c('post', 'followup'))
# we set contrasts manually (dummy coding with NT as reference)
.bt.vs.nt <- c(0,1,0) # contrast 1: BT vs. NT
.cbt.vs.nt <- c(0,0,1) # contrast 2: CBT vs. NT
# and bind the contrasts to the appropriate factor variables
contrasts(dd$group) <- cbind(.bt.vs.nt, .cbt.vs.nt)
c.manual <- lm(actions ~ group, data = dd)
unique(model.matrix(c.manual))
## (Intercept) group.bt.vs.nt group.cbt.vs.nt
## 1 1 0 1
## 11 1 1 0
## 21 1 0 0
# built-in treatment (dummy) coding gives the same design matrix
contrasts(dd$group) <- contr.treatment
c.treat <- lm(actions ~ group, data = dd)
unique(model.matrix(c.treat))
## (Intercept) group2 group3
## 1 1 0 1
## 11 1 1 0
## 21 1 0 0
# treatment coding: intercept = reference-group mean, slopes = group differences
summary(c.treat)
##
## Call:
## lm(formula = actions ~ group, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.20 -1.40 -0.20 1.55 5.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.2000 0.4382 9.584 1.75e-13 ***
## group2 -0.8000 0.6198 -1.291 0.202
## group3 0.2500 0.6198 0.403 0.688
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.96 on 57 degrees of freedom
## Multiple R-squared: 0.0521, Adjusted R-squared: 0.01884
## F-statistic: 1.566 on 2 and 57 DF, p-value: 0.2177
# sum-to-zero (effect) coding: intercept = grand mean
contrasts(dd$group) <- contr.sum
c.sum <- lm(actions ~ group, data = dd)
unique(model.matrix(c.sum))
## (Intercept) group1 group2
## 1 1 -1 -1
## 11 1 0 1
## 21 1 1 0
summary(c.sum)
##
## Call:
## lm(formula = actions ~ group, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.20 -1.40 -0.20 1.55 5.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0167 0.2530 15.875 <2e-16 ***
## group1 0.1833 0.3578 0.512 0.6104
## group2 -0.6167 0.3578 -1.723 0.0902 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.96 on 57 degrees of freedom
## Multiple R-squared: 0.0521, Adjusted R-squared: 0.01884
## F-statistic: 1.566 on 2 and 57 DF, p-value: 0.2177
# orthogonal polynomial coding: linear (.L) and quadratic (.Q) trend contrasts
contrasts(dd$group) <- contr.poly
c.poly <- lm(actions ~ group, data = dd)
unique(model.matrix(c.poly))
## (Intercept) group.L group.Q
## 1 1 7.071068e-01 0.4082483
## 11 1 -7.850462e-17 -0.8164966
## 21 1 -7.071068e-01 0.4082483
summary(c.poly)
##
## Call:
## lm(formula = actions ~ group, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.20 -1.40 -0.20 1.55 5.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0167 0.2530 15.875 <2e-16 ***
## group.L 0.1768 0.4382 0.403 0.6882
## group.Q 0.7553 0.4382 1.723 0.0902 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.96 on 57 degrees of freedom
## Multiple R-squared: 0.0521, Adjusted R-squared: 0.01884
## F-statistic: 1.566 on 2 and 57 DF, p-value: 0.2177
# Helmert coding: each level vs. the mean of the previous levels
contrasts(dd$group) <- contr.helmert
c.helmert <- lm(actions ~ group, data = dd)
unique(model.matrix(c.helmert))
## (Intercept) group1 group2
## 1 1 0 2
## 11 1 1 -1
## 21 1 -1 -1
summary(c.helmert)
##
## Call:
## lm(formula = actions ~ group, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.20 -1.40 -0.20 1.55 5.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0167 0.2530 15.875 <2e-16 ***
## group1 -0.4000 0.3099 -1.291 0.202
## group2 0.2167 0.1789 1.211 0.231
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.96 on 57 degrees of freedom
## Multiple R-squared: 0.0521, Adjusted R-squared: 0.01884
## F-statistic: 1.566 on 2 and 57 DF, p-value: 0.2177
# descriptives: grand mean and per-group means of actions
mean(dd$actions)
## [1] 4.016667
dd %>% group_by(group) %>% summarize(mean(actions))
## # A tibble: 3 × 2
## group `mean(actions)`
## <fct> <dbl>
## 1 NT 4.2
## 2 BT 3.4
## 3 CBT 4.45
Inspired by and taken from external sources.
Discriminant analysis is a MANOVA with outcome variables and explanatory variables interchanged. In DA we have a categorical outcome variable and interval-scaled explanatory variables. In MANOVA we have a group of interval-scaled reaction variables (bound together) and one or more categorical explanatory variables. MANOVA – The tests of significance are the same as for discriminant function analysis, but MANOVA gives no information on the individual dimensions.
from Linear discriminant analysis (LDA): Uses linear combinations of predictors to predict the class of a given observation. Assumes that the predictor variables (p) are normally distributed and the classes have identical variances (for univariate analysis, p = 1) or identical covariance matrices (for multivariate analysis, p > 1).
The LDA algorithm starts by finding directions that maximize the separation between classes, then uses these directions to predict the class of individuals. These directions, called linear discriminants, are linear combinations of predictor variables.
LDA assumes that predictors are normally distributed (Gaussian distribution) and that the different classes have class-specific means and equal variance/covariance.
Before performing LDA, consider:
Inspect the univariate distributions of each variable and make sure that they are normally distributed. If not, you can transform them using log and root for exponential distributions and Box-Cox for skewed distributions. Remove outliers from your data and standardize the variables to make their scales comparable. The linear discriminant analysis can be easily computed using the function lda() [MASS package].
A large international air carrier has collected data on employees in three different job classifications:
The director of Human Resources wants to know if these three job
classifications appeal to different personality types.
Each employee is administered a battery of psychological tests which
include measures of
# original data source (commented out: one-time download/export steps)
# dd <- foreign::read.spss("https://stats.idre.ucla.edu/wp-content/uploads/2016/02/discrim.sav")
# dd <- as.data.frame(dd)
# getwd()
# readr::write_tsv(dd, "personality_job.tsv")
# saveRDS(dd, "personality_job.rds")
# we can also get the uploaded data via
dd <- readRDS(gzcon(url("http://md.psych.bio.uni-goettingen.de/mv/data/div/personality_job.rds")))
# we take a look at the data (overall descriptives of the three test scores)
# psych::describe(dd[,c("outdoor", "social", "conservative")])
dd %>% dplyr::select(outdoor, social, conservative) %>% psych::describe()
## vars n mean sd median trimmed mad min max range skew
## outdoor 1 244 15.64 4.84 16 15.84 4.45 0 28 28 -0.41
## social 2 244 20.68 5.48 21 20.76 5.93 7 35 28 -0.09
## conservative 3 244 10.59 3.73 11 10.53 4.45 0 20 20 0.05
## kurtosis se
## outdoor 0.22 0.31
## social -0.69 0.35
## conservative -0.36 0.24
# job-type-specific descriptives
dd %>% dplyr::select(outdoor, social, conservative) %>% psych::describeBy(dd$job)
##
## Descriptive statistics by group
## group: customer service
## vars n mean sd median trimmed mad min max range skew
## outdoor 1 85 12.52 4.65 13 12.59 4.45 0 22 22 -0.25
## social 2 85 24.22 4.34 25 24.38 4.45 12 35 23 -0.35
## conservative 3 85 9.02 3.14 9 8.96 4.45 2 17 15 0.17
## kurtosis se
## outdoor -0.08 0.50
## social 0.17 0.47
## conservative -0.73 0.34
## ------------------------------------------------------------
## group: mechanic
## vars n mean sd median trimmed mad min max range skew
## outdoor 1 93 18.54 3.56 18 18.49 4.45 11 28 17 0.12
## social 2 93 21.14 4.55 21 21.23 4.45 9 29 20 -0.16
## conservative 3 93 10.14 3.24 11 10.23 2.97 0 17 17 -0.42
## kurtosis se
## outdoor -0.54 0.37
## social -0.58 0.47
## conservative 0.27 0.34
## ------------------------------------------------------------
## group: dispatch
## vars n mean sd median trimmed mad min max range skew
## outdoor 1 66 15.58 4.11 16.0 15.81 4.45 4 25 21 -0.53
## social 2 66 15.45 3.77 15.5 15.37 3.71 7 26 19 0.25
## conservative 3 66 13.24 3.69 13.0 13.39 3.71 4 20 16 -0.38
## kurtosis se
## outdoor 0.52 0.51
## social -0.20 0.46
## conservative -0.47 0.45
# correlations between the predictors
cor(dd[,c("outdoor", "social", "conservative")])
## outdoor social conservative
## outdoor 1.00000000 -0.07130338 0.07938108
## social -0.07130338 1.00000000 -0.23586453
## conservative 0.07938108 -0.23586453 1.00000000
# group sizes (absolute and relative)
table(dd$job)
##
## customer service mechanic dispatch
## 85 93 66
prop.table(table(dd$job))
##
## customer service mechanic dispatch
## 0.3483607 0.3811475 0.2704918
# library(MASS)
# Fit the LDA model: predict job type from the three personality measures
model <- MASS::lda(job ~ outdoor + social + conservative, data = dd)
model
## Call:
## lda(job ~ outdoor + social + conservative, data = dd)
##
## Prior probabilities of groups:
## customer service mechanic dispatch
## 0.3483607 0.3811475 0.2704918
##
## Group means:
## outdoor social conservative
## customer service 12.51765 24.22353 9.023529
## mechanic 18.53763 21.13978 10.139785
## dispatch 15.57576 15.45455 13.242424
##
## Coefficients of linear discriminants:
## LD1 LD2
## outdoor 0.09198065 -0.22501431
## social -0.19427415 -0.04978105
## conservative 0.15499199 0.08734288
##
## Proportion of trace:
## LD1 LD2
## 0.7712 0.2288
# Make predictions (class, posterior probabilities, discriminant scores)
predictions <- model %>% predict(dd)
names(predictions)
## [1] "class" "posterior" "x"
# predicted class for every observation
head(predictions$class)
## [1] customer service mechanic customer service customer service
## [5] customer service mechanic
## Levels: customer service mechanic dispatch
# predicted probability to be member of a group for every observation
head(predictions$posterior)
## customer service mechanic dispatch
## 1 0.9037622 0.08894785 0.0072899882
## 2 0.3677743 0.48897890 0.1432467601
## 3 0.7302117 0.26946971 0.0003186265
## 4 0.8100756 0.18217319 0.0077512155
## 5 0.7677607 0.22505382 0.0071854904
## 6 0.1682521 0.78482488 0.0469230463
# linear discriminant scores for every observation
head(predictions$x)
## LD1 LD2
## 1 -1.6423155 0.71477348
## 2 -0.1480302 0.15096436
## 3 -2.6415213 -1.68326115
## 4 -1.5493681 0.07764901
## 5 -1.5472314 -0.15994117
## 6 -0.2203876 -1.07331266
# Model accuracy: proportion of correctly classified observations
mean(predictions$class==dd$job)
## [1] 0.7581967
# plot the first two dimensions against each other
plot(model)
# plot via ggplot (color the discriminant scores by job type)
lda.data <- cbind(dd, predict(model)$x)
ggplot(lda.data, aes(LD1, LD2)) +
geom_point(aes(color = job))
LDA determines group means and computes, for each individual, the probability of belonging to the different groups. The individual is then affected to the group with the highest probability score.
The lda() outputs contain the following elements:
Prior probabilities of groups: the proportion of training observations in each group. For example, 38.1% (0.3811475) of the observations are in the mechanic group. Group means: group center of gravity. Shows the mean of each variable in each group.
Coefficients of linear discriminants: Shows the linear combination of predictor variables that are used to form the LDA decision rule. For example, LD1 = 0.0919807 * outdoor - 0.1942741 * social + 0.1549920 * conservative. Similarly, LD2 = -0.2250143 * outdoor - 0.0497810 * social + 0.0873429 * conservative.
Using the function plot()
produces plots of the linear
discriminants, obtained by computing LD1 and LD2 for each of the
observations.
The predict()
function returns the following
elements:
Das DA Vorgehen oben angewandt auf die OCD-Data aus dem Manova-Beispiel am Anfang dieser Unit.
# re-prepare the OCD data (same steps as at the top of the document)
require(tidyverse)
d2 <- read_delim("https://md.psych.bio.uni-goettingen.de/mv/data/div/ocd_data.dat", delim = "\t")
## Rows: 30 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): group
## dbl (2): actions, thoughts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d2$t <- 2
# we double the data and create a second data collection
d1 <- d2
d1$t <- 1
set.seed(123)
d1$actions <- d1$actions + round(rnorm(30) * 2) - 1
d1$thoughts <- d1$thoughts + round(rnorm(30) * 3) -2
dd <- rbind(d1, d2)
# we set factors
dd$group <- dd$group %>%
factor(levels = c("No Treatment Control", "BT", "CBT"),
labels = c("NT", "BT", "CBT"))
dd$t <- dd$t %>% factor(levels = c('1', '2'), labels=c('post', 'followup'))
# we build the outcome matrix
outcome <- cbind(dd$actions, dd$thoughts)
# we do a discriminant analysis (reversed MANOVA: predict group from actions + thoughts)
require(MASS)
## Lade nötiges Paket: MASS
##
## Attache Paket: 'MASS'
##
## Das folgende Objekt ist maskiert 'package:dplyr':
##
## select
dfa <- MASS::lda(group ~ actions + thoughts, data=dd)
dfa
## Call:
## lda(group ~ actions + thoughts, data = dd)
##
## Prior probabilities of groups:
## NT BT CBT
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## actions thoughts
## NT 4.20 14.35
## BT 3.40 14.25
## CBT 4.45 12.90
##
## Coefficients of linear discriminants:
## LD1 LD2
## actions 0.3619347 0.3625689
## thoughts -0.2119702 0.2535649
##
## Proportion of trace:
## LD1 LD2
## 0.7914 0.2086
# the first group memberships
head(dd$group)
## [1] CBT CBT CBT CBT CBT CBT
## Levels: NT BT CBT
# ... are predicted as
head(predict(dfa)$class)
## [1] BT CBT NT BT CBT NT
## Levels: NT BT CBT
# ... and are the highest value of
head(predict(dfa)$posterior)
## NT BT CBT
## 1 0.3067381 0.3743552 0.3189067
## 2 0.2241720 0.2485428 0.5272852
## 3 0.4487448 0.2647626 0.2864926
## 4 0.3243231 0.3867800 0.2888969
## 5 0.3050306 0.3083495 0.3866199
## 6 0.3632280 0.2837741 0.3529979
# observed group, predicted group, and posterior probabilities side by side
head(cbind(as.character(dd$group), as.character(predict(dfa)$class), predict(dfa)$posterior), n=20)
## NT BT CBT
## 1 "CBT" "BT" "0.306738141084304" "0.374355161955394" "0.318906696960302"
## 2 "CBT" "CBT" "0.224171986026851" "0.248542789831365" "0.527285224141784"
## 3 "CBT" "NT" "0.448744797212114" "0.264762556274188" "0.286492646513698"
## 4 "CBT" "BT" "0.324323135522401" "0.386779957244864" "0.288896907232735"
## 5 "CBT" "CBT" "0.305030645433023" "0.308349499923169" "0.386619854643808"
## 6 "CBT" "NT" "0.363227989845631" "0.283774110518339" "0.35299789963603"
## 7 "CBT" "CBT" "0.335527850584888" "0.179841327112196" "0.484630822302916"
## 8 "CBT" "BT" "0.285450973722585" "0.430421401271387" "0.284127625006028"
## 9 "CBT" "CBT" "0.325335336383161" "0.321366792296564" "0.353297871320276"
## 10 "CBT" "CBT" "0.206254661735408" "0.349072928785552" "0.444672409479041"
## 11 "BT" "CBT" "0.27319655706081" "0.234093168742039" "0.492710274197151"
## 12 "BT" "CBT" "0.305030645433023" "0.308349499923169" "0.386619854643808"
## 13 "BT" "CBT" "0.180213677339583" "0.385634999386131" "0.434151323274286"
## 14 "BT" "BT" "0.297196320129654" "0.595550016565727" "0.10725366330462"
## 15 "BT" "BT" "0.343504367614631" "0.472252795013007" "0.184242837372362"
## 16 "BT" "CBT" "0.401275329709135" "0.134539803700489" "0.464184866590376"
## 17 "BT" "CBT" "0.27319655706081" "0.234093168742039" "0.492710274197151"
## 18 "BT" "BT" "0.258561461807626" "0.568275146370853" "0.173163391821521"
## 19 "BT" "BT" "0.300639122244661" "0.442973564400645" "0.256387313354693"
## 20 "BT" "BT" "0.341466906830728" "0.397928127081761" "0.260604966087511"
# Model accuracy: proportion of correctly classified observations
predictions <- dfa %>% predict(dd)
mean(predictions$class==dd$group)
## [1] 0.4166667
# plot via ggplot (discriminant scores colored by observed group)
lda.data <- cbind(dd, predict(dfa)$x)
ggplot(lda.data, aes(LD1, LD2)) +
geom_point(aes(color = group))
LD1 und LD2 sind die Eigenvektoren der ersten bzw. zweiten
Komponente. Die Komponenten sind Linearkombinationen der Prädiktoren
actions
und thoughts
und die
Eigenvector-Koordinaten deren jeweilige Gewichte.
Canonical correlation returns a measure of association between two sets of variables. We can see one side to be a set of explanatory variables, as in multiple regression. The response variable is not a single one, but also a set of variables.
An example, adapted from an external source.
The data and the explanations are taken from this example
“A researcher has collected data on three psychological variables, four academic variables (standardized test scores) and gender for 600 college freshman. She is interested in how the set of psychological variables relates to the academic variables and gender. In particular, the researcher is interested in how many dimensions (canonical variables) are necessary to understand the association between the two sets of variables.”
This is also topic in a stackoverflow page.
See also this.
p.asym()
prüft (asymptotisch) die kanonischen
Korrelationen auf Signifikanz.
require(tidyverse)
# GGally for the ggpairs() scatterplot matrix
require(GGally)
## Lade nötiges Paket: GGally
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# CCA for canonical correlation analysis (cc(), matcor())
require(CCA)
## Lade nötiges Paket: CCA
## Lade nötiges Paket: fda
## Lade nötiges Paket: splines
## Lade nötiges Paket: fds
## Lade nötiges Paket: rainbow
## Lade nötiges Paket: pcaPP
## Lade nötiges Paket: RCurl
##
## Attache Paket: 'RCurl'
## Das folgende Objekt ist maskiert 'package:tidyr':
##
## complete
## Lade nötiges Paket: deSolve
##
## Attache Paket: 'fda'
## Das folgende Objekt ist maskiert 'package:graphics':
##
## matplot
## Lade nötiges Paket: fields
## Lade nötiges Paket: spam
## Spam version 2.8-0 (2022-01-05) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attache Paket: 'spam'
## Die folgenden Objekte sind maskiert von 'package:base':
##
## backsolve, forwardsolve
## Lade nötiges Paket: viridis
## Lade nötiges Paket: viridisLite
##
## Try help(fields) to get started.
# CCP for significance tests of canonical correlations (p.asym())
require(CCP)
## Lade nötiges Paket: CCP
# original data source (commented out: one-time download/export steps)
# dd <- read.csv("https://stats.idre.ucla.edu/stat/data/mmreg.csv")
# setwd("/Users/pzezula/owncloud/lehre_ss_2021/unit/manova")
# write_csv(dd, "mmreg.csv")
#mm <- read.csv("mmreg.csv")
mm <- read.csv("https://md.psych.bio.uni-goettingen.de/mv/data/div/mmreg.csv")
# give the columns readable names
colnames(mm) <- c("Control", "Concept", "Motivation", "Read", "Write", "Math", "Science", "Sex")
summary(mm)
## Control Concept Motivation Read
## Min. :-2.23000 Min. :-2.620000 Min. :0.0000 Min. :28.3
## 1st Qu.:-0.37250 1st Qu.:-0.300000 1st Qu.:0.3300 1st Qu.:44.2
## Median : 0.21000 Median : 0.030000 Median :0.6700 Median :52.1
## Mean : 0.09653 Mean : 0.004917 Mean :0.6608 Mean :51.9
## 3rd Qu.: 0.51000 3rd Qu.: 0.440000 3rd Qu.:1.0000 3rd Qu.:60.1
## Max. : 1.36000 Max. : 1.190000 Max. :1.0000 Max. :76.0
## Write Math Science Sex
## Min. :25.50 Min. :31.80 Min. :26.00 Min. :0.000
## 1st Qu.:44.30 1st Qu.:44.50 1st Qu.:44.40 1st Qu.:0.000
## Median :54.10 Median :51.30 Median :52.60 Median :1.000
## Mean :52.38 Mean :51.85 Mean :51.76 Mean :0.545
## 3rd Qu.:59.90 3rd Qu.:58.38 3rd Qu.:58.65 3rd Qu.:1.000
## Max. :67.10 Max. :75.50 Max. :74.20 Max. :1.000
# frequency table of Sex (coded 0/1)
xtabs(~Sex, data = mm)
## Sex
## 0 1
## 273 327
# we define our two sets of variables that we want to cancorr
# psych <- mm[, 1:3]
psych <- mm[c("Read", "Write", "Math", "Science", "Sex")]
# acad <- mm[, 4:8] #
acad <- mm[c("Control", "Concept", "Motivation")]
ggpairs(psych)
# correlations within both variable sets and between them
matcor(psych, acad)
## $Xcor
## Read Write Math Science Sex
## Read 1.00000000 0.6285909 0.6792757 0.6906929 -0.04174278
## Write 0.62859089 1.0000000 0.6326664 0.5691498 0.24433183
## Math 0.67927568 0.6326664 1.0000000 0.6495261 -0.04821830
## Science 0.69069291 0.5691498 0.6495261 1.0000000 -0.13818587
## Sex -0.04174278 0.2443318 -0.0482183 -0.1381859 1.00000000
##
## $Ycor
## Control Concept Motivation
## Control 1.0000000 0.1711878 0.2451323
## Concept 0.1711878 1.0000000 0.2885707
## Motivation 0.2451323 0.2885707 1.0000000
##
## $XYcor
## Read Write Math Science Sex Control
## Read 1.00000000 0.62859089 0.6792757 0.69069291 -0.04174278 0.3735650
## Write 0.62859089 1.00000000 0.6326664 0.56914983 0.24433183 0.3588768
## Math 0.67927568 0.63266640 1.0000000 0.64952612 -0.04821830 0.3372690
## Science 0.69069291 0.56914983 0.6495261 1.00000000 -0.13818587 0.3246269
## Sex -0.04174278 0.24433183 -0.0482183 -0.13818587 1.00000000 0.1134108
## Control 0.37356505 0.35887684 0.3372690 0.32462694 0.11341075 1.0000000
## Concept 0.06065584 0.01944856 0.0535977 0.06982633 -0.12595132 0.1711878
## Motivation 0.21060992 0.25424818 0.1950135 0.11566948 0.09810277 0.2451323
## Concept Motivation
## Read 0.06065584 0.21060992
## Write 0.01944856 0.25424818
## Math 0.05359770 0.19501347
## Science 0.06982633 0.11566948
## Sex -0.12595132 0.09810277
## Control 0.17118778 0.24513227
## Concept 1.00000000 0.28857075
## Motivation 0.28857075 1.00000000
# the canonical correlation is run by cc() of library(CCA)
cc1 <- CCA::cc(psych, acad)
# display the canonical correlations
cc1$cor
## [1] 0.4640861 0.1675092 0.1039911
# raw canonical coefficients
cc1[3:4]
## $xcoef
## [,1] [,2] [,3]
## Read -0.044620600 -0.004910024 0.021380576
## Write -0.035877112 0.042071478 0.091307329
## Math -0.023417185 0.004229478 0.009398182
## Science -0.005025152 -0.085162184 -0.109835014
## Sex -0.632119234 1.084642326 -1.794647036
##
## $ycoef
## [,1] [,2] [,3]
## Control -1.2538339 -0.6214776 -0.6616896
## Concept 0.3513499 -1.1876866 0.8267210
## Motivation -1.2624204 2.0272641 2.0002283
# compute canonical loadings
# canonical loadings are correlations between dimensions and explanatory variables
cc2 <- comput(psych, acad, cc1)
# display canonical loadings
cc2[3:6]
## $corr.X.xscores
## [,1] [,2] [,3]
## Read -0.8404480 -0.35882541 0.1353635
## Write -0.8765429 0.06483674 0.2545608
## Math -0.7639483 -0.29794884 0.1477611
## Science -0.6584139 -0.67679761 -0.2303551
## Sex -0.3641127 0.75492811 -0.5434036
##
## $corr.Y.xscores
## [,1] [,2] [,3]
## Control -0.419555307 -0.06527635 -0.01826320
## Concept -0.009673069 -0.11872021 0.07333073
## Motivation -0.263206910 0.05877699 0.07748681
##
## $corr.X.yscores
## [,1] [,2] [,3]
## Read -0.3900402 -0.06010654 0.01407661
## Write -0.4067914 0.01086075 0.02647207
## Math -0.3545378 -0.04990916 0.01536585
## Science -0.3055607 -0.11336980 -0.02395489
## Sex -0.1689796 0.12645737 -0.05650916
##
## $corr.Y.yscores
## [,1] [,2] [,3]
## Control -0.90404631 -0.3896883 -0.1756227
## Concept -0.02084327 -0.7087386 0.7051632
## Motivation -0.56715106 0.3508882 0.7451289
# tests of canonical dimensions
rho <- cc1$cor
## Define number of observations, number of variables in first set, and number of variables in the second set.
n <- dim(psych)[1] # dim() ergibt Anzahl der Zeilen (erster Wert) und Anzahl der Spalten (zweiter Wert), hier also Anzahl der Beobachtungen (Vpn)
p <- length(psych) # Anzahl der Variablen im Bündel psych
q <- length(acad) # Anzahl der Variablen im Bündel acad
## Calculate p-values using the F-approximations of different test statistics:
p.asym(rho, n, p, q, tstat = "Wilks")
## Wilks' Lambda, using F-approximation (Rao's F):
## stat approx df1 df2 p.value
## 1 to 3: 0.7543611 11.715733 15 1634.653 0.000000000
## 2 to 3: 0.9614300 2.944459 8 1186.000 0.002905057
## 3 to 3: 0.9891858 2.164612 3 594.000 0.091092180
p.asym(rho, n, p, q, tstat = "Hotelling")
## Hotelling-Lawley Trace, using F-approximation:
## stat approx df1 df2 p.value
## 1 to 3: 0.31429738 12.376333 15 1772 0.000000000
## 2 to 3: 0.03980175 2.948647 8 1778 0.002806614
## 3 to 3: 0.01093238 2.167041 3 1784 0.090013176
p.asym(rho, n, p, q, tstat = "Pillai")
## Pillai-Bartlett Trace, using F-approximation:
## stat approx df1 df2 p.value
## 1 to 3: 0.25424936 11.000571 15 1782 0.000000000
## 2 to 3: 0.03887348 2.934093 8 1788 0.002932565
## 3 to 3: 0.01081416 2.163421 3 1794 0.090440474
p.asym(rho, n, p, q, tstat = "Roy")
## Roy's Largest Root, using F-approximation:
## stat approx df1 df2 p.value
## 1 to 1: 0.2153759 54.53313 3 596 0
##
## F statistic for Roy's Greatest Root is an upper bound.
# we can also apply a permutation test
p.perm(psych, acad, nboot = 999, rhostart = 1, type = "Wilks")
## Permutation resampling using Wilks 's statistic:
## stat0 mstat nboot nexcess p
## 0.7543611 0.9747621 999 0 0
# standardized psych canonical coefficients diagonal matrix of psych sd's
s1 <- diag(sqrt(diag(cov(psych))))
s1 %*% cc1$xcoef
## [,1] [,2] [,3]
## [1,] -0.45080116 -0.04960589 0.21600760
## [2,] -0.34895712 0.40920634 0.88809662
## [3,] -0.22046662 0.03981942 0.08848141
## [4,] -0.04877502 -0.82659938 -1.06607828
## [5,] -0.31503962 0.54057096 -0.89442764
# standardized acad canonical coefficients diagonal matrix of acad sd's
s2 <- diag(sqrt(diag(cov(acad))))
s2 %*% cc1$ycoef
## [,1] [,2] [,3]
## [1,] -0.8404196 -0.4165639 -0.4435172
## [2,] 0.2478818 -0.8379278 0.5832620
## [3,] -0.4326685 0.6948029 0.6855370
# using cancor() of base package, CCA is based on that
cc2 <- cancor(psych, acad)
summary(cc2)
## Length Class Mode
## cor 3 -none- numeric
## xcoef 25 -none- numeric
## ycoef 9 -none- numeric
## xcenter 5 -none- numeric
## ycenter 3 -none- numeric
Pillai’s Test
This test is considered to be the most powerful and robust statistic for general use, especially for departures from assumptions. For example, if the MANOVA assumption of homogeneity of variance-covariance is violated, Pillai’s is your best option. It is also a good choice when you have uneven cell sizes or small sample sizes (i.e. when n is small).
However, when the hypothesis degrees of freedom is greater than one, Pillai’s tends to be less powerful than the other three. If you have a large deviation from the null hypothesis or the eigenvalues have large differences, Roy’s Maximum Root is a far better option (Seber 1984).
Source of the data
The example data come from a firm that surveyed a random sample of n = 50 of its employees in an attempt to determine which factors influence sales performance. Two collections of variables were measured:
Sales Performance:
Test Scores as a Measure of Intelligence
The source of the code
# this is, how the data were prepared
# dd <- readr::read_delim("https://online.stat.psu.edu/onlinecourses/sites/stat505/files/data/sales.txt", delim=" ", col_names=FALSE)
# colnames(dd) <- c("s_growth", "s_profit", "s_new_account", "i_creativity", "i_mech_reasoning", "i_abstract_reasoning", "i_mathematics")
# dd$subj <- 1:nrow(dd)
# dd <- dd %>% mutate_at(vars(s_growth:i_mathematics), as.numeric)
# getwd()
# readr::write_tsv(dd, "sales_data.tsv")
# saveRDS(dd, "sales_data.rds")
# Canonical correlation between the sales-performance set (s_*) and the
# intelligence-test set (i_*) for n = 50 employees; data prepared above
# and stored as an RDS file.
library(tidyverse)
# we can also get the uploaded data via
dd <- readRDS(gzcon(url("http://md.psych.bio.uni-goettingen.de/mv/data/div/sales_data.rds")))
# we define the sets
s_vars <- dd %>% dplyr::select(starts_with("s_"))
i_vars <- dd %>% dplyr::select(starts_with("i_"))
#checking the between and within set associations
cormat <- CCA::matcor(s_vars, i_vars)
#extracting the within study correlations for set 1 and set 2 and between set cor
round(cormat$Ycor, 4)
## i_creativity i_mech_reasoning i_abstract_reasoning
## i_creativity 1.0000 0.5907 0.1469
## i_mech_reasoning 0.5907 1.0000 0.3860
## i_abstract_reasoning 0.1469 0.3860 1.0000
## i_mathematics 0.4126 0.5746 0.5664
## i_mathematics
## i_creativity 0.4126
## i_mech_reasoning 0.5746
## i_abstract_reasoning 0.5664
## i_mathematics 1.0000
# The associations between the two sets can be extracted as;
# between set associations
cormat <- CCA::matcor(s_vars, i_vars)
round(cormat$XYcor, 4)
## s_growth s_profit s_new_account i_creativity
## s_growth 1.0000 0.9261 0.8840 0.5720
## s_profit 0.9261 1.0000 0.8425 0.5415
## s_new_account 0.8840 0.8425 1.0000 0.7004
## i_creativity 0.5720 0.5415 0.7004 1.0000
## i_mech_reasoning 0.7081 0.7459 0.6375 0.5907
## i_abstract_reasoning 0.6744 0.4654 0.6411 0.1469
## i_mathematics 0.9273 0.9443 0.8526 0.4126
## i_mech_reasoning i_abstract_reasoning i_mathematics
## s_growth 0.7081 0.6744 0.9273
## s_profit 0.7459 0.4654 0.9443
## s_new_account 0.6375 0.6411 0.8526
## i_creativity 0.5907 0.1469 0.4126
## i_mech_reasoning 1.0000 0.3860 0.5746
## i_abstract_reasoning 0.3860 1.0000 0.5664
## i_mathematics 0.5746 0.5664 1.0000
#obtaining the canonical correlations
can_cor1 <- CCA::cc(s_vars, i_vars)
# we get as much canonical correlations as the smaller set has variables
can_cor1$cor
## [1] 0.9944827 0.8781065 0.3836057
#raw canonical coefficients
# can_cor1[3:4]
can_cor1$xcoef
## [,1] [,2] [,3]
## s_growth -0.06237788 -0.1740703 0.3771529
## s_profit -0.02092564 0.2421641 -0.1035150
## s_new_account -0.07825817 -0.2382940 -0.3834151
can_cor1$ycoef
## [,1] [,2] [,3]
## i_creativity -0.06974814 -0.19239132 -0.24655659
## i_mech_reasoning -0.03073830 0.20157438 0.14189528
## i_abstract_reasoning -0.08956418 -0.49576326 0.28022405
## i_mathematics -0.06282997 0.06831607 -0.01133259
# we compute the canonical loadings
can_cor2 <- CCA::comput(s_vars, i_vars, can_cor1)
# we display the canonical loadings
can_cor2[3:6]
## $corr.X.xscores
## [,1] [,2] [,3]
## s_growth -0.9798776 0.0006477883 0.199598477
## s_profit -0.9464085 0.3228847489 -0.007504408
## s_new_account -0.9518620 -0.1863009724 -0.243414776
##
## $corr.Y.xscores
## [,1] [,2] [,3]
## i_creativity -0.6348095 -0.1894059 -0.24988439
## i_mech_reasoning -0.7171837 0.2086069 0.02598458
## i_abstract_reasoning -0.6436782 -0.4402237 0.22027544
## i_mathematics -0.9388771 0.1734549 0.03614570
##
## $corr.X.yscores
## [,1] [,2] [,3]
## s_growth -0.9744713 0.0005688272 0.076567107
## s_profit -0.9411869 0.2835272081 -0.002878734
## s_new_account -0.9466102 -0.1635921013 -0.093375287
##
## $corr.Y.yscores
## [,1] [,2] [,3]
## i_creativity -0.6383313 -0.2156981 -0.65140953
## i_mech_reasoning -0.7211626 0.2375644 0.06773775
## i_abstract_reasoning -0.6472493 -0.5013329 0.57422365
## i_mathematics -0.9440859 0.1975329 0.09422619
# To obtain the statistical significance of the dimensions, we are going to use the CCP package.
# test of canonical dimensions
rho <- can_cor1$cor
rho
## [1] 0.9944827 0.8781065 0.3836057
# defining the number of observations, no of variables in first set,
# and number of variables in second set
n <- dim(s_vars)[1]
p <- length(s_vars)
q <- length(i_vars)
# Calculating the F approximations using different test statistics
# using wilks test statistic
CCP::p.asym(rho, n, p, q, tstat = "Wilks")
## Wilks' Lambda, using F-approximation (Rao's F):
## stat approx df1 df2 p.value
## 1 to 3: 0.002148472 87.391525 12 114.0588 0.000000e+00
## 2 to 3: 0.195241267 18.526265 6 88.0000 8.248957e-14
## 3 to 3: 0.852846693 3.882233 2 45.0000 2.783536e-02
# standardizing the first set of canonical coefficients(s_vars)
std_coef1 <- diag(sqrt(diag(cov(s_vars))))
std_coef1 %*% can_cor1$xcoef
## [,1] [,2] [,3]
## [1,] -0.4576880 -1.277214 2.767301
## [2,] -0.2118578 2.451745 -1.048019
## [3,] -0.3687696 -1.122893 -1.806735
# standardizing the coefficients of the second set (i_vars)
std_coef2 <- diag(sqrt(diag(cov(i_vars))))
std_coef2 %*% can_cor1$ycoef
## [,1] [,2] [,3]
## [1,] -0.2755155 -0.7599743 -0.9739351
## [2,] -0.1040424 0.6822849 0.4802843
## [3,] -0.1916330 -1.0607433 0.5995720
## [4,] -0.6620838 0.7198947 -0.1194195
# clean up
#rm(list=c(s_vars, i_vars, cormat, can_cor1, can_cor2, std_coef1, std_coef2))
rm(s_vars, i_vars, cormat, can_cor1, can_cor2, std_coef1, std_coef2)
another Tutorial
Der BoxM Test prüft auf multivariate Gleichheit der Varianzen und Covarianzen. Dies ist eine der Voraussetzungen zur Korrektheit der Ergebnisse für die Manova.
BoxM()
was part of library("MVTests"), which doesn't seem
to exist any more. The same holds for library(biotools).
But library(heplots) has a
boxM()
# install.packages("heplots")
# Box's M test on the four iris measurements, grouped by species
data(iris)
# results <- MVTests::boxM(data=iris[,1:4],group=iris[,5])
results <- heplots::boxM(iris[,1:4],group=iris[,5])
summary(results) # for details
## Summary for Box's M-test of Equality of Covariance Matrices
##
## Chi-Sq: 140.943
## df: 20
## p-value: < 2.2e-16
##
## log of Covariance determinants:
## setosa versicolor virginica pooled
## -13.067360 -10.874325 -8.927058 -9.958539
##
## Eigenvalues:
## setosa versicolor virginica pooled
## 1 0.236455690 0.487873944 0.69525484 0.44356592
## 2 0.036918732 0.072384096 0.10655123 0.08618331
## 3 0.026796399 0.054776085 0.05229543 0.05535235
## 4 0.009033261 0.009790365 0.03426585 0.02236372
##
## Statistics based on eigenvalues:
## setosa versicolor virginica pooled
## product 2.113088e-06 1.893828e-05 0.0001327479 4.732183e-05
## sum 3.092041e-01 6.248245e-01 0.8883673469 6.074653e-01
## precision 5.576122e-03 7.338788e-03 0.0169121236 1.304819e-02
## max 2.364557e-01 4.878739e-01 0.6952548382 4.435659e-01
# S3 generic for Box's M test of homogeneity of covariance matrices.
# Dispatches on the class of `mod` (data.frame/matrix, formula, or lm).
boxM <-
function(mod, ...) UseMethod("boxM")
# Box's M test for homogeneity of covariance matrices (default method).
#
# Y:     numeric data.frame or matrix; columns are the response variables.
# group: grouping variable of length nrow(Y); coerced to a factor.
#
# Returns an object of classes c("htest", "boxM") containing the
# chi-square approximation of the -2 log M statistic, its df and p-value,
# plus the per-group and pooled covariance matrices, their log
# determinants, and the group/overall means.
boxM.default <- function(Y, group)
{
  dname <- deparse(substitute(Y))
  if (!inherits(Y, c("data.frame", "matrix")))
    stop(paste(dname, "must be a numeric data.frame or matrix!"))
  if (length(group) != nrow(Y))
    stop("incompatible dimensions!")
  Y <- as.matrix(Y)
  group <- as.factor(as.character(group))
  p <- ncol(Y)
  nlev <- nlevels(group)
  lev <- levels(group)
  # residual degrees of freedom per group: n_i - 1
  dfs <- tapply(group, group, length) - 1
  if (any(dfs < p))
    warning("there are one or more levels with less observations than variables!")
  # per-group covariance matrices and their df-weighted versions
  mats <- aux <- list()
  for (i in seq_len(nlev)) {
    # drop = FALSE keeps a single-row group as a matrix instead of a vector
    mats[[i]] <- cov(Y[group == lev[i], , drop = FALSE])
    aux[[i]] <- mats[[i]] * dfs[i]
  }
  names(mats) <- lev
  pooled <- Reduce("+", aux) / sum(dfs)
  logdet <- log(unlist(lapply(mats, det)))
  # -2 log M statistic
  minus2logM <- sum(dfs) * log(det(pooled)) - sum(logdet * dfs)
  # Box's small-sample chi-square correction factor
  sum1 <- sum(1 / dfs)
  Co <- (((2 * p^2) + (3 * p) - 1) / (6 * (p + 1) *
           (nlev - 1))) * (sum1 - (1 / sum(dfs)))
  X2 <- minus2logM * (1 - Co)
  # p(p+1)/2 distinct (co)variances constrained per additional group
  dfchi <- (choose(p, 2) + p) * (nlev - 1)
  pval <- pchisq(X2, dfchi, lower.tail = FALSE)
  # group means with the overall mean appended as row "_all_"
  means <- aggregate(Y, list(group), mean)
  rn <- as.character(means[, 1])
  means <- means[, -1]
  means <- rbind(means, colMeans(Y))
  rownames(means) <- c(rn, "_all_")
  logdet <- c(logdet, pooled = log(det(pooled)))
  out <- structure(
    list(statistic = c("Chi-Sq (approx.)" = X2),
         parameter = c(df = dfchi),
         p.value = pval,
         cov = mats, pooled = pooled, logDet = logdet, means = means,
         data.name = dname,
         method = " Box's M-test for Homogeneity of Covariance Matrices"
    ),
    class = c("htest", "boxM")
  )
  out
}
# Formula method for Box's M test: boxM(cbind(y1, y2, ...) ~ g1 * g2, data).
# The left-hand side must be a multivariate (matrix) response with at least
# two columns; the right-hand side must contain only factors, fully crossed.
boxM.formula <- function(formula, data, ...)
{
form <- formula
mf <- model.frame(form, data)
# reject quantitative explanatory variables (columns 2..k of the model frame)
if (any(sapply(2:dim(mf)[2], function(j) is.numeric(mf[[j]]))))
stop("Box's M test is not appropriate with quantitative explanatory variables.")
Y <- mf[,1]
if (dim(Y)[2] < 2) stop("There must be two or more response variables.")
if(dim(mf)[2]==2) group <- mf[,2]
else {
# NOTE(review): this regex is meant to reject non-crossed formula operators
# (+, |, ^, :), but the embedded spaces in the alternatives look fragile --
# confirm against the upstream heplots source before relying on it.
if (length(grep("\\+ | \\| | \\^ | \\:",form))>0) stop("Model must be completely crossed formula only.")
# several factors: collapse them into a single interaction grouping factor
group <- interaction(mf[,2:dim(mf)[2]])
}
boxM.default(Y=Y, group=group, ...)
}
# lm method for Box's M test: delegate to the formula method, using the
# fitted model's own formula and model frame.
boxM.lm <- function(y, ...) {
  model_formula <- formula(y)
  model_data <- model.frame(y)
  boxM.formula(model_formula, data = model_data, ...)
}
procedure | outcome vars | explanatory vars | jump to … |
---|---|---|---|
manova | interval scaled | categorical | Manova |
discriminant analysis | categorical | interval scaled | Discriminant Analysis |
canonical correlation | interval scaled | interval scaled | Canonical Correlation |
All methods give us some kind of weights for our explanatory variables and our response variables. So we can get individual scores by multiplying the variables at each side with their appropriate weights and adding them up. In this context, “variables” means the reaction variable matrix and the model matrix of the model in question.