在此题中,欲验证不同播期对黄花蒿种子产量是否存在影响,即“播期”视为一个总体,实验中的几个日期是从整体中随机抽出的水平。故此题对该因素水平总体做推断,应采用随机效应模型。原假设为: \[ H_0 : \sigma^2 = 0 \] 备择假设 \[ H_1 : \sigma^2 > 0 \]
# 读取文件
res <- read.csv("hao.csv")
# 转换成3×4的矩阵
dat <- sapply(levels(factor(res[,1])), function(x){res[res[,1]==x, 2]})
# 验证数据正态性
apply(dat,2,function(x){shapiro.test(x)$p.value >= 0.05})
## 1 2 3 4
## TRUE TRUE TRUE TRUE
# 方差齐性检验
leveneTest(res$seed,as.factor(res$T),center=mean)
## Levene's Test for Homogeneity of Variance (center = mean)
## Df F value Pr(>F)
## group 3 2.992 0.09563 .
## 8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
可认为各总体数据均配合正态分布,而且方差基本一致
# 计算总平方和
sst <- sum((dat - mean(dat)) ^ 2)
# 计算处理间平方和
ssa <- dim(dat)[1] * sum(sapply(1:dim(dat)[2], function(x){(mean(dat[,x]) - mean(dat))^2}))
# 处理内平方和
sse <- sst - ssa
# 误差均方
mse <- sse / (dim(dat)[1] * dim(dat)[2] - dim(dat)[2])
# 处理均方
msa <- ssa / (dim(dat)[2] - 1)
# 检验统计量
f <- msa / mse
# 做上单尾检验
p <- 1- pf(f, dim(dat)[2] - 1, dim(dat)[1] * dim(dat)[2] - dim(dat)[2])
# 结果
print(paste("接受原假设的概率p =", p))
## [1] "接受原假设的概率p = 0.00119974080499152"
# R语言自带的做一遍, 结果一致
#summary(aov(seed ~ as.factor(T), res))
# 画图
tmp <- as.data.frame(list(day = as.factor(1:4), seed = sapply(1:4, function(x){mean(res$seed[res$T == x])}), sd = sapply(1:4, function(x){sd(res$seed[res$T == x])})))
ggplot(tmp, aes(day, seed))+
geom_bar(stat="identity")+
geom_errorbar(aes(ymin = seed - sd, ymax = seed + sd), width=0.2)+
theme_classic()
接受原假设的概率为0.00119974080499152,故可以在95%的置信水平下认为不同的播期对黄花蒿种子产量具有影响。柱状图示mean(+-)SD,即误差线示标准差。
对于此题而言,我们关心的是6种溶液是否分别都对小鼠雌激素水平有影响,加之溶液应该是严格控制的水平,所以此题应先用固定效应模型检验样本是否来自同一总体,再用多重比较6种溶液和对照组的平均值是否有显著差异。
# 读取文件
res <- read.csv("rat.csv")
# 转换成3×4的矩阵
dat <- sapply(levels(factor(res[,1])), function(x){res[res[,1]==x, 2]})
# 验证数据正态性
apply(dat,2,function(x){shapiro.test(x)$p.value >= 0.05})
## 1 2 3 4 5 6 7
## TRUE TRUE TRUE TRUE FALSE TRUE TRUE
# 方差齐性检验
leveneTest(res$Weight, as.factor(res$Type), center = "mean")
## Levene's Test for Homogeneity of Variance (center = "mean")
## Df F value Pr(>F)
## group 6 1.0877 0.4016
## 21
可认为各总体基本配合正态分布,而且方差基本一致
# 计算总平方和
sst <- sum((dat - mean(dat)) ^ 2)
# 计算处理间平方和
ssa <- dim(dat)[1] * sum(sapply(1:dim(dat)[2], function(x){(mean(dat[,x]) - mean(dat))^2}))
# 处理内平方和
sse <- sst - ssa
# 误差均方
mse <- sse / (dim(dat)[1] * dim(dat)[2] - dim(dat)[2])
# 处理均方
msa <- ssa / (dim(dat)[2] - 1)
# 检验统计量
f <- msa / mse
# 做上单尾检验
p <- 1- pf(f, dim(dat)[2] - 1, dim(dat)[1] * dim(dat)[2] - dim(dat)[2])
# 做LSD检验
tmp <- sapply(2:dim(dat)[2], function(x){
# 构建统计量
t <- abs(mean(dat[,x]) - mean(dat[,1])) / sqrt(2*mse / dim(dat)[1])
# 接受原假设的概率
p <- 1 - pt(t, dim(dat)[1] * dim(dat)[2] - dim(dat)[2])
print(paste(c("使用了试剂",x-1,"与对照组均值无差异的概率为",p,ifelse(p<=0.05,
",可认为该试剂对激素水平有影响",",可认为该试剂对激素水平无影响")), collapse = ''))
return(p)
})
## [1] "使用了试剂1与对照组均值无差异的概率为0.181163969319502,可认为该试剂对激素水平无影响"
## [1] "使用了试剂2与对照组均值无差异的概率为0.0117920702699164,可认为该试剂对激素水平有影响"
## [1] "使用了试剂3与对照组均值无差异的概率为0.00212748908812332,可认为该试剂对激素水平有影响"
## [1] "使用了试剂4与对照组均值无差异的概率为0.0997297761732258,可认为该试剂对激素水平无影响"
## [1] "使用了试剂5与对照组均值无差异的概率为0.0276153668268089,可认为该试剂对激素水平有影响"
## [1] "使用了试剂6与对照组均值无差异的概率为0.00302701111758175,可认为该试剂对激素水平有影响"
# 结果
print(paste("接受原假设,即变异性与使用的试剂无关的概率p =", p))
## [1] "接受原假设,即变异性与使用的试剂无关的概率p = 0.0400918201775531"
# 画图
tmp <- as.data.frame(list(type = c("CK", 1:6), weight = sapply(1:7, function(x){mean(res$Weight[res$Type == x])}), sd = sapply(1:7, function(x){sd(res$Weight[res$Type == x])})))
ggplot(tmp, aes(type, weight, fill = type))+
geom_bar(stat="identity")+
annotate(geom = "text", x = "2", y = tmp$weight[3] + tmp$sd[3] + 5,label = "*")+
annotate(geom = "text", x = "3", y = tmp$weight[4] + tmp$sd[4] + 5,label = "*")+
annotate(geom = "text", x = "5", y = tmp$weight[6] + tmp$sd[6] + 5,label = "*")+
annotate(geom = "text", x = "6", y = tmp$weight[7] + tmp$sd[7] + 5,label = "*")+
geom_errorbar(aes(ymin = weight - sd, ymax = weight + sd), width=0.2)+
theme_classic()
柱状图顶部的“*”该表示处理对比对照组是显著的。柱状图示mean(+-)SD,即误差线示标准差。
在此题中,剂量是严格控制的水平,故为固定因素。同时,试剂种类(橙汁和VC)也是严格控制的,故也为固定因素。故此题用两因素的固定效应模型分析。
# 读取数据
res <- read.csv("ToothGrowth.csv")
# 验证数据正态性
sapply(levels(factor(res$dose)),function(x){shapiro.test(res$len[res$dose == x])$p.value >= 0.05})
## 0.5 1 2
## TRUE TRUE TRUE
sapply(levels(factor(res$supp)),function(x){shapiro.test(res$len[res$supp == x])$p.value >= 0.05})
## OJ VC
## FALSE TRUE
# 方差齐性检验
leveneTest(res$len, as.factor(res$supp), center = "mean")
## Levene's Test for Homogeneity of Variance (center = "mean")
## Df F value Pr(>F)
## group 1 1.0973 0.2992
## 58
leveneTest(res$len, as.factor(res$dose), center = "mean")
## Levene's Test for Homogeneity of Variance (center = "mean")
## Df F value Pr(>F)
## group 2 0.7328 0.485
## 57
尽管OJ的总体未通过正态性检验,但不妨认为各总体基本配合正态分布,而且方差基本一致。
# 计算试剂种类、剂量带来的平方和
level_supp <- levels(factor(res$supp))
level_dose <- levels(factor(res$dose))
a <- length(level_supp)
b <- length(level_dose)
n <- 10
ssa <- b * n * sum(sapply(level_supp, function(level){
(mean(res$len[res$supp == level]) - mean(res$len)) ^ 2
}))
ssb <- a * n * sum(sapply(level_dose, function(level){
(mean(res$len[res$dose == level]) - mean(res$len)) ^ 2
}))
# 计算由交互效应带来的平方和
ssab <- n * sum(sapply(level_supp, function(level_a){
sum(sapply(level_dose, function(level_b){
(mean(res$len[res$supp == level_a & res$dose == level_b]) -
mean(res$len[res$supp == level_a]) -
mean(res$len[res$dose == level_b]) +
mean(res$len)) ^ 2
}))
}))
# 残差
sse <- sum(sapply(level_supp, function(level_a){
sum(sapply(level_dose, function(level_b){
sum((res$len[res$supp == level_a & res$dose == level_b] - mean(res$len[res$supp == level_a & res$dose == level_b])) ^ 2)
}))
}))
# 各项自由度
dfa <- a - 1
dfb <- b - 1
dfab <- (a - 1) * (b - 1)
dfe <- a * b * (n - 1)
# 各项均方
msa <- ssa / dfa
msb <- ssb / dfb
msab <- ssab / dfab
mse <- sse / dfe
# 构造统计量
fa <- msa / mse
fb <- msb / mse
fab <- msab / mse
# 接受原假设的概率
pa <- 1 - pf(fa, dfa, dfe)
pb <- 1 - pf(fb, dfb, dfe)
pab <- 1 - pf(fab, dfab, dfe)
print(paste("接受原假设,试剂种类对牙齿长度没有影响的概率p =", pa))
## [1] "接受原假设,试剂种类对牙齿长度没有影响的概率p = 0.000231182809773411"
print(paste("接受原假设,试剂量对牙齿长度没有影响的概率p =", pb))
## [1] "接受原假设,试剂量对牙齿长度没有影响的概率p = 0"
print(paste("接受原假设,试剂种类和试剂量对牙齿长度的影响没有交互作用的概率p =", pab))
## [1] "接受原假设,试剂种类和试剂量对牙齿长度的影响没有交互作用的概率p = 0.021860268964791"
# 示交互效应
ggplot(res, aes(dose, len, color = as.factor(supp)))+
geom_point()+
geom_smooth(se = F, method = "lm")+
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
# 用R语言自带的做一遍,结论和分析一致
# summary(aov(len ~ supp + as.factor(dose) + supp * as.factor(dose), data = res))
综上,可认为试剂种类和试剂量对牙齿长度存在影响。试剂种类和试剂量存在交互作用,由图上两条拟合线斜率不相等可明显看出。
由先验知识可知,区组因素应与npk因素没有交互效应。将区组视为随机因素,将npk施肥与否视为固定因素。此题采用线性混合模型。
res <- read.csv("npk.csv")
# 分组观察不同block之间的yield
ggplot(res, aes(N, yield, color = as.factor(block)))+
geom_point()+
geom_smooth(se = F, method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
注意到各block间斜率基本一致,而截距不同。故采用随机截距。
# 由block决定随机截距
model <- lmer(yield ~ N*P*K + (1 | block), data = res)
models <- summary(model)
models
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: yield ~ N * P * K + (1 | block)
## Data: res
##
## REML criterion at convergence: 104.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.3615 -0.4731 0.1140 0.6126 1.2082
##
## Random effects:
## Groups Name Variance Std.Dev.
## block (Intercept) 15.28 3.909
## Residual 15.44 3.929
## Number of obs: 24, groups: block, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 51.4333 3.2002 9.1831 16.072 4.91e-08 ***
## N 12.3333 4.5258 9.1831 2.725 0.023 *
## P 2.9000 4.5258 9.1831 0.641 0.537
## K 0.5667 4.5258 9.1831 0.125 0.903
## N:P -8.7333 7.8322 5.6986 -1.115 0.310
## N:K -9.6667 7.8322 5.6986 -1.234 0.266
## P:K -4.4000 7.8322 5.6986 -0.562 0.596
## N:P:K 9.9333 14.2897 4.0000 0.695 0.525
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) N P K N:P N:K P:K
## N -0.707
## P -0.707 0.749
## K -0.707 0.749 0.749
## N:P 0.612 -0.865 -0.865 -0.720
## N:K 0.612 -0.865 -0.720 -0.865 0.832
## P:K 0.612 -0.720 -0.865 -0.865 0.832 0.832
## N:P:K -0.558 0.789 0.789 0.789 -0.912 -0.912 -0.912
# 残差与拟合值之间的关系
plot(model)
残差在拟合值上基本均匀分布在0的两侧,说明该模型拟合效果较好。
# 随机因素的95CI
ci95 <- confint(model, level = 0.95)[4:10,]
## Computing profile confidence intervals ...
# 各个区组间由区组因素带来的产量差异
ranef(model)
## $block
## (Intercept)
## 1 0.3126894
## 2 1.0644746
## 3 3.7190082
## 4 -4.7834829
## 5 -2.4815565
## 6 2.1688671
##
## with conditional variances for "block"
lmerTest::ranova(model)
## ANOVA-like table for random-effects: Single term deletions
##
## Model:
## yield ~ N + P + K + (1 | block) + N:P + N:K + P:K + N:P:K
## npar logLik AIC LRT Df Pr(>Chisq)
## <none> 10 -52.196 124.39
## (1 | block) 9 -54.498 127.00 4.6036 1 0.0319 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 森林图
tmp <- as.data.frame(list(point = models$coefficients[2:8,1],
lower = ci95[,1], upper = ci95[,2], id = names(ci95[,1])))
ggplot(tmp, aes(point, id, color = id))+
geom_point(size=3.6) +
geom_errorbarh(aes(xmax =upper, xmin = lower), height = 0.4) +
geom_vline(aes(xintercept = 0)) +
xlab("slope") + ylab("") +
annotate(geom = "text", x = 22, y = "N", label = "*") +
theme_classic()
由以上结果可知,在考虑交互作用的条件下,仅有添加N对产量的影响在统计学上显著,各个区组间由区组因素带来的产量差异是显著的。没有足够的证据支持是否添加N对产量的影响因是否施加P、K而不同。 因为交互效应不显著,我们可以去除交互项再做分析:
# 由block决定随机截距
model <- lmer(yield ~ N + P + K + (1 | block), data = res)
models <- summary(model)
models
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: yield ~ N + P + K + (1 | block)
## Data: res
##
## REML criterion at convergence: 128.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.79887 -0.62358 0.05054 0.65032 1.34337
##
## Random effects:
## Groups Name Variance Std.Dev.
## block (Intercept) 13.16 3.628
## Residual 16.01 4.002
## Number of obs: 24, groups: block, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 54.650 2.205 12.418 24.784 5.97e-12 ***
## N 5.617 1.634 15.000 3.438 0.00366 **
## P -1.183 1.634 15.000 -0.724 0.47999
## K -3.983 1.634 15.000 -2.438 0.02767 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) N P
## N -0.370
## P -0.370 0.000
## K -0.370 0.000 0.000
# 残差与拟合值之间的关系
plot(model)
# 随机因素的95CI
ci95 <- confint(model, level = 0.95)[4:6,]
## Computing profile confidence intervals ...
# 各个区组间由区组因素带来的产量差异
ranef(model)
## $block
## (Intercept)
## 1 -0.651767
## 2 1.974470
## 3 4.524029
## 4 -3.642227
## 5 -3.335513
## 6 1.131007
##
## with conditional variances for "block"
lmerTest::ranova(model)
## ANOVA-like table for random-effects: Single term deletions
##
## Model:
## yield ~ N + P + K + (1 | block)
## npar logLik AIC LRT Df Pr(>Chisq)
## <none> 6 -64.029 140.06
## (1 | block) 5 -66.388 142.78 4.7194 1 0.02982 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 森林图
tmp <- as.data.frame(list(point = models$coefficients[2:4,1],
lower = ci95[,1], upper = ci95[,2], id = names(ci95[,1])))
ggplot(tmp, aes(point, id, color = id))+
geom_point(size=3.6) +
geom_errorbarh(aes(xmax =upper, xmin = lower), height = 0.4) +
geom_vline(aes(xintercept = 0)) +
xlab("slope") + ylab("") +
annotate(geom = "text", x = 9, y = "N", label = "*") +
annotate(geom = "text", x = -0.5, y = "K", label = "*") +
theme_classic()
由以上结果可知,在不考虑交互效应的条件下,分别添加N、P对产量的影响在统计学上显著,各个区组间由区组因素带来的产量差异是显著的。