Brown and Forsythe (1974) simulation

Author

Phil Chalmers

Simulation from:

Brown, M. B. and Forsythe, A. B. (1974). Robust tests for the equality of variances. Journal of the American Statistical Association, 69(346), 364–367.

User-defined functions for analyses

#-------------------------------------------------------------------
# Custom functions required (not defined in other R packages)

#' Jacknife variance test
#' 
#' @param DV a numeric vector
#' @param group a character/factor vector containing the group membership
Jacknife_vartest <- function(DV, group){
    stopifnot(length(DV) == length(group))
    nms <- unique(group)
    g <- length(nms)
    Ns <- s2 <- Ui. <- numeric(g)
    Uij <- vector('list', g)
    for(i in 1:g){
        pick <- group == nms[i]
        sub <- subset(DV, pick)
        Ns[i] <- length(sub)
        s2[i] <- var(sub)
        sij2 <- numeric(Ns[i])
        for(j in 1:length(sub))
            sij2[j] <- var(sub[-j])
        Uij[[i]] <- Ns[i] * log(s2[i]) - (Ns[i]-1) * log(sij2)
        Ui.[i] <- mean(Uij[[i]])
    }
    U.. <- mean(do.call(c, Uij))
    num <- sum(Ns * (Ui. - U..)^2) / (g-1)
    den <- 0
    for(i in 1:g)
        den <- den + sum((Uij[[i]] - Ui.[i])^2) / sum(Ns-1)
    J <- num/den
    df1 <- g-1; df2 <- sum(Ns-1)
    p.value <- pf(J, df1, df2, lower.tail = FALSE)
    data.frame(J=J, df1=df1, df2=df2, p.value=p.value, row.names="")
}

#' Layard's variance test
#' 
#' @param DV a numeric vector
#' @param group a character/factor vector containing the group membership
Layard_vartest <- function(DV, group){
    stopifnot(length(DV) == length(group))
    nms <- unique(group)
    Ns <- table(group)
    N <- sum(Ns)
    g <- length(nms)
    log_s2 <- numeric(g)
    deviation <- numeric(N)
    for(i in 1:g){
        pick <- group == nms[i]
        sub <- subset(DV, pick)
        xbar <- mean(sub)
        deviation[pick] <- sub - xbar
        log_s2[i] <- log(var(sub))
    }
    gamma <- N * sum(deviation^4) / (sum(deviation^2)^2) - 3
    tau2 <- 2 + (1 - g/N) * gamma
    S <- sum( (Ns-1) * (log_s2 - sum((Ns-1) * log_s2)/sum(Ns-1))^2) / tau2
    df <- g - 1
    p.value <- pchisq(S, df, lower.tail=FALSE)
    data.frame(S=S, df=df, p.value=p.value, row.names = "")
}

Simulation code

#-------------------------------------------------------------------
# Define the design conditions

library(SimDesign)

Design <- createDesign(var_ratio=c(4, 2, 1, 1/2, 1/4),
                       N=c(80, 20),
                       groups_equal=c(TRUE, FALSE),
                       distribution=c('Gaussian', 't4', 'Chi4', 'Cauchy'),
                       # remove redundant or not-applicable rows
                       subset = !(groups_equal & var_ratio < 1) |
                           !(distribution != 'Gaussian' & var_ratio != 1))

# Brown and Forsythe use different sample sizes when groups were unequal
pick1 <- with(Design, !groups_equal & N == 80)
pick2 <- with(Design, !groups_equal & N == 20)
Design$N[pick1] <- 60
Design$N[pick2] <- 30

Design

#-------------------------------------------------------------------

Generate <- function(condition, fixed_objects = NULL) {
    Attach(condition)
    sd1 <- ifelse(var_ratio <= 1, 1, sqrt(var_ratio))
    sd2 <- ifelse(var_ratio <= 1, sqrt(1/var_ratio), 1)
    if(groups_equal){
        N1 <- N2 <- N/2
    } else {
        N1 <- N * 1/3
        N2 <- N - N1
    }
    dv <- switch(distribution,
                 Gaussian = c(rnorm(N1, sd = sd1), rnorm(N2, sd = sd2)),
                 t4 = c(rt(N1, df = 4), rt(N2, df = 4)),
                 Chi4 = c(rchisq(N1, df = 4), rchisq(N2, df = 4)),
                 Cauchy = c(rcauchy(N1), rcauchy(N2))
    )
    dat <- data.frame(DV=dv,
                      group=c(rep('G1', N1), rep('G2', N2)))
    dat
}

Analyse <- function(condition, dat, fixed_objects = NULL) {
    Attach(dat)
    F_test <- var.test(DV ~ group)$p.value
    Jacknife <- Jacknife_vartest(DV, group)$p.value
    Layard <- Layard_vartest(DV, group)$p.value
    W50 <- levene.test(DV, group=group, location = 'median')$p.value
    W10 <- levene.test(DV, group=group, location = 'trim.mean',
                       trim.alpha = .1)$p.value
    Levene <- levene.test(DV, group=group, location = 'mean')$p.value
    ret <- c(F=F_test, Jacknife=Jacknife, Layard=Layard,
             Levene=Levene, W10=W10, W50=W50)
    ret
}

Summarise <- function(condition, results, fixed_objects = NULL) {
    ret <- EDR(results, alpha = .05)
    ret
}

#-------------------------------------------------------------------
# Run the MCS

res <- runSimulation(design=Design, replications=1000, parallel=TRUE,
                     generate=Generate, analyse=Analyse,
                     summarise=Summarise, packages='lawstat',
                     filename='BF_simulation')
res

# A tibble: 68 × 14
   var_ratio     N groups_equal distribution     F Jacknife Layard Levene   W10
       <dbl> <dbl> <lgl>        <chr>        <dbl>    <dbl>  <dbl>  <dbl> <dbl>
 1      4       80 TRUE         Gaussian     0.995    0.99   0.99   0.978 0.978
 2      2       80 TRUE         Gaussian     0.538    0.52   0.533  0.478 0.472
 3      1       80 TRUE         Gaussian     0.047    0.039  0.041  0.039 0.04 
 4      0.5     80 TRUE         Gaussian     0.577    0.56   0.564  0.51  0.501
 5      0.25    80 TRUE         Gaussian     0.99     0.984  0.986  0.978 0.975
 6      4       20 TRUE         Gaussian     0.517    0.421  0.517  0.439 0.42 
 7      2       20 TRUE         Gaussian     0.153    0.134  0.177  0.152 0.137
 8      1       20 TRUE         Gaussian     0.047    0.059  0.07   0.059 0.052
 9      0.5     20 TRUE         Gaussian     0.166    0.134  0.189  0.135 0.129
10      0.25    20 TRUE         Gaussian     0.513    0.412  0.52   0.435 0.403
# ℹ 58 more rows
# ℹ 5 more variables: W50 <dbl>, REPLICATIONS <int>, SIM_TIME <chr>,
#   COMPLETED <chr>, SEED <int>

Extras

# Type I errors
TypeI <- subset(res, var_ratio == 1)
TypeI

# A tibble: 16 × 14
   var_ratio     N groups_equal distribution     F Jacknife Layard Levene   W10
       <dbl> <dbl> <lgl>        <chr>        <dbl>    <dbl>  <dbl>  <dbl> <dbl>
 1         1    80 TRUE         Gaussian     0.047    0.039  0.041  0.039 0.04 
 2         1    20 TRUE         Gaussian     0.047    0.059  0.07   0.059 0.052
 3         1    60 FALSE        Gaussian     0.056    0.059  0.063  0.055 0.053
 4         1    30 FALSE        Gaussian     0.049    0.044  0.059  0.053 0.05 
 5         1    80 TRUE         t4           0.264    0.078  0.06   0.046 0.046
 6         1    20 TRUE         t4           0.14     0.055  0.075  0.055 0.045
 7         1    60 FALSE        t4           0.207    0.081  0.06   0.055 0.049
 8         1    30 FALSE        t4           0.173    0.066  0.069  0.058 0.049
 9         1    80 TRUE         Chi4         0.189    0.066  0.059  0.086 0.066
10         1    20 TRUE         Chi4         0.132    0.071  0.102  0.093 0.077
11         1    60 FALSE        Chi4         0.161    0.081  0.072  0.112 0.078
12         1    30 FALSE        Chi4         0.162    0.098  0.101  0.109 0.079
13         1    80 TRUE         Cauchy       0.782    0.194  0.295  0.134 0.025
14         1    20 TRUE         Cauchy       0.633    0.175  0.394  0.176 0.042
15         1    60 FALSE        Cauchy       0.766    0.2    0.303  0.165 0.027
16         1    30 FALSE        Cauchy       0.666    0.204  0.349  0.177 0.051
# ℹ 5 more variables: W50 <dbl>, REPLICATIONS <int>, SIM_TIME <chr>,
#   COMPLETED <chr>, SEED <int>

# Power
Power <- subset(res, var_ratio != 1)
Power

# A tibble: 52 × 14
   var_ratio     N groups_equal distribution     F Jacknife Layard Levene   W10
       <dbl> <dbl> <lgl>        <chr>        <dbl>    <dbl>  <dbl>  <dbl> <dbl>
 1      4       80 TRUE         Gaussian     0.995    0.99   0.99   0.978 0.978
 2      2       80 TRUE         Gaussian     0.538    0.52   0.533  0.478 0.472
 3      0.5     80 TRUE         Gaussian     0.577    0.56   0.564  0.51  0.501
 4      0.25    80 TRUE         Gaussian     0.99     0.984  0.986  0.978 0.975
 5      4       20 TRUE         Gaussian     0.517    0.421  0.517  0.439 0.42 
 6      2       20 TRUE         Gaussian     0.153    0.134  0.177  0.152 0.137
 7      0.5     20 TRUE         Gaussian     0.166    0.134  0.189  0.135 0.129
 8      0.25    20 TRUE         Gaussian     0.513    0.412  0.52   0.435 0.403
 9      4       60 FALSE        Gaussian     0.92     0.903  0.876  0.895 0.89 
10      2       60 FALSE        Gaussian     0.43     0.389  0.371  0.36  0.353
# ℹ 42 more rows
# ℹ 5 more variables: W50 <dbl>, REPLICATIONS <int>, SIM_TIME <chr>,
#   COMPLETED <chr>, SEED <int>