Vignette for Splitknockoff

Author : Yuxuan Chen, Haoxue Wang, Yang Cao, Xinwei Sun, Yuan Yao

Introduction

Split Knockoff is a data adaptive variable selection framework for controlling the (directional) false discovery rate (FDR) in structural sparsity, where variable selection on linear transformation of parameters is of concern. This proposed scheme relaxes the linear subspace constraint to its neighborhood, often known as variable splitting in optimization, which leads to better incoherence and possible performance improvement in both power and FDR. This vignette illustrates the usage of SplitKnockoff with simulation experiments in Cao et al. (2023) <arXiv.2310.07605> and will help you apply the split knockoff method in a light way.

install.packages("SplitKnockoff")   # just one line code to install our package

This is a R implement on the Matlab version of Split Knockoffs. This R package is more convenient as glmnet can be used directly by

install.packages("glmnet")  # just one line code to install the glmnet tool

Please update your glmnet package to the latest version for smooth usage of this package, the examples illustrated here are tested with glmnet 4.1-8.

For more information, please see the manual inside this package.

Key function

sk.filter(X, D, y, option) : the main function, Split Knockoff filter, for variable selection in structural sparsity problem.

Function involved frequently

sk.create(X, y, D, nu, option): generate the split knockoff copy for structural sparsity problem.

sk.W_path(X, D, y, nu, option): generate the \(W\) statistics for split knockoff on a split LASSO path.

sk.W_fixed(X, D, y, nu, option): generate the \(W\) statistics for split knockoff based on a fixed \(\beta(\lambda) = \hat{\beta}\).

select(W, q, method): this function is for variable selection based on \(W\) statistics.

For reproduction of the simulation, you can see the code as the following.

Simulation details

Install all the packages and library them.

install.packages("SplitKnockoff")   # install our package


library(latex2exp)
library(ggplot2)
library(Matrix)
library(glmnet)
library(SplitKnockoff)

Set the parameter for the simulation

k <- 20   # sparsity level
A <- 1    # magnitude
n <- 500  # sample size
p <- 100  # dimension of variables
c <- 0.5  # feature correlation
sigma <-1 # noise level

option <- list()

# the target (directional) FDR control
option$q <- 0.2 

# whether to normalize the dataset
option$normalize <- 'true'

# fraction of data used to estimate \beta(\lambda)
option$frac = 2/5

# choice on the set of regularization parameters for split LASSO path
option$lambda <- 10.^seq(0, -6, by=-0.01)
option$lambda_cv <- 10.^seq(0, -6, by=-0.01)

# choice of nu for split knockoffs
expo = seq(0, 2, by = 0.2)
option$nu <- 10.^expo
option$nu_cv <- 10.^expo
num_nu <- length(option$nu)

# set random seed
option$seed = 1

# the number of simulation instances
option$tests = 200 

option$W = 's'

Generate 3 types of transformation

D_G = matrix(0, p-1, p)

for (i in 1:(p-1)){
  D_G[i, i] = 1
  D_G[i, i+1] = -1
}

# generate D1, D2, and D3
D_1 = diag(p)
D_2 = D_G
D_3 = rbind(diag(p), D_G)
D_s = list(D_1 = D_1, D_2 = D_2, D_3 = D_3)

Generate \(X\)

# generate Sigma
Sigma = matrix(0, p, p)
for( i in 1: p){
  for(j in 1: p){
    Sigma[i, j] <- c^(abs(i - j))
  }
}

Package mvtnorm needed for this generation, please install it in advance

library(mvtnorm) # package mvtnorm needed for this generation
set.seed(100)
X <- rmvnorm(n,matrix(0, p, 1), Sigma) # generate X

Generate \(\beta\)

beta_true <- matrix(0, p, 1)
for( i in 1: k){
  beta_true[i, 1] = A
  if ( i%%3 == 1){
  beta_true[i, 1] = 0
  }
}

Key step

Split knockoff for all \(\nu\)

# choice on the way generating the W statistics
option$beta = 'path'

# save Z t_Z r t_Z for making plots
rawvalue = list()

# save y
Y = list()

tests = option$tests # the number of experiments

# create matrices to store results
fdr_split = array(0, dim = c(3, tests, num_nu, 2))
power_split = array(0, dim = c(3, tests, num_nu, 2))

for (test in 1:tests){
  # generate varepsilon
  set.seed(test)
    
  # generate noise and y
  varepsilon = matrix(rnorm(n), ncol = 1) * sqrt(sigma)
  y = X %*% beta_true + varepsilon
  Y[[test]] = y
  
  raw = list()

  for (j in 1:3){
    # choose the respective D for each example
    D = D_s[[j]]
    
    gamma_true <- D %*% beta_true
    
    sk_results = sk.filter(X, D, y, option)
    results = sk_results$results
    raw[[j]] = sk_results$stats
    
    for (i in 1:num_nu){
      r_sign = sk_results$stats[[i]]$r
      result = results[[i]]$sk
      fdr_split[j, test, i, 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
      power_split[j, test, i, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
      result = results[[i]]$sk_plus
      fdr_split[j, test, i, 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
      power_split[j, test, i, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
    }
  }
  rawvalue[[test]] = raw
}

Split knockoff for \(\nu\) chosen by cross validation

# choice on the way generating the W statistics
option$beta = 'cv_all'

# create matrices to store results
fdr_cv = array(0, dim = c(3, tests, 2))
power_cv = array(0, dim = c(3, tests, 2))

for (test in 1:tests){
  y <- Y[[test]]
  for (j in 1:3){
    # choose the respective D for each example
    D = D_s[[j]]
    
    gamma_true <- D %*% beta_true
    
    sk_results = sk.filter(X, D, y, option)
    results = sk_results$results
    r_sign = sk_results$stats$r
    result = results$sk
    fdr_cv[j, test, 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
    power_cv[j, test, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
    result = results$sk_plus
    fdr_cv[j, test, 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
    power_cv[j, test, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
  }
}
mean_fdr_cv <- apply(fdr_cv, c(1, 3), mean)
sd_fdr_cv <- apply(fdr_cv, c(1, 3), sd)
mean_power_cv <- apply(power_cv, c(1, 3), mean)
sd_power_cv <- apply(power_cv, c(1, 3), sd)

Knockoff

library(knockoff) # package knockoff needed

# create matrices to store results
fdr_knockoff = array(0, dim = c(2, tests, 2))
power_knockoff = array(0, dim = c(2, tests, 2))


D = D_s[[2]]
  
# Calculate X_bar, y_bar
Z <- t(D) %*% solve(D %*% t(D))

  
XF <- X %*% rep(1, p)
U_X <- XF / norm(XF, "F")
UU_X <- cbind(U_X, matrix(0, n, n-1))
qrresult <- qr(UU_X)
Qreslt <- qr.Q(qrresult)
UX_perp <- Qreslt[,2:n]
y_bar <- t(UX_perp) %*% y
X_bar <- t(UX_perp) %*% X %*% Z

for (test in 1:tests){
  y <- Y[[test]]
  
  # choose D_1 for each example
  D = D_s[[1]]
  
  gamma_true <- D %*% beta_true
    
  k_results = knockoff.filter(X, y, knockoffs = {function(x) create.fixed(x, y=y, method='equi')}, statistic = stat.lasso_lambdasmax)
  W_k = k_results$statistic
  result = select(W_k, option$q, 'knockoff')
  fdr_knockoff[1, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1)
  power_knockoff[1, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
  result = select(W_k, option$q, 'knockoff+')
  fdr_knockoff[1, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1)
  power_knockoff[1, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
  
  
  
  # choose the D_2 for each example
  D = D_s[[2]]
  
  gamma_true <- D %*% beta_true

  k_results = knockoff.filter(X_bar, y_bar, knockoffs = {function(x) create.fixed(x, y=y_bar, method='equi')}, statistic = stat.lasso_lambdasmax)
  W_k = k_results$statistic
  result = select(W_k, option$q, 'knockoff')
  fdr_knockoff[2, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1)
  power_knockoff[2, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
  result = select(W_k, option$q, 'knockoff+')
  fdr_knockoff[2, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1)
  power_knockoff[2, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
}

Plot figure 3

x <- expo
t_value = qt(c(0.1, 0.9), tests - 1)
lower_bound = t_value[1]
upper_bound = t_value[2]


fdr_knockoff_mean <- apply(fdr_knockoff, c(1, 3), mean)
power_knockoff_mean <- apply(power_knockoff, c(1, 3), mean)
fdr_knockoff_sd <- apply(fdr_knockoff, c(1, 3), sd)
power_knockoff_sd <- apply(power_knockoff, c(1, 3), sd)
fdr_split_mean <- apply(fdr_split, c(1, 3, 4), mean)
fdr_split_sd <- apply(fdr_split, c(1, 3, 4), sd)
power_split_mean <- apply(power_split, c(1, 3, 4), mean)
power_split_sd <- apply(power_split, c(1, 3, 4), sd)
    
fdr_split_top <- fdr_split_mean + fdr_split_sd * upper_bound
fdr_split_bot <- fdr_split_mean + fdr_split_sd * lower_bound

power_split_top <- power_split_mean + power_split_sd * upper_bound
power_split_bot <- power_split_mean + power_split_sd * lower_bound

## for D_1
## plot for FDR
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_fdr.png', height=1000, width=1000)
plot_data <- data.frame(
  x = rep(x, 4),
  y = c(fdr_split_mean[1, , 1], fdr_split_mean[1, , 2], rep(fdr_knockoff_mean[1, 1], num_nu), rep(fdr_knockoff_mean[1, 2], num_nu)),
  type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)

# Plotting
fig <- ggplot() +
  geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
  geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 1], ymax = fdr_split_top[1, , 1]), fill = "red", alpha = 0.05) +
  geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 2], ymax = fdr_split_top[1, , 2]), fill = "blue", alpha = 0.05) +
  geom_abline(intercept = option$q, slope=0, linetype="dashed") +
  labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) +
  ggtitle(expression('D'[1])) +
  coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
  scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))

print(fig)
dev.off()
  
  
## plot for Power
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_power.png', height=1000, width=1000)
plot_data <- data.frame(
  x = rep(x, 4),
  y = c(power_split_mean[1, , 1], power_split_mean[1, , 2], rep(power_knockoff_mean[1, 1], num_nu), rep(power_knockoff_mean[1, 2], num_nu)),
  type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)

# Plotting
fig <- ggplot() +
  geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
  geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 1], ymax = power_split_top[1, , 1]), fill = "red", alpha = 0.05) +
  geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 2], ymax = power_split_top[1, , 2]), fill = "blue", alpha = 0.05) +
  labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") +
  ggtitle(expression('D'[1])) +
  coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
  scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))
  
print(fig)
dev.off()
  
  
## for D_2
## plot for FDR
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_fdr.png', height=1000, width=1000)
plot_data <- data.frame(
  x = rep(x, 4),
  y = c(fdr_split_mean[2, , 1], fdr_split_mean[2, , 2], rep(fdr_knockoff_mean[2, 1], num_nu), rep(fdr_knockoff_mean[2, 2], num_nu)),
  type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)

# Plotting
fig <- ggplot() +
  geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
  geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 1], ymax = fdr_split_top[2, , 1]), fill = "red", alpha = 0.05) +
  geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 2], ymax = fdr_split_top[2, , 2]), fill = "blue", alpha = 0.05) +
  geom_abline(intercept = option$q, slope=0, linetype="dashed") +
  labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) +
  ggtitle(expression('D'[2])) +
  coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
  scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))
  
print(fig)
dev.off()
  
  
## plot for Power
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_power.png', height=1000, width=1000)
plot_data <- data.frame(
  x = rep(x, 4),
  y = c(power_split_mean[2, , 1], power_split_mean[2, , 2], rep(power_knockoff_mean[2, 1], num_nu), rep(power_knockoff_mean[2, 2], num_nu)),
  type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)

# Plotting
fig <- ggplot() +
  geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
  geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 1], ymax = power_split_top[2, , 1]), fill = "red", alpha = 0.05) +
  geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 2], ymax = power_split_top[2, , 2]), fill = "blue", alpha = 0.05) +
  labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") +
  ggtitle(expression('D'[2])) +
  coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
  scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))
  
print(fig)
dev.off()
  
  
  
  
## for D_3
## plot for FDR
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_fdr.png', height=1000, width=1000)
plot_data <- data.frame(
  x = rep(x, 2),
  y = c(fdr_split_mean[3, , 1], fdr_split_mean[3, , 2]),
  type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu)
)

# Plotting
fig <- ggplot() +
  geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
  geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 1], ymax = fdr_split_top[3, , 1]), fill = "red", alpha = 0.05) +
  geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 2], ymax = fdr_split_top[3, , 2]), fill = "blue", alpha = 0.05) +
  geom_abline(intercept = option$q, slope=0, linetype="dashed") +
  labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) +
  ggtitle(expression('D'[3])) +
  coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
  scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))
  
print(fig)
dev.off()
  
  
## plot for Power
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_power.png', height=1000, width=1000)
plot_data <- data.frame(
  x = rep(x, 2),
  y = c(power_split_mean[3, , 1], power_split_mean[3, , 2]),
  type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu)
)

# Plotting
fig <- ggplot() +
  geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
  geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 1], ymax = power_split_top[3, , 1]), fill = "red", alpha = 0.05) +
  geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 2], ymax = power_split_top[3, , 2]), fill = "blue", alpha = 0.05) +
  labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") +
  ggtitle(expression('D'[3])) +
  coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
  scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))
  
print(fig)
dev.off()

Save results for table 1

saveRDS(list(mean_fdr_D1_knockoff = fdr_knockoff_mean[1, 1],
             sd_fdr_D1_knockoff = fdr_knockoff_sd[1, 1],
             mean_power_D1_knockoff = power_knockoff_mean[1, 1],
             sd_power_D1_knockoff = power_knockoff_sd[1, 1],
             mean_fdr_D2_knockoff = fdr_knockoff_mean[2, 1],
             sd_fdr_D2_knockoff = fdr_knockoff_sd[2, 1],
             mean_power_D2_knockoff = power_knockoff_mean[2, 1],
             sd_power_D2_knockoff = power_knockoff_sd[2, 1],
             mean_fdr_D1_sk = mean_fdr_cv[1, 1],
             sd_fdr_D1_sk = sd_fdr_cv[1, 1],
             mean_power_D1_sk = mean_power_cv[1, 1],
             sd_power_D1_sk = sd_power_cv[1, 1],
             mean_fdr_D2_sk = mean_fdr_cv[2, 1],
             sd_fdr_D2_sk = sd_fdr_cv[2, 1],
             mean_power_D2_sk = mean_power_cv[2, 1],
             sd_power_D2_sk = sd_power_cv[2, 1],
             mean_fdr_D3_sk = mean_fdr_cv[3, 1],
             sd_fdr_D3_sk = sd_fdr_cv[3, 1],
             mean_power_D3_sk = mean_power_cv[3, 1],
             sd_power_D3_sk = sd_power_cv[3, 1],
             mean_fdr_D1_knockoff_pl = fdr_knockoff_mean[1, 2],
             sd_fdr_D1_knockoff_pl = fdr_knockoff_sd[1, 2],
             mean_power_D1_knockoff_pl = power_knockoff_mean[1, 2],
             sd_power_D1_knockoff_pl = power_knockoff_sd[1, 2],
             mean_fdr_D2_knockoff_pl = fdr_knockoff_mean[2, 2],
             sd_fdr_D2_knockoff_pl = fdr_knockoff_sd[2, 2],
             mean_power_D2_knockoff_pl = power_knockoff_mean[2, 2],
             sd_power_D2_knockoff_pl = power_knockoff_sd[2, 2],
             mean_fdr_D1_sk_pl = mean_fdr_cv[1, 2],
             sd_fdr_D1_sk_pl = sd_fdr_cv[1, 2],
             mean_power_D1_sk_pl = mean_power_cv[1, 2],
             sd_power_D1_sk_pl = sd_power_cv[1, 2],
             mean_fdr_D2_sk_pl = mean_fdr_cv[2, 2],
             sd_fdr_D2_sk_pl = sd_fdr_cv[2, 2],
             mean_power_D2_sk_pl = mean_power_cv[2, 2],
             sd_power_D2_sk_pl = sd_power_cv[2, 2],
             mean_fdr_D3_sk_pl = mean_fdr_cv[3, 2],
             sd_fdr_D3_sk_pl = sd_fdr_cv[3, 2],
             mean_power_D3_sk_pl = mean_power_cv[3, 2],
             sd_power_D3_sk_pl = sd_power_cv[3, 2]), 
        file = 'D:/SplitKnockoff_results/simu_experiments/Table/table1.rds')