library(SplitKnockoff)
Author : Yuxuan Chen, Haoxue Wang, Yang Cao, Xinwei Sun, Yuan Yao
Split Knockoff is a data adaptive variable selection framework for controlling the (directional) false discovery rate (FDR) in structural sparsity, where variable selection on linear transformation of parameters is of concern. This proposed scheme relaxes the linear subspace constraint to its neighborhood, often known as variable splitting in optimization, which leads to better incoherence and possible performance improvement in both power and FDR. This vignette illustrates the usage of SplitKnockoff with simulation experiments in Cao et al. (2023) <arXiv.2310.07605> and will help you apply the split knockoff method in a light way.
install.packages("SplitKnockoff") # just one line code to install our package
This is a R implement on the Matlab version of Split Knockoffs. This R package is more convenient as glmnet can be used directly by
install.packages("glmnet") # just one line code to install the glmnet tool
Please update your glmnet package to the latest version for smooth usage of this package, the examples illustrated here are tested with glmnet 4.1-8.
For more information, please see the manual inside this package.
sk.filter(X, D, y, option) : the main function, Split Knockoff filter, for variable selection in structural sparsity problem.
sk.create(X, y, D, nu, option): generate the split knockoff copy for structural sparsity problem.
sk.W_path(X, D, y, nu, option): generate the \(W\) statistics for split knockoff on a split LASSO path.
sk.W_fixed(X, D, y, nu, option): generate the \(W\) statistics for split knockoff based on a fixed \(\beta(\lambda) = \hat{\beta}\).
select(W, q, method): this function is for variable selection based on \(W\) statistics.
install.packages("SplitKnockoff") # install our package
library(latex2exp)
library(ggplot2)
library(Matrix)
library(glmnet)
library(SplitKnockoff)
<- 20 # sparsity level
k <- 1 # magnitude
A <- 500 # sample size
n <- 100 # dimension of variables
p <- 0.5 # feature correlation
c <-1 # noise level
sigma
<- list()
option
# the target (directional) FDR control
$q <- 0.2
option
# whether to normalize the dataset
$normalize <- 'true'
option
# fraction of data used to estimate \beta(\lambda)
$frac = 2/5
option
# choice on the set of regularization parameters for split LASSO path
$lambda <- 10.^seq(0, -6, by=-0.01)
option$lambda_cv <- 10.^seq(0, -6, by=-0.01)
option
# choice of nu for split knockoffs
= seq(0, 2, by = 0.2)
expo $nu <- 10.^expo
option$nu_cv <- 10.^expo
option<- length(option$nu)
num_nu
# set random seed
$seed = 1
option
# the number of simulation instances
$tests = 200
option
$W = 's' option
= matrix(0, p-1, p)
D_G
for (i in 1:(p-1)){
= 1
D_G[i, i] +1] = -1
D_G[i, i
}
# generate D1, D2, and D3
= diag(p)
D_1 = D_G
D_2 = rbind(diag(p), D_G)
D_3 = list(D_1 = D_1, D_2 = D_2, D_3 = D_3) D_s
# generate Sigma
= matrix(0, p, p)
Sigma for( i in 1: p){
for(j in 1: p){
<- c^(abs(i - j))
Sigma[i, j]
} }
library(mvtnorm) # package mvtnorm needed for this generation
set.seed(100)
<- rmvnorm(n,matrix(0, p, 1), Sigma) # generate X X
<- matrix(0, p, 1)
beta_true for( i in 1: k){
1] = A
beta_true[i, if ( i%%3 == 1){
1] = 0
beta_true[i,
} }
Split knockoff for all \(\nu\)
# choice on the way generating the W statistics
$beta = 'path'
option
# save Z t_Z r t_Z for making plots
= list()
rawvalue
# save y
= list()
Y
= option$tests # the number of experiments
tests
# create matrices to store results
= array(0, dim = c(3, tests, num_nu, 2))
fdr_split = array(0, dim = c(3, tests, num_nu, 2))
power_split
for (test in 1:tests){
# generate varepsilon
set.seed(test)
# generate noise and y
= matrix(rnorm(n), ncol = 1) * sqrt(sigma)
varepsilon = X %*% beta_true + varepsilon
y = y
Y[[test]]
= list()
raw
for (j in 1:3){
# choose the respective D for each example
= D_s[[j]]
D
<- D %*% beta_true
gamma_true
= sk.filter(X, D, y, option)
sk_results = sk_results$results
results = sk_results$stats
raw[[j]]
for (i in 1:num_nu){
= sk_results$stats[[i]]$r
r_sign = results[[i]]$sk
result 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
fdr_split[j, test, i, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
power_split[j, test, i, = results[[i]]$sk_plus
result 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
fdr_split[j, test, i, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
power_split[j, test, i,
}
}= raw
rawvalue[[test]] }
Split knockoff for \(\nu\) chosen by cross validation
# choice on the way generating the W statistics
$beta = 'cv_all'
option
# create matrices to store results
= array(0, dim = c(3, tests, 2))
fdr_cv = array(0, dim = c(3, tests, 2))
power_cv
for (test in 1:tests){
<- Y[[test]]
y for (j in 1:3){
# choose the respective D for each example
= D_s[[j]]
D
<- D %*% beta_true
gamma_true
= sk.filter(X, D, y, option)
sk_results = sk_results$results
results = sk_results$stats$r
r_sign = results$sk
result 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
fdr_cv[j, test, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
power_cv[j, test, = results$sk_plus
result 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
fdr_cv[j, test, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
power_cv[j, test,
}
}<- apply(fdr_cv, c(1, 3), mean)
mean_fdr_cv <- apply(fdr_cv, c(1, 3), sd)
sd_fdr_cv <- apply(power_cv, c(1, 3), mean)
mean_power_cv <- apply(power_cv, c(1, 3), sd) sd_power_cv
Knockoff
library(knockoff) # package knockoff needed
# create matrices to store results
= array(0, dim = c(2, tests, 2))
fdr_knockoff = array(0, dim = c(2, tests, 2))
power_knockoff
= D_s[[2]]
D
# Calculate X_bar, y_bar
<- t(D) %*% solve(D %*% t(D))
Z
<- X %*% rep(1, p)
XF <- XF / norm(XF, "F")
U_X <- cbind(U_X, matrix(0, n, n-1))
UU_X <- qr(UU_X)
qrresult <- qr.Q(qrresult)
Qreslt <- Qreslt[,2:n]
UX_perp <- t(UX_perp) %*% y
y_bar <- t(UX_perp) %*% X %*% Z
X_bar
for (test in 1:tests){
<- Y[[test]]
y
# choose D_1 for each example
= D_s[[1]]
D
<- D %*% beta_true
gamma_true
= knockoff.filter(X, y, knockoffs = {function(x) create.fixed(x, y=y, method='equi')}, statistic = stat.lasso_lambdasmax)
k_results = k_results$statistic
W_k = select(W_k, option$q, 'knockoff')
result 1, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1)
fdr_knockoff[1, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
power_knockoff[= select(W_k, option$q, 'knockoff+')
result 1, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1)
fdr_knockoff[1, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
power_knockoff[
# choose the D_2 for each example
= D_s[[2]]
D
<- D %*% beta_true
gamma_true
= knockoff.filter(X_bar, y_bar, knockoffs = {function(x) create.fixed(x, y=y_bar, method='equi')}, statistic = stat.lasso_lambdasmax)
k_results = k_results$statistic
W_k = select(W_k, option$q, 'knockoff')
result 2, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1)
fdr_knockoff[2, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
power_knockoff[= select(W_k, option$q, 'knockoff+')
result 2, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1)
fdr_knockoff[2, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
power_knockoff[ }
Plot figure 3
<- expo
x = qt(c(0.1, 0.9), tests - 1)
t_value = t_value[1]
lower_bound = t_value[2]
upper_bound
<- apply(fdr_knockoff, c(1, 3), mean)
fdr_knockoff_mean <- apply(power_knockoff, c(1, 3), mean)
power_knockoff_mean <- apply(fdr_knockoff, c(1, 3), sd)
fdr_knockoff_sd <- apply(power_knockoff, c(1, 3), sd)
power_knockoff_sd <- apply(fdr_split, c(1, 3, 4), mean)
fdr_split_mean <- apply(fdr_split, c(1, 3, 4), sd)
fdr_split_sd <- apply(power_split, c(1, 3, 4), mean)
power_split_mean <- apply(power_split, c(1, 3, 4), sd)
power_split_sd
<- fdr_split_mean + fdr_split_sd * upper_bound
fdr_split_top <- fdr_split_mean + fdr_split_sd * lower_bound
fdr_split_bot
<- power_split_mean + power_split_sd * upper_bound
power_split_top <- power_split_mean + power_split_sd * lower_bound
power_split_bot
## for D_1
## plot for FDR
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_fdr.png', height=1000, width=1000)
<- data.frame(
plot_data x = rep(x, 4),
y = c(fdr_split_mean[1, , 1], fdr_split_mean[1, , 2], rep(fdr_knockoff_mean[1, 1], num_nu), rep(fdr_knockoff_mean[1, 2], num_nu)),
type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)
# Plotting
<- ggplot() +
fig geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 1], ymax = fdr_split_top[1, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 2], ymax = fdr_split_top[1, , 2]), fill = "blue", alpha = 0.05) +
geom_abline(intercept = option$q, slope=0, linetype="dashed") +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) +
ggtitle(expression('D'[1])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## plot for Power
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_power.png', height=1000, width=1000)
<- data.frame(
plot_data x = rep(x, 4),
y = c(power_split_mean[1, , 1], power_split_mean[1, , 2], rep(power_knockoff_mean[1, 1], num_nu), rep(power_knockoff_mean[1, 2], num_nu)),
type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)
# Plotting
<- ggplot() +
fig geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 1], ymax = power_split_top[1, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 2], ymax = power_split_top[1, , 2]), fill = "blue", alpha = 0.05) +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") +
ggtitle(expression('D'[1])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## for D_2
## plot for FDR
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_fdr.png', height=1000, width=1000)
<- data.frame(
plot_data x = rep(x, 4),
y = c(fdr_split_mean[2, , 1], fdr_split_mean[2, , 2], rep(fdr_knockoff_mean[2, 1], num_nu), rep(fdr_knockoff_mean[2, 2], num_nu)),
type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)
# Plotting
<- ggplot() +
fig geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 1], ymax = fdr_split_top[2, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 2], ymax = fdr_split_top[2, , 2]), fill = "blue", alpha = 0.05) +
geom_abline(intercept = option$q, slope=0, linetype="dashed") +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) +
ggtitle(expression('D'[2])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## plot for Power
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_power.png', height=1000, width=1000)
<- data.frame(
plot_data x = rep(x, 4),
y = c(power_split_mean[2, , 1], power_split_mean[2, , 2], rep(power_knockoff_mean[2, 1], num_nu), rep(power_knockoff_mean[2, 2], num_nu)),
type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)
# Plotting
<- ggplot() +
fig geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 1], ymax = power_split_top[2, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 2], ymax = power_split_top[2, , 2]), fill = "blue", alpha = 0.05) +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") +
ggtitle(expression('D'[2])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## for D_3
## plot for FDR
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_fdr.png', height=1000, width=1000)
<- data.frame(
plot_data x = rep(x, 2),
y = c(fdr_split_mean[3, , 1], fdr_split_mean[3, , 2]),
type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu)
)
# Plotting
<- ggplot() +
fig geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 1], ymax = fdr_split_top[3, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 2], ymax = fdr_split_top[3, , 2]), fill = "blue", alpha = 0.05) +
geom_abline(intercept = option$q, slope=0, linetype="dashed") +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) +
ggtitle(expression('D'[3])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## plot for Power
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_power.png', height=1000, width=1000)
<- data.frame(
plot_data x = rep(x, 2),
y = c(power_split_mean[3, , 1], power_split_mean[3, , 2]),
type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu)
)
# Plotting
<- ggplot() +
fig geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 1], ymax = power_split_top[3, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 2], ymax = power_split_top[3, , 2]), fill = "blue", alpha = 0.05) +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") +
ggtitle(expression('D'[3])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
saveRDS(list(mean_fdr_D1_knockoff = fdr_knockoff_mean[1, 1],
sd_fdr_D1_knockoff = fdr_knockoff_sd[1, 1],
mean_power_D1_knockoff = power_knockoff_mean[1, 1],
sd_power_D1_knockoff = power_knockoff_sd[1, 1],
mean_fdr_D2_knockoff = fdr_knockoff_mean[2, 1],
sd_fdr_D2_knockoff = fdr_knockoff_sd[2, 1],
mean_power_D2_knockoff = power_knockoff_mean[2, 1],
sd_power_D2_knockoff = power_knockoff_sd[2, 1],
mean_fdr_D1_sk = mean_fdr_cv[1, 1],
sd_fdr_D1_sk = sd_fdr_cv[1, 1],
mean_power_D1_sk = mean_power_cv[1, 1],
sd_power_D1_sk = sd_power_cv[1, 1],
mean_fdr_D2_sk = mean_fdr_cv[2, 1],
sd_fdr_D2_sk = sd_fdr_cv[2, 1],
mean_power_D2_sk = mean_power_cv[2, 1],
sd_power_D2_sk = sd_power_cv[2, 1],
mean_fdr_D3_sk = mean_fdr_cv[3, 1],
sd_fdr_D3_sk = sd_fdr_cv[3, 1],
mean_power_D3_sk = mean_power_cv[3, 1],
sd_power_D3_sk = sd_power_cv[3, 1],
mean_fdr_D1_knockoff_pl = fdr_knockoff_mean[1, 2],
sd_fdr_D1_knockoff_pl = fdr_knockoff_sd[1, 2],
mean_power_D1_knockoff_pl = power_knockoff_mean[1, 2],
sd_power_D1_knockoff_pl = power_knockoff_sd[1, 2],
mean_fdr_D2_knockoff_pl = fdr_knockoff_mean[2, 2],
sd_fdr_D2_knockoff_pl = fdr_knockoff_sd[2, 2],
mean_power_D2_knockoff_pl = power_knockoff_mean[2, 2],
sd_power_D2_knockoff_pl = power_knockoff_sd[2, 2],
mean_fdr_D1_sk_pl = mean_fdr_cv[1, 2],
sd_fdr_D1_sk_pl = sd_fdr_cv[1, 2],
mean_power_D1_sk_pl = mean_power_cv[1, 2],
sd_power_D1_sk_pl = sd_power_cv[1, 2],
mean_fdr_D2_sk_pl = mean_fdr_cv[2, 2],
sd_fdr_D2_sk_pl = sd_fdr_cv[2, 2],
mean_power_D2_sk_pl = mean_power_cv[2, 2],
sd_power_D2_sk_pl = sd_power_cv[2, 2],
mean_fdr_D3_sk_pl = mean_fdr_cv[3, 2],
sd_fdr_D3_sk_pl = sd_fdr_cv[3, 2],
mean_power_D3_sk_pl = mean_power_cv[3, 2],
sd_power_D3_sk_pl = sd_power_cv[3, 2]),
file = 'D:/SplitKnockoff_results/simu_experiments/Table/table1.rds')