n = 100       # sample size
p = 10        # number of covariates
p0 = 5        # number of non-zero coefficients
sigma = 1     # noise standard deviation
sigmaX = 1    # covariate standard deviation
b0 = 5        # common value of the non-zero coefficients
beta0 = c(rep(b0,p0),rep(0,p-p0))
X = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})  # n x p Gaussian design
Y = X%*%beta0 + rnorm(n,0,sigma)                     # linear model
plot(Y, X%*%beta0)   # response against the noiseless signal
Model parameters are estimated with the \(\texttt{glmnet}\) \(\texttt{R}\) package.
library(glmnet)
n = 100
p = 10
p0 = 5
sigma = 1
sigmaX = 1
b0 = 5
beta0 = c(rep(b0,p0),rep(0,p-p0))
X = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
Y = X%*%beta0 + rnorm(n,0,sigma)
fit = glmnet(X, Y)      # lasso path over a grid of lambda values
plot(fit, label=TRUE)   # coefficient paths
names(fit)
print(fit)
coef(fit)               # coefficients for every lambda on the grid
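To inspect the fit at a single value of \(\lambda\), \(\texttt{coef}\) and \(\texttt{predict}\) accept an argument \(\texttt{s}\) (the value \(0.1\) below is arbitrary):
coef(fit, s=0.1)              # coefficients at lambda = 0.1
predict(fit, newx=X, s=0.1)   # fitted values at lambda = 0.1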
The parameter \(\lambda\) is chosen by cross-validation using:
library(glmnet)
getlasso <- function(X, Y){
  p = dim(X)[2]
  # lambda selected by the one-standard-error rule of cv.glmnet
  lambda.cv = cv.glmnet(X, Y, family="gaussian", intercept=FALSE)$lambda.1se
  bh = glmnet(X, Y, family="gaussian", intercept=FALSE, lambda=lambda.cv)$beta
  if (sum(abs(bh))==0) {bh = rep(0,p)}  # empty model: return the null vector
  return(as.vector(bh))
}
n = 100
p = 10
p0 = 5
sigma = 1
sigmaX = 1
b0 = 5
beta0 = c(rep(b0,p0),rep(0,p-p0))
X = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
Y = X%*%beta0 + rnorm(n,0,sigma)
getlasso(X,Y)
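As a quick sanity check, the estimate can be displayed next to the true coefficient vector:
round(rbind(truth=beta0, lasso=getlasso(X,Y)), 2)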
n = 100
p = 10
p0 = 5
sigma = 1
sigmaX = 1
snr = 1
# SNR = Var(x'beta0)/sigma^2 = p0*b0^2*sigmaX^2/sigma^2, hence:
b0 = sqrt(snr/p0)*sigma/sigmaX
beta0 = c(rep(b0,p0),rep(0,p-p0))
nbsimul = 10
res.lasso = matrix(NA, ncol=8, nrow=nbsimul)
for (i in 1:nbsimul){
  cat("p=",p," ","p0=",p0," ","snr=",snr," ","i=",i,"\n")
  X = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
  Y = X%*%beta0 + rnorm(n,0,sigma)
  betah = getlasso(X,Y)
  res.lasso[i,] = c(n, p, sigma, b0, snr,                    # setting
                    sum(betah-beta0), sum((betah-beta0)^2),  # estimation
                    "lasso")                                 # method
}
# format conversion: the 'method' column coerced the matrix to character
res = as.data.frame(res.lasso)
colnames(res) = c("n","p","sigma","b0","snr","bias","mse","method")
res$bias = as.numeric(as.character(res$bias))
res$mse = as.numeric(as.character(res$mse))
mean(res$bias)
mean(res$mse)
Given two vectors \(\texttt{vtrue}\) and \(\texttt{vest}\), we can compute the estimated dimension, the accuracy, the sensitivity, and the specificity with the following functions:
sh = sum(vest!=0)   # estimated dimension: size of the selected support
getACC <- function(vtrue, vest){
  p = length(vtrue)
  tp = sum(vtrue!=0 & vest!=0)  # true positives
  tn = sum(vtrue==0 & vest==0)  # true negatives
  (tp+tn)/p
}
getsens <- function(vtrue, vest){
  tp = sum(vtrue!=0 & vest!=0)
  fn = sum(vtrue!=0 & vest==0)  # false negatives
  tp/(fn+tp)
}
getspe <- function(vtrue, vest){
  tn = sum(vtrue==0 & vest==0)
  fp = sum(vtrue==0 & vest!=0)  # false positives
  tn/(tn+fp)
}
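On a small toy example (hypothetical vectors \(\texttt{vtrue}\) and \(\texttt{vest}\)), these functions give:
vtrue = c(1,1,0,0)
vest  = c(1,0,0,1)
sum(vest!=0)          # estimated dimension: 2
getACC(vtrue, vest)   # (1 TP + 1 TN)/4 = 0.5
getsens(vtrue, vest)  # 1 TP / (1 TP + 1 FN) = 0.5
getspe(vtrue, vest)   # 1 TN / (1 TN + 1 FP) = 0.5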
Beware! The hyperparameter \(\lambda\) must be tuned again on each training subsample, so estimating the prediction error requires two nested layers of cross-validation.
getprederr <- function(X, Y, K){
  n = length(Y)
  # outer loop: random split of 1:n into K folds
  out = sapply(split(sample(n), 1:K), FUN=function(ii){
    betah = getlasso(X[-ii,], Y[-ii])  # inner CV: calibration of lambda
    sum( (Y[ii] - X[ii,]%*%betah)^2 )  # squared error on the held-out fold
  })
  return(sum(out)/K)
}
n = 100
p = 10
p0 = 5
sigma = 1
sigmaX = 1
snr = 1
b0 = sqrt(snr/p0)*sigma/sigmaX
beta0 = c(rep(b0,p0),rep(0,p-p0))
nbsimul = 10
res.lasso = matrix(NA, ncol=14, nrow=nbsimul)
for (i in 1:nbsimul){
  cat("p=",p," ","p0=",p0," ","snr=",snr," ","i=",i,"\n")
  X = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
  Y = X%*%beta0 + rnorm(n,0,sigma)
  ptm = proc.time()
  betah = getlasso(X,Y)
  ptm = proc.time()-ptm
  res.lasso[i,] = c(n, p, sigma, b0, snr,                    # setting
                    sum(betah-beta0), sum((betah-beta0)^2),  # estimation
                    ptm[3],                                  # time
                    sum(betah!=0), getACC(beta0,betah),      # selection
                    getsens(beta0,betah), getspe(beta0,betah),
                    getprederr(X,Y,K=5),                     # prediction
                    "lasso")                                 # method
}
res = as.data.frame(res.lasso)
colnames(res) = c("n","p","sigma","b0","snr","bias","mse","time","sh","acc","sens","spe","pred.err","method")
# dataframe formatting
res$bias = as.numeric(as.character(res$bias))
res$mse = as.numeric(as.character(res$mse))
res$time = as.numeric(as.character(res$time))
res$sh = as.numeric(as.character(res$sh))
res$acc = as.numeric(as.character(res$acc))
res$spe = as.numeric(as.character(res$spe))
res$sens = as.numeric(as.character(res$sens))
res$pred.err = as.numeric(as.character(res$pred.err))
Here we fix \(n=100\) and \(p_0=5\), and let \(p \in \{10,50,100\}\) and \(\text{SNR} \in \{0.01, 1, 10\}\), which corresponds to three typical situations: almost no signal (difficult case), as much signal as noise (limit case), and more signal than noise (easy case).
for (p in c(10,50,100)){
  for (snr in c(0.01,1,10)){
    b0 = sqrt(snr/p0)*sigma/sigmaX
    beta0 = c(rep(b0,p0),rep(0,p-p0))
    for (i in 1:nbsimul){
      cat("p=",p," ","p0=",p0," ","snr=",snr," ","i=",i,"\n")
      X = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
      Y = X%*%beta0 + rnorm(n,0,sigma)
      # previous code (stack the result rows, e.g. with rbind)
    }
  }
}
The code can be adapted to multicore machines with the \(\texttt{parallel}\) library (\(\texttt{mclapply}\) function), as sketched below.
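A minimal sketch, assuming the body of the simulation loop is wrapped in a (hypothetical) function \(\texttt{onesimul}\) returning one result row; note that \(\texttt{mclapply}\) relies on forking, so it does not run in parallel on Windows:
library(parallel)
onesimul <- function(i){  # hypothetical wrapper around the loop body above
  X = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
  Y = X%*%beta0 + rnorm(n,0,sigma)
  betah = getlasso(X,Y)
  c(n, p, sigma, b0, snr, sum(betah-beta0), sum((betah-beta0)^2), "lasso")
}
out = mclapply(1:nbsimul, onesimul, mc.cores=detectCores()-1)
res.lasso = do.call(rbind, out)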
Results can be represented with boxplots or with averaged performances:
library(ggplot2)
# reorder the levels of p for printing (the default order is lexicographic)
res$p = factor(res$p, levels=c("10","50","100"))
ggplot(data=res, aes(x=p, y=bias, color=snr)) + geom_boxplot() +
  xlab("Dimension p") + ylab("Bias")
aggregate(bias~p*snr, data=res, mean)
tapply(res$bias, list(res$p,res$snr), mean)
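The averaged performances can also be drawn directly, for instance with \(\texttt{stat\_summary}\) (a sketch assuming \(\texttt{ggplot2}\) \(\geq 3.3\) for the \(\texttt{fun}\) argument):
ggplot(data=res, aes(x=p, y=bias, color=snr, group=snr)) +
  stat_summary(fun=mean, geom="line") +
  stat_summary(fun=mean, geom="point") +
  xlab("Dimension p") + ylab("Mean bias")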