1 Simulation of observations

# Simulation setting
n       = 100                       # number of observations
p       = 10                        # number of covariates
p0      = 5                         # number of non-zero coefficients
sigma   = 1                         # noise standard deviation
sigmaX  = 1                         # standard deviation of the covariates
b0      = 5                         # common value of the non-zero coefficients
beta0   = c(rep(b0,p0),rep(0,p-p0)) # true coefficient vector
X       = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})  # n x p design matrix
Y       = X%*%beta0 + rnorm(n,0,sigma)                     # response

plot(Y,X%*%beta0)                   # response vs. true signal

2 Computing the LASSO with \(\texttt{glmnet}\)

Model parameters are estimated using the \(\texttt{glmnet}\) \(\texttt{R}\)-package.

library(glmnet)
n       = 100
p       = 10
p0      = 5
sigma   = 1
sigmaX  = 1
b0      = 5
beta0   = c(rep(b0,p0),rep(0,p-p0))
X       = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
Y       = X%*%beta0 + rnorm(n,0,sigma)

fit = glmnet(X, Y)
plot(fit,label=TRUE)
names(fit)
print(fit)
coef(fit)

The parameter \(\lambda\) is chosen by cross-validation using:

library(glmnet)

getlasso <- function(X,Y){
  p         = dim(X)[2]
  # lambda.1se: largest lambda whose CV error is within one standard error of the minimum
  lambda.cv = cv.glmnet(X,Y, family = "gaussian", intercept=F)$lambda.1se
  bh        = glmnet(X,Y, family = "gaussian", intercept=F, lambda=lambda.cv)$beta
  if ( sum(abs(bh))==0 ) {bh = rep(0,p)}  # all coefficients shrunk to zero
  return(as.vector(bh))
}

n       = 100
p       = 10
p0      = 5
sigma   = 1
sigmaX  = 1
b0      = 5
beta0   = c(rep(b0,p0),rep(0,p-p0))
X       = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
Y       = X%*%beta0 + rnorm(n,0,sigma)

getlasso(X,Y)

3 Quality of estimation: bias and variance

n         = 100
p         = 10
p0        = 5
sigma     = 1
sigmaX    = 1
snr       = 1
b0        = sqrt(snr/p0)*sigma/sigmaX
beta0     = c(rep(b0,p0),rep(0,p-p0))       
nbsimul   = 10
res.lasso = matrix(NA,ncol=8,nrow=nbsimul)

for (i in 1:nbsimul){
  cat("p=",p," ","p0=",p0," ","snr=",snr," ","i=",i,"\n")
  X     = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
  Y     = X%*%beta0 + rnorm(n,0,sigma)
  betah = getlasso(X,Y) ; 
  res.lasso[i,] = c(n, p, sigma, b0, snr,                     # setting
                    sum(betah-beta0), sum( (betah-beta0)^2 ), # estimation
                    "lasso")                                  # method  
}

# format conversion 
res      = as.data.frame(res.lasso)
colnames(res) = c("n","p","sigma","b0","snr","bias","mse","method")
res$bias = as.numeric(as.character(res$bias))
res$mse  = as.numeric(as.character(res$mse))

mean(res$bias)
mean(res$mse)

4 Quality of model selection

Given two vectors \(\texttt{vtrue}\) and \(\texttt{vest}\), we can compute the estimated dimension, accuracy, sensitivity and specificity using the following functions:

sh = sum(vest!=0)   # estimated dimension: number of selected variables

getACC <- function(vtrue, vest){
  p   = length(vtrue)
  tp  = sum(vtrue!=0 & vest!=0)  # true positives
  tn  = sum(vtrue==0 & vest==0)  # true negatives
  (tp+tn)/p
}
getsens <- function(vtrue,vest){
  tp  = sum(vtrue!=0 & vest!=0)  # true positives
  fn  = sum(vtrue!=0 & vest==0)  # false negatives
  tp/(tp+fn)
}
getspe <- function(vtrue,vest){
  tn  = sum(vtrue==0 & vest==0)  # true negatives
  fp  = sum(vtrue==0 & vest!=0)  # false positives
  tn/(tn+fp)
}
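
For instance, these criteria can be computed for a lasso estimate obtained on the simulated data above (the names \(\texttt{vtrue}\) and \(\texttt{vest}\) below are illustrative):

vtrue = beta0          # true coefficient vector from the simulation above
vest  = getlasso(X,Y)  # estimated coefficient vector
sum(vest!=0)           # estimated dimension
getACC(vtrue, vest)
getsens(vtrue, vest)
getspe(vtrue, vest)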

5 Assessing the prediction error using K-fold cross-validation

Beware! The hyperparameter \(\lambda\) must be tuned on each training subsample, so estimating the prediction error requires two nested layers of cross-validation.

getprederr <- function(X,Y,K){
    n   = length(Y)
    p   = dim(X)[2]
    out = sapply(split(sample(n),1:K), FUN=function(ii){
        betah = getlasso(X[-ii,],Y[-ii])   # inner CV: calibration of lambda on the training folds
        sum( (Y[ii]- X[ii,]%*%betah)^2 )   # squared prediction error on the held-out fold
    })
    return(sum(out)/K)                     # average error over the K folds
}
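
On the simulated data above, the prediction error can then be estimated with, for instance, \(K=5\) folds:

getprederr(X, Y, K=5)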

6 Running first simulations

n         = 100
p         = 10
p0        = 5
sigma     = 1
sigmaX    = 1
snr       = 1
b0        = sqrt(snr/p0)*sigma/sigmaX
beta0     = c(rep(b0,p0),rep(0,p-p0))       
nbsimul   = 10
res.lasso = matrix(NA,ncol=14,nrow=nbsimul)

for (i in 1:nbsimul){
  cat("p=",p," ","p0=",p0," ","snr=",snr," ","i=",i,"\n")
  X     = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
  Y     = X%*%beta0 + rnorm(n,0,sigma)
  ptm   = proc.time() ; 
  betah = getlasso(X,Y) ; 
  ptm   = proc.time()-ptm ;
  res.lasso[i,] = c(n, p, sigma, b0, snr,                                                            # setting
                    sum(betah-beta0), sum( (betah-beta0)^2 ),                                        # estimation
                    ptm[3],                                                                          # time
                    sum(betah!=0), getACC(beta0, betah), getsens(beta0, betah), getspe(beta0, betah),# selection
                    getprederr(X,Y,K=5),                                                             # prediction
                    "lasso")                                                                         # method  
}
res      = as.data.frame(res.lasso)
colnames(res) = c("n","p","sigma","b0","snr","bias","mse","time","sh","acc","sens","spe","pred.err","method")

# dataframe formating
res$bias = as.numeric(as.character(res$bias))
res$mse  = as.numeric(as.character(res$mse))
res$time = as.numeric(as.character(res$time))
res$sh   = as.numeric(as.character(res$sh))
res$acc  = as.numeric(as.character(res$acc))
res$spe  = as.numeric(as.character(res$spe))
res$sens = as.numeric(as.character(res$sens))
res$pred.err = as.numeric(as.character(res$pred.err))
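
As a quick check, the criteria averaged over the \(\texttt{nbsimul}\) replicates can be displayed, for instance with:

colMeans(res[, c("bias","mse","time","sh","acc","sens","spe","pred.err")])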

7 Empirical properties of the lasso in small dimension

Here we fix \(n=100\), \(p_0=5\), and let \(p \in \{10,50,100\}\) and \(\text{SNR} \in \{0.01, 1, 10\}\), which corresponds to three typical situations: no signal (difficult case), as much signal as noise (limit case), and more signal than noise (easy case).

for (p in c(10,50,100)){
  for (snr in c(0.01,1,10)){
    b0        = sqrt(snr/p0)*sigma/sigmaX
    beta0     = c(rep(b0,p0),rep(0,p-p0))
    for (i in 1:nbsimul){
      cat("p=",p," ","p0=",p0," ","snr=",snr," ","i=",i,"\n")
      X     = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
      Y     = X%*%beta0 + rnorm(n,0,sigma)
      # previous code: compute and store the criteria for this (p, snr, i) configuration
    }
  }
}

The code can be adapted to multiprocessor computers with the \(\texttt{parallel}\) library (\(\texttt{mclapply}\) function).
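
A minimal sketch of such a parallelisation, distributing the replicates of a given setting over 4 cores (the helper \(\texttt{onesimul}\) and the number of cores are illustrative; \(\texttt{mclapply}\) relies on forking and is therefore only available on Unix-like systems):

library(parallel)

# one replicate of the simulation for a given setting
onesimul <- function(i, n, p, p0, snr, sigma, sigmaX){
  b0    = sqrt(snr/p0)*sigma/sigmaX
  beta0 = c(rep(b0,p0),rep(0,p-p0))
  X     = sapply(1:p, FUN=function(x){rnorm(n,0,sigmaX)})
  Y     = X%*%beta0 + rnorm(n,0,sigma)
  betah = getlasso(X,Y)
  c(sum(betah-beta0), sum((betah-beta0)^2))
}

# run the nbsimul replicates in parallel and stack the results row-wise
out = mclapply(1:nbsimul, FUN=onesimul, n=100, p=10, p0=5, snr=1,
               sigma=1, sigmaX=1, mc.cores=4)
do.call(rbind, out)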

The results can be represented with boxplots or as averaged performances:

library(ggplot2)
# put the levels of p in numeric order (factor levels default to lexicographic order)
res$p    = factor(res$p, levels=c("10","50","100"))

ggplot(data=res, aes(x=p,y=bias,color=snr)) + geom_boxplot() + xlab("Dimension p") + ylab("Bias")
aggregate(bias~p*snr,data=res,mean)
tapply(res$bias,list(res$p,res$snr),mean)