Multiple sequences

Multiple sequences

Simple data set where depmix errors for nstates=2

data(neuroblastoma, package="neuroblastoma")
library(data.table)
nb.dt <- data.table(neuroblastoma[["profiles"]])
nb.dt[, data.i := rank(position), keyby=.(profile.id, chromosome)]
(pro.dt <- nb.dt[profile.id=="4" & chromosome %in% paste(1:10)])
#> Key: <profile.id, chromosome>
#>       profile.id chromosome  position   logratio data.i
#>           <fctr>     <fctr>     <int>      <num>  <num>
#>    1:          4          1    809681 -0.5777670      1
#>    2:          4          1    928433 -0.4500844      2
#>    3:          4          1    987423 -0.6170561      3
#>    4:          4          1   1083595 -0.8059129      4
#>    5:          4          1   1392490 -0.6826959      5
#>   ---                                                  
#> 1797:          4         10 129101212 -0.2933589    129
#> 1798:          4         10 129523531 -0.2378638    130
#> 1799:          4         10 131219243 -0.2042331    131
#> 1800:          4         10 132561724 -0.2567005    132
#> 1801:          4         10 135221961 -0.1297339    133
library(ggplot2)
ggplot()+
  facet_grid(. ~ chromosome, scales="free", space="free")+
  scale_x_continuous(
    "Position/index in data sequence",
    breaks=seq(0, 1000, by=100))+
  scale_y_continuous(
    "logratio (data values)")+
  geom_point(aes(
    data.i, logratio),
    data=pro.dt)

plot of chunk unnamed-chunk-1

In the data set plotted above we have ten data sequences (panels from left to right), for which we can learn common HMM parameters.

pro.dt[, row := 1:.N]
all.y.vec <- pro.dt[["logratio"]]
data.list <- split(pro.dt, paste(pro.dt[["chromosome"]]))
first.row.vec <- pro.dt[data.i==1, row]
last.row.vec <- pro.dt[, .SD[data.i==.N], by=chromosome][["row"]]
n.states <- 4
log.A.mat <- log(matrix(1/n.states, n.states, n.states))
set.seed(1)
mean.vec <- rnorm(n.states)
sd.param <- 1
log.pi.vec <- log(rep(1/n.states, n.states))

Parameter initializations above. Below we initialize a matrix/array for gamma/xi which has an index sized according to the full data set (total number of observations across all ten sequences).

all.log.gamma.mat <- matrix(NA, nrow(pro.dt), n.states)
all.log.xi.array <- array(NA, c(n.states, n.states, nrow(pro.dt)))
for(chrom.i in seq_along(data.list)){
  one.chrom <- data.list[[chrom.i]]
  row.vec <- one.chrom[["row"]]
  y.vec <- one.chrom[["logratio"]]
  N.data <- length(y.vec)
  log.emission.mat <- dnorm(
    matrix(y.vec, N.data, n.states, byrow=FALSE),
    matrix(mean.vec, N.data, n.states, byrow=TRUE),
    sd.param,
    log=TRUE)
  fwd.list <- plotHMM::forward_interface(
    log.emission.mat, log.A.mat, log.pi.vec)
  log.alpha.mat <- fwd.list[["log_alpha"]]
  log.beta.mat <- plotHMM::backward_interface(log.emission.mat, log.A.mat)
  all.log.gamma.mat[row.vec, ] <- plotHMM::multiply_interface(
    log.alpha.mat, log.beta.mat)
  all.log.xi.array[,, row.vec[-N.data] ] <- plotHMM::pairwise_interface(
    log.emission.mat, log.A.mat, log.alpha.mat, log.beta.mat)
}

The code above has a for loop over the ten data sequences. At the end of each iteration row.vec is used to define the indices (specific to each sequence) where the results of multiply and pairwise are stored.

(new.log.pi.vec <- apply(
  all.log.gamma.mat[first.row.vec,]-log(length(first.row.vec)),
  2, plotHMM::logsumexp))
#> [1] -1.187884 -1.101956 -1.307548 -2.381292

Above we compute the new log initial state probabilities, using first.row.vec to define the indices of the first data points in each sequence.

prob.mat <- exp(all.log.gamma.mat)
(new.mean.vec <- colSums(all.y.vec*prob.mat)/colSums(prob.mat))
#> [1] -0.10342880 -0.05626308 -0.11673591  0.01231217
resid.mat <- all.y.vec-matrix(
  new.mean.vec, length(all.y.vec), n.states, byrow=TRUE)
var.est <- sum(prob.mat * resid.mat^2) / sum(prob.mat)
(new.sd.param <- sqrt(var.est))
#> [1] 0.2419531

Above we compute the new emission (mean and sd) parameters.

new.log.A.mat <- plotHMM::transition_interface(
  all.log.gamma.mat[-last.row.vec,],
  all.log.xi.array[,, -last.row.vec])

Finally we compute a new log transition probability matrix, using last.row.vec to exclude the last data point in each sequence.