---
title: "Introduction to baytaAAR"
output: rmarkdown::html_vignette
bibliography: ../inst/REFERENCES.bib
vignette: >
  %\VignetteIndexEntry{Introduction to baytaAAR}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

## Load libraries

```{r, echo = T, message=FALSE}
library(baytaAAR)
library(ggplot2)
library(bayesplot)
library(tidybayes)
library(flexsurv)
```

## Introduction

**baytaAAR** implements latent trait analysis within a Bayesian Markov Chain Monte Carlo (MCMC) framework. It is intended to estimate the age-of-death of adult individuals for whom one or several ordinal traits related to the human aging process have been assessed. It produces probability densities for the individual ages but also for the respective population as a whole. `baytaAAR` has been introduced and tested by Müller-Scheeßel et al. [-@ref_299013], and there the basic idea of the model is illustrated in the following figure:

```{r ordered probit regression, echo = FALSE, message=FALSE, fig.width=6, fig.height=4, fig.align = 'center' }
library(flexsurv)

# Thresholds and trait labels
trans <- c(0.5, 1.5, 2.5, 3.5)
trait_stages <- data.frame(
  stage = 1:5,
  trait = c("first", "second", "third", "fourth", "fifth")
)

# Model params
beta <- 2
beta0 <- -0.5 - beta * log(10)  # So that red line starts at (10, -0.5)
sigma <- 0.5
k <- 3

# Compute exact ages for thresholds
ages <- exp((trans - beta0) / beta)

# Compute true means at those ages using the red regression line
mu_vals <- beta0 + beta * log(ages)

# Function to create closed polygon paths for green Gaussians
make_gaussian_polygon <- function(x0, mu_y, clip_to_x_axis = FALSE) {

  x_seq <- seq(-k * sigma, k * sigma, length.out = 100)
  y_vals <- dnorm(x_seq, 0, sigma)
  y_scaled <- y_vals / max(y_vals) * 3

  x_vals <- x0 + y_scaled
  y_vals_shifted <- mu_y + x_seq

  if (clip_to_x_axis) {

    keep <- y_vals_shifted >= -0.5

    x_vals <- x_vals[keep]
    y_vals_shifted <- y_vals_shifted[keep]

    base_y <- -0.5

  } else {

    base_y <- mu_y - k * sigma

  }

  data.frame(
    x = c(x_vals, x0),
    y = c(y_vals_shifted, base_y),
    group = x0
  )
}

# Create green polygon data
paths_list <- mapply(function(x0, mu_y) {
  clip <- x0 == min(ages)
  make_gaussian_polygon(x0, mu_y, clip_to_x_axis = clip)
}, x0 = ages, mu_y = mu_vals, SIMPLIFY = FALSE)
paths <- do.call(rbind, paths_list)

# Latent trait density polygon (shifted down by 0.5)
latent_z <- seq(0, 5.5, length.out = 300)
latent_density <- dnorm(latent_z, mean = 2.5, sd = 1.2)
latent_x <- 10 - latent_density * 25
latent <- data.frame(
  x = c(latent_x, 10),
  y = c(latent_z - 0.5, min(latent_z - 0.5)),
  group = 1
)

# Gompertz curve (shifted down by 0.5)
age_seq <- seq(10, 100, length.out = 400)
gompertz_density <- flexsurv::dgompertz(age_seq - 10, shape = 0.06, rate = 0.002)
gompertz_density_scaled <- gompertz_density / max(gompertz_density) * 2
gompertz <- data.frame(
  x = c(age_seq, rev(age_seq)),
  y = c(-gompertz_density_scaled - 0.4, rep(-0.5, length(age_seq))),
  group = 1
)

# Axes (x-axis now at y = -0.5)
x_axis <- data.frame(x = c(10, 100), y = c(-0.5, -0.5))
y_axis <- data.frame(x = c(10, 10), y = c(-2.5, 5.5))

# Arrow for equation label
arrow_x <- 50
arrow_y <- beta0 + beta * log(arrow_x)
arrow_df <- data.frame(
  x = c(arrow_x, arrow_x),
  y = c(-0.5, arrow_y),
  xend = c(arrow_x, 10),
  yend = c(arrow_y, arrow_y)
)

# Threshold lines
thresholds <- data.frame(y = trans)

age_seq <- seq(10, 100, length.out = 400)

reg_df <- data.frame(
  x = age_seq,
  y = beta0 + beta * log(age_seq)
)

x_breaks <- seq(20, 90, by = 10)

tick_df <- data.frame(
  x = x_breaks,
  xend = x_breaks,
  y = -0.5,
  yend = -0.6   # small downward ticks
)

# Plot
  ggplot() +
  geom_line(data = reg_df,
            aes(x = x, y = y),
            color = "grey",
            linewidth = 1) +
  geom_hline(data = thresholds, aes(yintercept = y), linetype = "dashed", color = "grey50") +
  annotate("text", x = 1, y = c(trans, 4.5) - 0.5, label = trait_stages$trait, hjust = 0, size = 4) +

  geom_polygon(data = paths, aes(x = x, y = y, group = group), fill = "darkgreen", alpha = 0.4) +
  geom_polygon(data = latent, aes(x = x, y = y, group = group), fill = "steelblue", alpha = 0.4) +
  geom_polygon(data = gompertz, aes(x = x, y = y), fill = "lightgrey") +

  #geom_segment(data = x_axis, aes(x = x[1], y = y[1], xend = x[2], yend = y[2]), linewidth = 0.5) +
  #geom_segment(data = y_axis, aes(x = x[1], y = -0.5, xend = x[2], yend = y[2]), linewidth = 0.5) +
  annotate("segment", x = 10, xend = 100, y = -0.5, yend = -0.5) +
  annotate("segment", x = 10, xend = 10, y = -0.5, yend = 5.5) +
  geom_segment(data = arrow_df, aes(x = x, y = y, xend = xend, yend = yend),
               arrow = arrow(length = unit(0.2, "cm")), linewidth = 0.4) +

  annotate("text", x = arrow_x - 20, y = arrow_y + 0.3,
           label = expression(mu == beta[0] + beta[1]*log(age)), size = 5) +

  xlab("Age-at-death (years)") + ylab("Latent trait variable") +
  coord_cartesian(xlim = c(5, 90), ylim = c(-2.5, 4.25)) +
   # coord_cartesian(xlim = c(5, 90), ylim = c(-0.5, 4.25)) +
    geom_segment(data = tick_df,
                 aes(x = x, xend = xend, y = y, yend = yend),
                 linewidth = 0.4) +
    geom_text(data = data.frame(x = x_breaks),
              aes(x = x, y = -0.75, label = x),
              size = 4) +
    annotate("text",
             x = mean(range(x_breaks)),
             y = -1.2,
             label = "Age-at-death (years)",
             size = 5) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0), hjust = 0.7),
    axis.text.x  = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank()
  )
```

The figure schematically visualizes the ordered probit regression as a latent trait approach with age-at-death estimated from a Gompertz distribution (grey area). The green Gaussian distributions symbolize the transitions between the trait stages (here, arbitrarily, 5 stages, therefore 4 transitions). The regression line with its parameters intercept ($\beta_0$) and slope ($\beta_1$) determines the position of the latent trait variable. Please note that age is log-scaled, so the regression line appears curved. In practical applications, the latent trait variable does not need to be centered as it is here and the transitions are unlikely to be as evenly spaced as shown here.

More in-depth information on the background of and the rationale behind `baytaAAR` can be found in the vignettes. One of them discusses a real-life use-case with partly known ages-at-death.


## Data layout

As input, the function `bay.ta()` assumes a `matrix()` of trait expressions. In its simplest form, this may contain only one column with a single trait as in the following example ("auricular surface"):

```{r sorsum data, echo = T}
data(sorsum_as, package = "baytaAAR")
head(sorsum_as)
```

The example stems from the neolithic gallery grave Sorsum in Hessen/Germany. The auricular surface was the trait which was possible to assess most often [data from @ref_128246, table S69], and it was assessed according to the method by Lovejoy et al [-@ref_65832] which comprises eight levels. An overview of the distribution of the levels is given in the following plot:

```{r sorsum plot, echo = F, fig.width=4, fig.height=3, fig.align = 'center' }
ggplot(sorsum_as, aes(x =auricular_surface)) + geom_histogram(binwidth = 0.5) + 
  theme_light() +
  scale_x_continuous(breaks=seq(1,8,1),minor_breaks=NULL) + 
  xlab("\nauricular surface") + ylab("count\n")
```

Level 5 is the most numerous while levels 1 and 2 are only present once each. More on data input can be found in the `vignette("data_preparation")`.


## Running a first analysis

We will now run a first analysis with `bay.ta()` with the Sorsum data. We will leave most parameters at the default values and only change the minimum age to `18` and the thinning interval (`thinsteps`) to `1000`. This number is multiplied with the number of saved steps (`1000`) and the result divided by the number of chains (default = 3) to obtain the total number of iterations, minus the number of iterations used for burning-in. We also set a seed for reproducibility.

```{r sorsum analysis, echo = TRUE}
sorsum_as_res <- bay.ta(
  method = sorsum_as[,2],
  minimum_age = 18,
  thinSteps = 1000,
  numSavedSteps = 1000,
  seed = 1234
)
```

The analysis takes about 1 minute, depending on the computer power. The console output informs about the monitored parameters (`a`, `age.s`, `b`, `beta`, `beta0`, `thresh`), the sampled nodes (`age`, `beta`, `beta0`, `b`, `thresh` and `ystar`), their number and the used sampler which is `RW` for Metropolis-Hastings sampling. The model output (`sorsum_as_res` in this case) is a matrix with the number of chains (three in the default case).

The output can be processed with any function able to deal with `coda::mcmc.lists()` but `baytaAAR` provides some functions for convenience to quickly achieve diagnostic results. One of them is the function `diagnostic.summary` which does exactly what its name suggests: a summary of diagnostic measures of the parameters.

```{r sorsum diagnostics, echo = TRUE}
sorsum_as_res_diag <- diagnostic.summary(sorsum_as_res) 
sorsum_as_res_diag |> head(10) |> knitr::kable(digits = 4)
```

The diagnostic table gives a first impression of the result, above the first ten rows are shown. `diagnostics.max.min()`, another convenience function, displays minimum/maximum values of the two quality measures `Potential scale reduction factor` (PSRF, also called Gelman-Rubin statistic), a measure of chain mixing and the `Effective sample size` (ESS), a measure of autocorrelation. The `PSRF` value should be below `1.1`, and the ESS value larger than `10,000` [@ref_77051, 181; 184].

```{r sorsum diagnostic table, echo = T}
diagnostics.max.min(sorsum_as_res_diag)
```

From the output of `diagnostics.max.min()` it is clear that these values are not yet reached and that therefore more iterations would be necessary.

Another measure to assess the quality of the simulations are so-called 'trace-plots'. They illustrate the mixing of the chains which ideally should be nearly indistinguishable. For this, we use the function `bayesplot::mcmc_trace` from the R package `bayesplot` [@gabry_et_al_2019]. We also switch to the color scheme `viridis` which makes distinguishing between chains easier.

```{r sorsum trace plots, echo = T}
bayesplot::color_scheme_set("viridis")
bayesplot::mcmc_trace(sorsum_as_res, 
                      pars = c("age.s[1]", "beta[1]"), n_warmup = 300,
                      facet_args = list(nrow = 1, labeller = label_parsed))
```

The left panel gives an impression how well-mixed chains should look like for the first of the estimated ages. On contrast, the right panel illustrates chains for the parameter `beta`, the slope of the latent linear regression function, that have not yet well-mixed. Longer chains are clearly necessary. This can also be illustrated with a plot showing the `Potential scale reduction factor` (PSRF), already mentioned above:

```{r sorsum gelman plots, echo = T}
coda::gelman.plot(sorsum_as_res[, c("b", "beta[1]")])
```

Ideally, the value of the PSRF should converge to `1`, but stay below `1.1`. On the left panel for the parameter `b`, this is clearly the case, but not so on the right panel, again for the `beta` parameter (please note the difference in scale of the y-axis!). This therefore also indicates that more iterations are required.

From the minimum ESS value of around 25 (see above), it can be surmised that roughly 400 times more iterations are needed to get to a value of 10,000. This would increase runtime proportionally (approximately 200--250 minutes, equaling about 4 hours). However, because the resulting file would also be 400 times larger, some degree of thinning is necessary, so saving only, say, every 10,000th step (= thinning of 10,000). This in turn might increase autocorrelation which in turn will force you to run the model even longer.

In the current vignette, we forego this step at the moment. Instead, we inspect the seven thresholds between the eight levels. For this, we use the internal function `threshold.chains()` to compute the threshold values for each iteration. The computation of the thresholds on the age-scale is done outside of the MCMC simulation to reduce computational cost and memory usage. The function returns a `coda::mcmc.list()` which is further processed with first `diagnostic.summary()` and then `threshold.matrix()`. The latter is again a convenience function to extract mean thresholds values from the output of `diagnostic.summary()` which is particularly handy when dealing with several traits.

```{r sorsum thresholds, echo = TRUE}
thresholds <- threshold.chains(sorsum_as_res)
thresh_diag <- diagnostic.summary(thresholds)
threshold.matrix(thresh_diag) |> data.frame() |> knitr::kable(digits = 1)
```

The thresholds of this trait (auricular surface) are unevenly spaced within the age range of 18 and 100 years. The probability distribution of the thresholds can conveniently visualised with e.g. the R-package `bayesplot`. To avoid too glaring colors, we switch the color scheme again:

```{r thresholds plot, echo = TRUE, message=FALSE, warning=F, fig.width=5, fig.height=6, fig.align = 'center' }
bayesplot::color_scheme_set("gray")
bayesplot::mcmc_areas_ridges(thresholds, prob = 0.8, point_est = c("median"), 
                      border_size = 0.2) + 
  theme_light() + xlim(18,100) + labs(x = "\nage-at-death (years)")
```
The resulting plot shows impressively the considerable overlap between the thresholds, even when only the 80%-credible level is shown (grey shaded areas), but also again that the thresholds are not evenly spaced. Interesting would be a comparison with the thresholds of this trait computed for other populations, keeping in mind that the current values are not reliable as the quality measures were not yet met.

The function `age.estim.summary()` conveniently provides summaries of age-related quantities like the mean estimated age, the mean of the highest density interval of the estimated ages as well as the parameters $\alpha$, $\beta$, and - derived from these two -- the modal age M:

```{r sorsum agerange, echo = TRUE}
age.estim.summary(sorsum_as_res_diag) |> knitr::kable(digits = 3)
```

The above table demonstrates that it makes a difference whether you choose `Mean`, `Median` or `Mode` as the measure of the mean.

To illustrate the distribution of some of the individual ages, this time we rely on the functionality of the R package `tidybayes` [@kay_2024]. Its function `tidybayes::spread_draws()` allows to subset the chains in a single step to extract the estimated ages (`age.s`). We limit here the age estimates to the first seven.

```{r sorsum ages, echo = TRUE, warning=F, fig.width=5, fig.height=6, fig.align = 'center' }
sorsum_as_res |> tidybayes::spread_draws(age.s[age_number])  |> 
  subset(age_number < 8) |>
  ggplot(aes(y =  as.factor(age_number), x = age.s)) +
  tidybayes::stat_halfeye(
    .width = 0.95, point_interval = mode_hdi, fill = "lightgrey") + 
  scale_x_continuous(breaks = seq(10,100,10), limits = c(18, 100)) +
  labs( x = "\nModal age-at-death (years)", y = "Individual no.\n" ) + 
  theme_light() +
  theme(panel.grid.minor.x = element_blank(), text = element_text(size = 12))
```

The plots of the first seven age estimates look similar to the threshold plots and allow immediate assessment of the spread of the probability distribution of the age estimates. So, for example, for the first individual with stage 4 of the auricular surface, the age mode is at about 40 years, but the 95%-credible range is between about 25 to 70 years.

Together with the individual age-at-death estimates, `bay.ta()` also estimates the parameters of the underlying Gompertz function, $\alpha$ and $\beta$:

```{r sorsum Gompertz plot, echo = TRUE, fig.width=6, fig.height=4, fig.align = 'center' }
ggplot() + ylab("density\n") + 
  geom_function(fun =  function(x) 
    flexsurv::dgompertz(x - 18, sorsum_as_res_diag["b",3],
                        sorsum_as_res_diag["a",3])) +
  xlab("\nAge in years") + theme_light() +
  scale_x_continuous(breaks = seq(10,100,10), limits = c(18, 100)) +
  theme(panel.grid.minor.x = element_blank(), text = element_text(size = 12))
```

The Gompertz function provides a different perspective on the mortality structure of the population studied as it does not depend on individual point estimates of ages like, for example, [Kaplan-Meier-diagrams](https://en.wikipedia.org/wiki/Kaplan–Meier_estimator). Please note that the maximum of the curve coincides with the arithmetic mean of the modal age M (55 years) in the table above.


## Running the analysis with JAGS

Provided [JAGS](https://mcmc-jags.sourceforge.io) is installed, `bay.ta()` can also be run with JAGS. For this, it is sufficient to set the parameter `framework` to `JAGS`. All other parameters remain unchanged.

```{r sorsum analysis jags, echo = TRUE, eval=FALSE}
sorsum_as_res <- bay.ta(
  framework = "JAGS",
  method = sorsum_as[,2],
  minimum_age = 18,
  thinSteps = 100,
  numSavedSteps = 5000,
  seed = 1234
)
```

The JAGS model needs a little bit longer than the NIMBLE model but if you run the diagnostics you will see that the overall performance is superior. At this point, we leave the further analysis following the same steps as above to the reader.


## Going further

More vignettes explain framework decisions (`vignette("computation_framework")`), detail the model in mathematical terms (`vignette("mathematical_background")`), demonstrate how data sets with known age-at-death can be dealt with (`vignette("known_age")`), provide a thoroughly worked example (`vignette("articles/worked_example")`) and show how the posterior probability densities can be grouped (`vignette("articles/groupings")`).

---

## References
