## ----include = FALSE----------------------------------------------------------
# Vignette-wide knitr chunk options: show source and printed output in one
# collapsed block, and prefix each line of printed output with "#>".
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

## ----setup--------------------------------------------------------------------
library(tlda)

## ----fig.height=1, fig.width=3.5, fig.align='center', echo=FALSE--------------
# Schematic figure: a horizontal segment from "Min" (x = 1) to "Max" (x = 2)
# with a tick at x = 1.4. Grey labels above the segment give dispersion scores
# (.01, .05, .11); black labels below give the corresponding frequency-adjusted
# values (0, .40, 1). The score .05 lies 40% of the way from .01 to .11.

# Set all graphical parameters in a single call so that `oldpar` records the
# previous value of every parameter changed here (both `mar` and `xpd`);
# par(oldpar) at the end then restores both.
oldpar <- par(mar = c(0, 0, 0, 0), xpd = TRUE)

# Empty canvas; the x-range leaves room on the left for the row captions.
plot(c(1, 2), c(1, 1), xlim = c(-.2, 2.1), ylim = c(.7, 1.6), axes = FALSE, type = "n")

# Double-headed arrow with 90-degree heads, i.e. a segment with end bars.
arrows(x0 = 1, x1 = 2, y0 = 1, y1 = 1, code = 3, angle = 90, length = .05)
points(x = 1.4, y = 1, pch = "|")

# Frequency-adjusted values (below the segment, default colour).
text(x = c(1, 2, 1.4), y = .8, labels = c("0", "1", ".40"), cex = .8)

# Dispersion scores (above the segment, grey).
text(x = c(1, 2, 1.4), y = 1.2, labels = c(".01", ".11", ".05"),
     col = "grey40", cex = .8)

# Row captions, right-aligned to the left of the segment.
text(x = .7, y = 1.19, labels = "Dispersion scores", adj = 1, col = "grey40", cex = .8)
text(x = .7, y = .79, labels = "Frequency adjustment", adj = 1, cex = .8)

# Endpoint captions.
text(x = 1:2, y = 1.5, labels = c("Min", "Max"), cex = .8)

par(oldpar)

## ----echo=FALSE---------------------------------------------------------------
# Illustrative corpus with 30 parts, listed from largest to smallest.
# `partsizes[i]` is the size of part i.
partsizes <- c(
  30, 28, 25, 22, 20, 17, 16, 15, 15, 15,
  14, 14, 14, 13, 13, 12, 12, 11, 11, 10,
  10, 10, 9, 9, 8, 7, 5, 4, 3, 2
)

# `subfreqs[i]` is the number of occurrences of the item in part i
# (same order as `partsizes`).
subfreqs <- c(
  8, 3, 7, 2, 5, 3, 1, 4, 3, 0,
  2, 0, 2, 4, 1, 0, 5, 1, 0, 1,
  2, 0, 3, 1, 2, 0, 1, 0, 0, 0
)

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="Illustrative set of data."----
# Figure: the illustrative data. Each corpus part is one column of squares
# whose height equals the part size; the bottom `subfreqs[i]` squares of
# column i are filled to mark occurrences of the item.

# Outline colours: one row per possible position in a part, one column per
# part. Positions beyond a part's size stay transparent (invisible).
plot_col <- matrix(
  "transparent",
  nrow = max(partsizes),
  ncol = length(subfreqs))

for (i in seq_along(partsizes)) {
  plot_col[seq_len(partsizes[i]), i] <- "black"
}

# Fill colours: same layout; seq_len() yields an empty index for parts with
# zero occurrences, so those columns are left untouched.
plot_fill <- matrix(
  "transparent",
  nrow = max(partsizes),
  ncol = length(subfreqs))

for (i in seq_along(partsizes)) {
  plot_fill[seq_len(subfreqs[i]), i] <- "grey40"
}

# Set all graphical parameters in one call so that `oldpar` records the
# previous value of both `mar` and `xpd`, and par(oldpar) restores both.
oldpar <- par(mar = c(2, 0, 0, 0), xpd = TRUE)

# One point per matrix cell, in column-major order so that the colour
# matrices line up with the coordinates. pch = 22 is a fillable square.
plot(
  rep(seq_along(subfreqs), each = max(partsizes)),
  rep(seq_len(max(partsizes)), times = length(subfreqs)),
  col = plot_col, pch = 22, bg = plot_fill, axes = FALSE,
  xlab = "", ylab = NA, asp = .6, cex = .8, lwd = .5)

# Axis caption drawn below the plot region (possible because xpd = TRUE).
text(x = 15.5, y = -3, labels = "Corpus parts (texts)", cex = .8)

par(oldpar)

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="A **minimally pervasive** distribution."----
# Figure: a minimally pervasive distribution of the item's occurrences.
# find_min_disp() receives the observed counts and the part sizes; its result
# is used below as one count per corpus part.
subfreq_min_disp <- find_min_disp(
  subfreq = subfreqs,
  partsize = partsizes,
  freq_adjust_method = "pervasive")

# Fill colours: the bottom `subfreq_min_disp[i]` squares of column i are
# grey; seq_len() yields an empty index for parts with a count of zero.
plot_fill_min <- matrix(
  "transparent",
  nrow = max(partsizes),
  ncol = length(subfreq_min_disp))

for (i in seq_along(partsizes)) {
  plot_fill_min[seq_len(subfreq_min_disp[i]), i] <- "grey40"
}

# Set all graphical parameters in one call so that `oldpar` records the
# previous value of both `mar` and `xpd`, and par(oldpar) restores both.
oldpar <- par(mar = c(2, 0, 0, 0), xpd = TRUE)

# `plot_col` (the square outlines) is the matrix built in the earlier
# illustrative-data chunk. Points are generated in column-major order so the
# colour matrices line up with the coordinates.
plot(
  rep(seq_along(subfreq_min_disp), each = max(partsizes)),
  rep(seq_len(max(partsizes)), times = length(subfreq_min_disp)),
  col = plot_col, pch = 22, bg = plot_fill_min, axes = FALSE,
  xlab = "", ylab = NA, asp = .6, cex = .8, lwd = .5)

# Axis caption drawn below the plot region (possible because xpd = TRUE).
text(x = 15.5, y = -3, labels = "Corpus parts (texts)", cex = .8)

par(oldpar)

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="A **maximally pervasive** distribution."----
# Figure: a maximally pervasive distribution of the item's occurrences.
# find_max_disp() receives the observed counts and the part sizes; its result
# is used below as one count per corpus part.
subfreq_max_disp <- find_max_disp(
  subfreq = subfreqs,
  partsize = partsizes,
  freq_adjust_method = "pervasive")

# Fill colours: the bottom `subfreq_max_disp[i]` squares of column i are
# grey; seq_len() yields an empty index for parts with a count of zero.
plot_fill_max <- matrix(
  "transparent",
  nrow = max(partsizes),
  ncol = length(subfreq_max_disp))

for (i in seq_along(partsizes)) {
  plot_fill_max[seq_len(subfreq_max_disp[i]), i] <- "grey40"
}

# Set all graphical parameters in one call so that `oldpar` records the
# previous value of both `mar` and `xpd`, and par(oldpar) restores both.
oldpar <- par(mar = c(2, 0, 0, 0), xpd = TRUE)

# `plot_col` (the square outlines) is the matrix built in the earlier
# illustrative-data chunk. Both coordinate vectors are sized from
# `subfreq_max_disp` so they always agree with the fill matrix.
plot(
  rep(seq_along(subfreq_max_disp), each = max(partsizes)),
  rep(seq_len(max(partsizes)), times = length(subfreq_max_disp)),
  col = plot_col, pch = 22, bg = plot_fill_max, axes = FALSE,
  xlab = "", ylab = NA, asp = .6, cex = .8, lwd = .5)

# Axis caption drawn below the plot region (possible because xpd = TRUE).
text(x = 15.5, y = -3, labels = "Corpus parts (texts)", cex = .8)

par(oldpar)

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="A **minimally even** distribution."----
# Figure: a minimally even distribution of the item's occurrences.
# Note that this overwrites `subfreq_min_disp` from the "pervasive" chunk.
subfreq_min_disp <- find_min_disp(
  subfreq = subfreqs,
  partsize = partsizes,
  freq_adjust_method = "even")

# Manually move the count assigned to part 20 over to part 22 (parts 20-22
# all have size 10 in `partsizes`).
# NOTE(review): the reason for this adjustment is not evident from the code;
# presumably it is for presentation in the figure -- confirm with the author.
subfreq_min_disp[22] <- subfreq_min_disp[20]
subfreq_min_disp[20] <- 0

# Fill colours for this figure. NOTE(review): despite its name,
# `plot_fill_max` holds the fill for the *minimum* distribution here; the
# name is kept so that the objects defined at top level stay unchanged.
plot_fill_max <- matrix(
  "transparent",
  nrow = max(partsizes),
  ncol = length(subfreqs))

# seq_len() yields an empty index for parts with a count of zero.
for (i in seq_along(partsizes)) {
  plot_fill_max[seq_len(subfreq_min_disp[i]), i] <- "grey40"
}

# Set all graphical parameters in one call so that `oldpar` records the
# previous value of both `mar` and `xpd`, and par(oldpar) restores both.
oldpar <- par(mar = c(2, 0, 0, 0), xpd = TRUE)

# `plot_col` (the square outlines) is the matrix built in the earlier
# illustrative-data chunk. Points are generated in column-major order so the
# colour matrices line up with the coordinates.
plot(
  rep(seq_along(subfreqs), each = max(partsizes)),
  rep(seq_len(max(partsizes)), times = length(subfreqs)),
  col = plot_col, pch = 22, bg = plot_fill_max, axes = FALSE,
  xlab = "", ylab = NA, asp = .6, cex = .8, lwd = .5)

# Axis caption drawn below the plot region (possible because xpd = TRUE).
text(x = 15.5, y = -3, labels = "Corpus parts (texts)", cex = .8)

par(oldpar)

## ----fig.width=3.1, fig.height=2.35, fig.align='center', echo=FALSE, fig.cap="A **maximally even** distribution."----
# Figure: a maximally even distribution of the item's occurrences.
# Note that this overwrites `subfreq_max_disp` from the "pervasive" chunk.
subfreq_max_disp <- find_max_disp(
  subfreq = subfreqs,
  partsize = partsizes,
  freq_adjust_method = "even")

# Fill colours: the bottom `subfreq_max_disp[i]` squares of column i are
# grey; seq_len() yields an empty index for parts with a count of zero.
plot_fill_max <- matrix(
  "transparent",
  nrow = max(partsizes),
  ncol = length(subfreqs))

for (i in seq_along(partsizes)) {
  plot_fill_max[seq_len(subfreq_max_disp[i]), i] <- "grey40"
}

# Set all graphical parameters in one call so that `oldpar` records the
# previous value of both `mar` and `xpd`, and par(oldpar) restores both.
oldpar <- par(mar = c(2, 0, 0, 0), xpd = TRUE)

# `plot_col` (the square outlines) is the matrix built in the earlier
# illustrative-data chunk. Points are generated in column-major order so the
# colour matrices line up with the coordinates.
plot(
  rep(seq_along(subfreqs), each = max(partsizes)),
  rep(seq_len(max(partsizes)), times = length(subfreqs)),
  col = plot_col, pch = 22, bg = plot_fill_max, axes = FALSE,
  xlab = "", ylab = NA, asp = .6, cex = .8, lwd = .5)

# Axis caption drawn below the plot region (possible because xpd = TRUE).
text(x = 15.5, y = -3, labels = "Corpus parts (texts)", cex = .8)

par(oldpar)

## ----fig.width=4, fig.height=2.5, warning=FALSE, message=FALSE, out.width="30%"----
# Histogram of the values in the first row of `biber150_spokenBNC1994`.
# The x-axis label describes them as corpus-part (speaker) sizes, and the
# disp_tdm() calls in this file pass row_partsize = "first".

# Set all graphical parameters in one call so that `oldpar` records the
# previous value of both `mar` and `xpd`, and par(oldpar) restores both.
oldpar <- par(mar = c(4, 4, 1, 0.3), xpd = TRUE)

hist(
  biber150_spokenBNC1994[1, ],
  main = NULL,
  xlab = "Size of corpus parts (speakers)",
  # 40 equally spaced break points from 0 to 70000 (i.e. 39 bins);
  # `length.out` is spelled out rather than relying on partial matching.
  breaks = seq(0, 70000, length.out = 40),
  col = "grey60")

par(oldpar)

## -----------------------------------------------------------------------------
# Frequency-adjusted dispersion scores for `biber150_spokenBNC1994`, computed
# twice with settings that differ only in `freq_adjust_method`.
# NOTE(review): unit_interval = FALSE presumably leaves adjusted scores
# outside [0, 1] as they are (the chunks below count such scores) -- confirm
# against the disp_tdm() documentation.
adjusted_dispersion <- function(method) {
  disp_tdm(
    biber150_spokenBNC1994,
    row_partsize = "first",
    freq_adjust = TRUE,
    freq_adjust_method = method,
    unit_interval = FALSE,
    print_score = FALSE,
    verbose = FALSE,
    suppress_warning = TRUE)
}

DM_even <- adjusted_dispersion("even")
DM_pervasive <- adjusted_dispersion("pervasive")

## -----------------------------------------------------------------------------
# Minimum and maximum of every column of `DM_even`, ignoring missing values,
# rounded to two decimals.
round(apply(DM_even, MARGIN = 2, FUN = range, na.rm = TRUE), digits = 2)

## -----------------------------------------------------------------------------
# For every column of `DM_even`, count the scores that fall outside the unit
# interval [0, 1]; missing values are not counted.
apply(
  DM_even,
  MARGIN = 2,
  FUN = function(scores) sum(scores < 0 | scores > 1, na.rm = TRUE))

## -----------------------------------------------------------------------------
# Minimum and maximum of every column of `DM_pervasive`, ignoring missing
# values, rounded to two decimals.
round(apply(DM_pervasive, MARGIN = 2, FUN = range, na.rm = TRUE), digits = 2)

## -----------------------------------------------------------------------------
# For every column of `DM_pervasive`, count the scores that fall outside the
# unit interval [0, 1]; missing values are not counted.
apply(
  DM_pervasive,
  MARGIN = 2,
  FUN = function(scores) sum(scores < 0 | scores > 1, na.rm = TRUE))

