## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE, comment = "#>",
  eval = FALSE
)

## ----fl-load------------------------------------------------------------------
#  library(treeSS)
#  data(fl_deaths)
#  
#  str(fl_deaths)
#  #> 'data.frame': N obs. of 5 variables:
#  #>  $ county_fips: chr  ...
#  #>  $ icd10_code : chr  "C56" "C61" "I10" ...
#  #>  $ icd10_desc : chr  ...
#  #>  $ deaths     : int  ...
#  #>  $ population : int  ...
#  
#  c(rows         = nrow(fl_deaths),
#    counties     = length(unique(fl_deaths$county_fips)),
#    icd10_codes  = length(unique(fl_deaths$icd10_code)),
#    total_deaths = sum(fl_deaths$deaths))

## ----fl-tree------------------------------------------------------------------
#  icd_codes <- sort(unique(fl_deaths$icd10_code))
#  
#  parsed <- data.frame(
#    code  = icd_codes,
#    three = ifelse(grepl("\\.", icd_codes),
#                    sub("\\..*", "", icd_codes),
#                    icd_codes),
#    is_three_leaf = !grepl("\\.", icd_codes) & nchar(icd_codes) == 3,
#    stringsAsFactors = FALSE
#  )
#  
#  # Map 3-character codes to ICD-10 chapters
#  icd_chapter_of <- function(three) {
#    L   <- substr(three, 1, 1)
#    num <- suppressWarnings(as.integer(substr(three, 2, 3)))
#    switch(L,
#      A = "I_AB", B = "I_AB",
#      C = "II_C_D",
#      D = if (!is.na(num) && num <= 48) "II_C_D" else "III_D",
#      E = "IV_E",  F = "V_F",  G = "VI_G",
#      H = if (!is.na(num) && num <= 59) "VII_H" else "VIII_H",
#      I = "IX_I",  J = "X_J",  K = "XI_K", L = "XII_L",
#      M = "XIII_M", N = "XIV_N", O = "XV_O", P = "XVI_P",
#      Q = "XVII_Q", R = "XVIII_R",
#      S = "XIX_S_T", T = "XIX_S_T",
#      V = "XX_V_Y", W = "XX_V_Y", X = "XX_V_Y", Y = "XX_V_Y",
#      "Other"
#    )
#  }
#  parsed$chapter <- sapply(parsed$three, icd_chapter_of)
#  
#  fl_tree <- unique(rbind(
#    data.frame(node_id = "Root", parent_id = NA_character_,
#               stringsAsFactors = FALSE),
#    data.frame(node_id = unique(parsed$chapter), parent_id = "Root",
#               stringsAsFactors = FALSE),
#    unique(data.frame(node_id = parsed$three, parent_id = parsed$chapter,
#                      stringsAsFactors = FALSE)),
#    data.frame(
#      node_id   = parsed$code[!parsed$is_three_leaf],
#      parent_id = parsed$three[!parsed$is_three_leaf],
#      stringsAsFactors = FALSE
#    )
#  ))
#  
#  # Sanity check: every code in fl_deaths is a leaf of fl_tree
#  parents <- unique(fl_tree$parent_id[!is.na(fl_tree$parent_id)])
#  leaves  <- setdiff(fl_tree$node_id, parents)
#  stopifnot(length(setdiff(icd_codes, leaves)) == 0)
#  
#  c(total_nodes = nrow(fl_tree),
#    chapters    = length(unique(parsed$chapter)),
#    leaves      = length(leaves))

## ----fl-polygons--------------------------------------------------------------
#  library(sf)
#  library(tigris)
#  
#  fl_map <- counties(state = "FL", cb = TRUE, year = 2016)
#  fl_map <- st_transform(fl_map, 4326)
#  
#  centroids   <- st_centroid(st_geometry(fl_map))
#  coords      <- st_coordinates(centroids)
#  fl_map$x    <- coords[, "X"]
#  fl_map$y    <- coords[, "Y"]

## ----fl-vectors---------------------------------------------------------------
#  # Per-county population: same value across all leaves, so take any one
#  fl_pop <- aggregate(population ~ county_fips, data = fl_deaths,
#                       FUN = max)
#  
#  # Per-county centroids from the polygon layer
#  xy <- data.frame(county_fips = fl_map$GEOID,
#                    x = fl_map$x, y = fl_map$y,
#                    stringsAsFactors = FALSE)
#  
#  dat <- merge(fl_deaths[, c("county_fips", "icd10_code", "deaths")],
#               fl_pop, by = "county_fips")
#  dat <- merge(dat, xy, by = "county_fips")
#  
#  # Integer region_id is convenient downstream; here we use the rank
#  # of county_fips in alphabetical order
#  dat$region_id <- as.integer(factor(
#    dat$county_fips, levels = sort(unique(dat$county_fips))
#  ))
#  
#  # Keep only rows with at least one death (sparse-by-default scan)
#  dat <- dat[dat$deaths > 0, ]
#  
#  c(long_rows = nrow(dat),
#    counties  = length(unique(dat$region_id)))

## ----fl-scan------------------------------------------------------------------
#  result_fl <- treespatial_scan(
#    cases       = dat$deaths,
#    population  = dat$population,
#    region_id   = dat$region_id,
#    x           = dat$x,
#    y           = dat$y,
#    node_id     = dat$icd10_code,
#    tree        = fl_tree,
#    max_pop_pct = 0.05,
#    nsim        = 999, seed = 2016,
#    n_cores     = 4L
#  )
#  print(result_fl)

## ----fl-filter----------------------------------------------------------------
#  filter_clusters(result_fl)[1:5,
#                              c("node_id", "n_regions",
#                                "cases", "expected", "llr",
#                                "pvalue")]

## ----fl-seq-------------------------------------------------------------------
#  seq_fl <- sequential_scan(
#    cases       = dat$deaths,
#    population  = dat$population,
#    region_id   = dat$region_id,
#    x           = dat$x,
#    y           = dat$y,
#    node_id     = dat$icd10_code,
#    tree        = fl_tree,
#    max_iter    = 5L, alpha = 0.05,
#    nsim        = 999, seed = 2016,
#    max_pop_pct = 0.05, n_cores = 4L
#  )
#  print(seq_fl)

## ----fl-plot------------------------------------------------------------------
#  library(ggplot2)
#  library(dplyr)
#  
#  region_info <- unique(dat[, c("region_id", "county_fips")])
#  
#  cr_seq <- merge(
#    get_cluster_regions(seq_fl, overlap = TRUE),
#    region_info, by = "region_id"
#  )
#  
#  # Cross-join the polygon layer with every iteration panel so all
#  # counties appear in every panel (those outside the analysis
#  # dataset show as NA-coloured rather than as an extra empty panel)
#  panels       <- unique(cr_seq$panel)
#  cr_seq_keys  <- unique(cr_seq[, c("county_fips", "cluster",
#                                     "node_id", "llr", "pvalue",
#                                     "panel")])
#  fl_seq_plot <- merge(
#    do.call(rbind,
#            lapply(panels, function(p) cbind(fl_map, panel = p))),
#    cr_seq_keys,
#    by.x = c("GEOID", "panel"),
#    by.y = c("county_fips", "panel"),
#    all.x = TRUE
#  )
#  
#  ggplot(fl_seq_plot) +
#    geom_sf(aes(fill = factor(cluster)), color = "gray40",
#            linewidth = 0.12, alpha = 0.75) +
#    scale_fill_manual(
#      values = c("#C44E52", "#4C72B0", "#55A868",
#                 "#8172B2", "#CCB974"),
#      na.value = "gray95", na.translate = FALSE
#    ) +
#    facet_wrap(~ panel, nrow = 1) +
#    theme_minimal() +
#    theme(legend.position = "none")

