#' glsm.R
#'
#' @title Saturated Model Log-Likelihood for Multinomial Outcomes
#'
#' @description
#' When the response variable \code{Y} takes one of \code{R > 1} values, the function \code{"glsm()"} computes the maximum likelihood estimates (MLEs) of the parameters under four models: null, complete, saturated, and logistic. It also calculates the log-likelihood values for each model.
#'
#' The method assumes independent, non-identically distributed variables. For grouped data with a multinomial outcome variable, where the observations are divided into \code{J} populations, the function '\code{"glsm()"} offers reliable estimation for any number \code{K} of explanatory variables.
#'
#' @param formula An object of class \code{"formula"} (or one that can be coerced to that class): a symbolic description of the model to be fitted. See 'Details' for more information on model specification.
#' @param ref Optional character string indicating the reference level of the response variable. If not specified, the first level is used by default.
#' @param data An optional data frame, list, or environment (or object coercible via \code{as.data.frame}) containing the variables in the model. If variables are not found in \code{data}, they are taken from \code{environment(formula)}, typically the environment from which \code{glsm()} is called.
#'
#' @return An object of class \code{"glsm"}, which is a list containing at least the following components:
#'
#' \item{coefficients}{Vector of estimated coefficients, including intercepts and slopes.}
#'
#' \item{coef}{Alias for \code{coefficients}. Returns the same vector of estimated intercepts and slopes.}
#'
#' \item{Std.Error}{Vector of standard errors for the estimated coefficients (intercepts and slopes).}
#'
#' \item{ExpB}{Vector containing the exponentiated coefficients (i.e., \code{exp(beta)}) for interpretation as odds ratios.}
#'
#' \item{Wald}{Wald test statistic used to assess the significance of each coefficient (assumed to follow a chi-squared distribution).}
#'
#' \item{DF}{Degrees of freedom associated with the Wald test's chi-squared distribution.}
#'
#' \item{P.value}{P-values corresponding to the Wald test statistics.}
#'
#' \item{Log_Lik_Complete}{Log-likelihood value of the complete model.}
#'
#' \item{Log_Lik_Null}{Log-likelihood value of the null model.}
#'
#' \item{Log_Lik_Logit}{Log-likelihood value of the logistic model.}
#'
#' \item{Log_Lik_Saturate}{Log-likelihood value of the saturated model.}
#'
#' \item{Populations}{Number of populations considered in the saturated model.}
#'
#' \item{Dev_Null_vs_Logit}{Deviance statistic comparing the null and logistic models.}
#'
#' \item{Dev_Logit_vs_Complete}{Deviance statistic comparing the logistic and complete models.}
#'
#' \item{Dev_Logit_vs_Saturate}{Deviance statistic comparing the logistic and saturated models.}
#'
#' \item{Df_Null_vs_Logit}{Degrees of freedom for the deviance test comparing the null and logistic models.}
#'
#' \item{Df_Logit_vs_Complete}{Degrees of freedom for the deviance test comparing the logistic and complete models.}
#'
#' \item{Df_Logit_vs_Saturate}{Degrees of freedom for the deviance test comparing the logistic and saturated models.}
#'
#' \item{P.v_Null_vs_Logit}{P-value for the hypothesis test comparing the null and logistic models.}
#'
#' \item{P.v_Logit_vs_Complete}{P-value for the hypothesis test comparing the logistic and complete models.}
#'
#' \item{P.v_Logit_vs_Saturate}{P-value for the hypothesis test comparing the logistic and saturated models.}
#'
#' \item{Logit_r}{Matrix of log-odds values, with respect to the reference category \code{r} of the outcome variable \code{Y}.}
#'
#' \item{p_hat_complete}{Vector of probabilities that the outcome variable takes the value 1, given the \code{j}th population (estimated from the complete model, excluding the logistic model).}
#'
#' \item{p_hat_null}{Vector of probabilities that the outcome variable takes the value 1, given the \code{j}th population (estimated from the null model, excluding the logistic model).}
#'
#' \item{p_rj}{Matrix containing the estimated values of each \code{prj}, the probability that the outcome variable takes the value \code{r}, given the \code{j}th population (estimated using the logistic model).}
#'
#' \item{odd}{Vector containing the odds for each \code{j}th population.}
#'
#' \item{OR}{Vector containing the odds ratios for each variable's coefficient.}
#'
#' \item{z_rj}{Vector containing the values of each \code{Zrj}, defined as the sum of observations in the \code{j}th population.}
#'
#' \item{nj}{Vector containing the number of observations (\code{nj}) in each \code{j}th population.}
#'
#' \item{p_rj_tilde}{Vector containing the estimated values of each \code{prj}, the probability that the outcome variable takes the value \code{r}, given the \code{j}th population (estimated under the saturated model, without estimating logistic parameters).}
#'
#' \item{v_rj}{Vector of variances of the Bernoulli variables in the \code{j}th population and category \code{r}.}
#'
#' \item{m_rj}{Vector of expected values of \code{Zj} in the \code{j}th population and category \code{r}.}
#'
#' \item{V_rj}{Vector of variances of \code{Zj} in the \code{j}th population and category \code{r}.}
#'
#' \item{V}{Variance–covariance matrix of \code{Z}, the vector containing all \code{Zj} values.}
#'
#' \item{S_p}{Score vector computed under the saturated model.}
#'
#' \item{I_p}{Fisher information matrix under the saturated model.}
#'
#' \item{Zast_j}{Vector of standardized values for the variable \code{Zj}.}
#'
#' \item{mcov}{Variance–covariance matrix of the coefficient estimates.}
#'
#' \item{mcor}{Correlation matrix of the coefficient estimates.}
#'
#' \item{Esm}{Estimated Saturated Matrix. A data frame containing estimates from the saturated model. For each population \code{j}, it includes the values of the explanatory variables, \code{nj}, \code{Zrj}, \code{prj_tilde}, and the log-likelihood \code{Lp_tilde}.}
#'
#' \item{Elm}{Estimated Logit Matrix. A data frame containing estimates from the logistic model. For each population \code{j}, it includes the values of the explanatory variables, \code{nj}, \code{Zrj}, \code{prj}, the logit transformation \code{Logit_rj}, and the variance of the logit (\code{var_logit_rj}).}
#'
#' \item{call}{The original function call used to fit the glsm model.}
#' @encoding UTF-8
#'
#' @details
#' An expression of the form \code{y ~ model} is interpreted as a specification that the response variable \code{y} is modeled by a linear predictor, symbolically defined by \code{model} (the systematic component). The model consists of terms separated by \code{+} operators. Each term can include variable or factor names, and interactions between variables are denoted by \code{:}. Such a term represents the interaction of all included variables and factors. In this context, \code{y} is the outcome variable, which may be binary or polychotomous.
#'
#' @references
#' Hosmer, D., Lemeshow, S., & Sturdivant, R. (2013). *Applied Logistic Regression* (3rd ed.). New York: Wiley. ISBN: 978-0-470-58247-3
#' Llinás, H. (2006). Precisiones en la teoría de los modelos logísticos. *Revista Colombiana de Estadística*, 29(2), 239–265.
#' Llinás, H., & Carreño, C. (2012). The Multinomial Logistic Model for the Case in Which the Response Variable Can Assume One of Three Levels and Related Models. *Revista Colombiana de Estadística*, 35(1), 131–138.
#' Orozco, E., Llinás, H., & Fonseca, J. (2020). Convergence theorems in multinomial saturated and logistic models. *Revista Colombiana de Estadística*, 43(2), 211–231.
#' Llinás, H., Arteta, M., & Tilano, J. (2016). El modelo de regresión logística para el caso en que la variable de respuesta puede asumir uno de tres niveles: estimaciones, pruebas de hipótesis y selección de modelos. *Revista de Matemática: Teoría y Aplicaciones*, 23(1), 173–197.
#'
#' @author
#' Humberto Llinás (Universidad del Norte, Barranquilla-Colombia; author),
#' Jorge Villalba (Universidad Tecnológica de Bolívar, Cartagena-Colombia; author and creator),
#' Jorge Borja (Universidad del Norte, Barranquilla-Colombia; author and creator),
#' Jorge Tilano (Universidad del Norte, Barranquilla-Colombia; author)
#'
#' @examples
#' library(glsm)
#' data("hsbdemo", package = "glsm")
#' model <- glsm(prog ~ ses + gender, data = hsbdemo, ref = "academic")
#' model
#'
#'
#' @export
#'

glsm <- function(formula, data, ref = NaN) {

  if (!inherits(formula, "formula")) stop("'formula' must be a valid formula.")
  if (!is.data.frame(data)) stop("'data' must be a data frame.")


  xdata <- data
  mf <- model.frame(formula = formula, data = xdata)

  predictors <- colnames(mf)[-1]

  n_data <- as.data.frame(mf)
  if (length(unique(n_data[[1]])) < 3) {
    stop("The dependent variable must have 3 or more levels.\n\nIf you are trying to perform a dichotomous logistic regression model,\nI recommend using the lsm() function from the package of the same name.")
  }

  lvs <- levels(as.factor(n_data[[1]]))
  rw <- nrow(n_data)
  means <- list()

  for (i in lvs){
    n_data[paste0("u_", i)] <- ifelse(n_data[, 1] == i, 1, 0)
  }

  # -----------------------------------------
  #                Null model
  #------------------------------------------

  means_u <- colMeans(n_data[, grepl("^u_", names(n_data))])

  p_u <- means_u

  l <- list()

  for (i in 1:length(means_u)){
    l[[i]] <- means_u[i] * log(p_u[i])
  }

  l <- rw * sum(unlist(l))
  Log_Lik_Null <- l

  # -----------------------------------------
  #             Complete model
  #------------------------------------------

  l <- list()

  for (i in 1:length(lvs)){
    u <- n_data[, grepl("^u_", names(n_data))][i]
    l[[i]] <- ifelse(u == 0, 0, u * log(u))
  }

  l <- sum(unlist(l), na.rm = T)
  Log_Lik_Complete <- l

  # -----------------------------------------
  #             Saturated model
  #------------------------------------------

  ff <- count(data, vars = c(names(mf)[-1]))
  names(ff)[ncol(ff)] <- c("n")
  J <- nrow(ff)

  aa <- split(mf,mf[,names(mf)[1]])
  bb <- lapply(aa, function(x) count(x, vars = c(colnames(x)[-1])))

  for (i in 1:length(bb)) {
    names(bb[[i]])[ncol(bb[[i]])] <- c(paste0("z_", names(bb[i]), "_j"))
  }

  for(i in 1:length(bb)) {
    bb[[i]] <- join(bb[[i]], ff, by = names(mf)[-1]) # join the counts with the total counts
    bb[[i]][paste0("p_", names(bb[i]), "_j_tilde")] <- bb[[i]][paste0("z_", names(bb[i]), "_j")]/bb[[i]]["n"] # calculate the proportions
  }

  tb <- as.data.frame(bb[[1]])
  tb <- tb[, c(1:(ncol(tb) - 3), ncol(tb) - 1, ncol(tb) - 2, ncol(tb))]

  for(i in bb[-1]){
    tb <- join(tb, i, by = c(names(mf)[-1], "n"), type = "full")
  }

  tb[is.na(tb)] <- 0
  nc <- length(names(mf)[-1]) + 2
  pos <- 0
  l <- numeric(length(bb))

  tb <- as.data.frame(tb)

  for (i in 1:(length(bb))) {
    tb[paste0("l_", names(bb[i]))] <- ifelse(tb[, nc + pos + 1] == 0 | tb[, nc + pos] == 0, 0, tb[, nc + pos] * log(tb[, nc + pos + 1]))
    pos <- pos + 2
  }

  tb["Lp"] <- apply(tb[, grep("^l_", names(tb))], 1, function(x) {
    if(0 %in% x){
      return(0)
    } else{
      return(sum(x))
    }
  })

  tb <- tb[, -grep("^l_", names(tb))]

  l <- sum(tb$Lp)
  Log_Lik_Saturate <- l

  Populations <- J
  Saturate_Table <- tb
  Saturate_List <- bb

  z_rj <- tb[, grep("^z_", names(tb))]
  nj <- tb[, 'n']
  p_rj_tilde <- tb[, grep("^p_", names(tb))]

  tb  <- tb[, c(names(tb[predictors]) ,"n" , names(z_rj), names(p_rj_tilde),"Lp")]

  zast_j <- scale(z_rj, center = TRUE, scale = TRUE)
  colnames(zast_j) <- c(paste0("z_", lvs, "_j_ast"))

  # -----------------------------------------
  #           Model parameters
  #------------------------------------------

  lvs_t <- lvs[-match(ifelse(is.na(ref), lvs[1], ref), lvs)]

  formula_str <- as.formula(paste(as.character(formula)[-1], collapse = " ~ "))
  ref_lvl <- match(ifelse(is.na(ref), lvs[1], ref), lvs)

  model <- vglm(
    formula_str,
    multinomial(refLevel = ref_lvl),
    data = data
  )

  Log_Lik_Logit <- -deviance(model)/2

  coef <- coef(model)

  for (i in seq_along(lvs_t)) {
    names(coef) <- gsub(paste0(":", i), paste0(":", lvs_t[i]), names(coef))
  }

  coefficients <- coef
  ExpB <- exp(coefficients)

  Std.Error <- sqrt(diag(vcov(model)))
  for (i in seq_along(lvs_t)) {
    names(Std.Error) <- gsub(paste0(":", i), paste0(":", lvs_t[i]), names(Std.Error))
  }

  Wald <- (coefficients/Std.Error)^2
  DF <- rep(1, length(coef))
  P.value <- pchisq(Wald, DF, lower.tail = F)

  Dev_Null_vs_Logit <- 2 * (Log_Lik_Logit - Log_Lik_Null)
  Dev_Logit_vs_Complete <- -2 * Log_Lik_Logit
  Dev_Logit_vs_Saturate <- 2 * (Log_Lik_Saturate - Log_Lik_Logit)

  K <- length(lvs)
  Df_Null_vs_Logit <- 2 * (1 + K) - 2
  Df_Logit_vs_Complete <- 2 * (rw - (1 + K))
  Df_Logit_vs_Saturate <- 2 * (J - (1 + K))

  P.v_Null_vs_Logit <- pchisq(Dev_Null_vs_Logit, Df_Null_vs_Logit, lower.tail = F)
  P.v_Logit_vs_Complete <- pchisq(Dev_Logit_vs_Complete, Df_Logit_vs_Complete, lower.tail = F)
  P.v_Logit_vs_Saturate <- pchisq(Dev_Logit_vs_Saturate, Df_Logit_vs_Saturate, lower.tail = F)

  p_rj <- predict(model, newdata = tb[predictors], type = 'response')

  p_ref <- p_rj[, which(colnames(p_rj) %in% lvs[ref_lvl])]
  odd_p <- p_rj[, setdiff(colnames(p_rj), lvs[ref_lvl])]
  odds <- odd_p / p_ref

  logit_p <- log(odds)
  colnames(logit_p) <- paste0("logit_p_", lvs_t, "_j")

  or <- exp(coef)

  colnames(p_rj) <- paste0("p_", lvs, "_j")

  ltb <- tb[predictors]

  ltb$n <- tb$n
  ltb[colnames(tb[grepl("^z_", names(tb))])] <- tb[grepl("^z_", names(tb))]
  ltb[colnames(p_rj)] <- p_rj
  ltb[colnames(logit_p)] <- logit_p

  m_rj <- ltb[, 'n'] * ltb[, grepl("^p_", names(ltb))]
  colnames(m_rj) <- paste0("m_", lvs, "_j")

  v_rj <- ltb[, grepl("^p_", names(ltb))] * (1 - ltb[, grepl("^p_", names(ltb))])
  colnames(v_rj) <- paste0("v_", lvs, "_j")
  ltb[colnames(v_rj)] <- v_rj

  V_rj <- ltb$n * v_rj

  S_p <- ((tb$n * (p_rj_tilde  - p_rj)))/v_rj
  colnames(S_p) <- paste0("S_", lvs, "(p)")

  cov_m <- vcov(model)

  for (i in seq_along(lvs_t)) {
    row.names(cov_m) = colnames(cov_m) <- gsub(paste0(":", i), paste0(":", lvs_t[i]), row.names(cov_m))
  }

  logi <- list(
    data = n_data,
    coefficients = coefficients,
    coef = coef,
    Std.Error = Std.Error,
    ExpB = as.matrix(ExpB),
    Wald = as.matrix(Wald),
    DF = as.matrix(DF),
    P.value = as.matrix(P.value),
    Log_Lik_Complete = Log_Lik_Complete,
    Log_Lik_Null = Log_Lik_Null,
    Log_Lik_Saturate = Log_Lik_Saturate,
    Log_Lik_Logit = Log_Lik_Logit,
    Populations = Populations,
    Dev_Null_vs_Logit = Dev_Null_vs_Logit,
    Dev_Logit_vs_Complete = Dev_Logit_vs_Complete,
    Dev_Logit_vs_Saturate = Dev_Logit_vs_Saturate,
    Df_Null_vs_Logit = Df_Null_vs_Logit,
    Df_Logit_vs_Complete = Df_Logit_vs_Complete,
    Df_Logit_vs_Saturate = Df_Logit_vs_Saturate,
    P.v_Null_vs_Logit = P.v_Null_vs_Logit,
    P.v_Logit_vs_Complete = P.v_Logit_vs_Complete,
    P.v_Logit_vs_Saturate = P.v_Logit_vs_Saturate,
    Logit_r = logit_p,
    p_hat_complete = n_data[, grepl("^u_", names(n_data))],
    p_hat_null = p_u,
    p_rj = as.matrix(p_rj),
    odd = odds,
    OR = or,
    z_rj = as.matrix(z_rj),
    nj = as.matrix(nj),
    p_rj_tilde = as.matrix(p_rj_tilde),
    v_rj = as.matrix(v_rj),
    m_rj = as.matrix(m_rj),
    V_rj = as.matrix(V_rj),
    V = cov(z_rj),
    S_p = as.matrix(S_p),
    I_p = cov(S_p),
    Zast_j = zast_j,
    mcov = cov_m,
    mcor = as.matrix(cov2cor(cov_m)),
    Esm = tb,
    Elm = ltb,
    call = match.call()
  )

  class(logi) <- "glsm"

  return(logi)
}


#' @export


print.glsm <- function(x, ...) {
  TB <- cbind(x$coefficients, x$Std.Error,  x$ExpB)
  colnames(TB) <- c("Coef(B)", "Std.Error", "Exp(B)")

  cat("\nCall:\n")
  print(x$call)

  cat("\nPopulations in Saturated Model: ", x$Populations, "\n", sep = "")

  cat("\nCoefficients: \n",  sep = "")

  if(anyNA(x$coef) == TRUE){
    cat("(", sum(is.na(x$coef)), " not defined because of singularities)\n", sep = "")
  }

  print(TB, P.values = TRUE, has.Pvalue = TRUE)

  cat("\nLog Likelihood: \n")
  LL <- cbind(x$Log_Lik_Complete, x$Log_Lik_Null, x$Log_Lik_Logit, x$Log_Lik_Saturate)
  dimnames(LL) <- list("Estimation", c("Complete", "Null", "Logit", "Saturate"))
  print(t(LL))

  if(anyNA(unlist(x$data))==TRUE){
    cat("(",nrow(x$data) - nrow(na.omit(x$data)) , " observations deleted due to missingness)\n", sep = "")
  }
}
