


#-----transformation of the multinomial distribution with theta -------------
#' @keywords internal
softmax <- function(logits) {
  # Map a vector of logits to probabilities that sum to one:
  #   p_i = exp(logits_i) / sum_j exp(logits_j)
  #
  # logits: numeric vector of unnormalised log-odds.
  # Returns a numeric vector of the same length as `logits`.

  # Nothing to normalise for an empty input.
  if (length(logits) == 0L) {
    return(numeric(0))
  }

  # Subtracting the largest logit leaves the result mathematically unchanged
  # but keeps exp() from overflowing to Inf (which would yield NaN after the
  # division). A non-finite maximum (Inf, -Inf, NA) cannot be used as a shift,
  # so in that case no shift is applied.
  shift <- max(logits)
  if (!is.finite(shift)) {
    shift <- 0
  }

  exp_shifted <- exp(logits - shift)
  exp_shifted / sum(exp_shifted)
}

#' Fitting binary regression with missing categorical covariates using new likelihood based method that does not require EM algorithm
#'@description This function allows users to fit logistic regression models with incomplete predictors that are categorical. The model is fitted using a new likelihood-based method, which ensures reliable parameter estimation even when dealing with missing data. For more information on the underlying methodology, please refer to Pradhan, Nychka, and Bandyopadhyay (2025).
#' @param par A numeric vector of the parameters to be estimated. It includes beta (the regression parameters) and theta, the multinomial parameters for observing a missing covariate pattern.
#' @param data Input data for fitting the model
#' @param formula A formula expression as for regression models, of the form \code{response ~ predictors}. The response should be a numeric binary variable with missing values, and predictors can be any variables. A categorical predictor with missing values can be used in the model. See the documentation of formula for other details.
#' @param augData An augmented data set including all possible covariates that could have been observed.
#' @param biasCorr a TRUE or FALSE value, by default it is TRUE.
#'
#' @return return the regression estimates
#' @export
#'
#' @references
#' Firth, D. (1993). Bias reduction of maximum likelihood estimates, Biometrika, 80, 27-38. doi:10.2307/2336755.
#'
#' Kosmidis, I., Firth, D. (2021). Jeffreys-prior penalty, finiteness and shrinkage in binomial-response generalized linear models. Biometrika, 108, 71-82. doi:10.1093/biomet/asaa052.
#'
#' Pradhan, V., Nychka, D. and Bandyopadhyay, S. (2025). Bridging Gaps in Logistic Regression: Tackling Missing Categorical Covariates with a New Likelihood Method (to be submitted).
#'
#' Pradhan, V., Nychka, D. and Bandyopadhyay, S. (2025). glmFitMiss: Binary Regression with Missing Data in R (to be submitted)

llkmiss <- function(par, data, formula, augData, biasCorr = TRUE) {
  # Negative log-likelihood of a logistic regression fitted on augmented data.
  #
  # par      : numeric vector. The first ncol(model matrix) entries are the
  #            regression coefficients; the following entries, except the
  #            last one, are logits of the pattern probabilities theta. The
  #            last entry of `par` is not read: the reference logit is fixed
  #            at 0.
  # data     : not used inside this function (kept for the calling interface).
  # formula  : model formula; its first variable is taken as the response.
  # augData  : list whose element `DF` is the augmented data frame. `DF` must
  #            provide the columns `m`, `grp` and `patternID` in addition to
  #            the variables of `formula`.
  # biasCorr : if TRUE, 0.5 * log det(X' W X) is added to the log-likelihood.
  #
  # Returns the negative log-likelihood, or Inf when it is not finite.
  df <- augData$DF

  # Factor columns are converted to the numbers spelled by their labels.
  factor_cols <- vapply(df, is.factor, logical(1))
  df[factor_cols] <- lapply(
    df[factor_cols],
    function(col) as.numeric(as.character(col))
  )

  # Design matrix built from the supplied formula.
  x <- model.matrix(formula, data = df)
  n_beta <- ncol(x)
  beta <- par[seq_len(n_beta)]

  # Free logits sit between the betas and the last entry of `par`.
  # seq_len() keeps the selection empty when there are none, whereas
  # (n_beta + 1):(length(par) - 1) would count backwards into the betas.
  n_logit <- max(length(par) - n_beta - 1L, 0L)
  logits <- par[n_beta + seq_len(n_logit)]
  theta <- softmax(c(logits, 0))  # trailing 0 is the reference category

  y <- df[[all.vars(formula)[1]]]
  grp <- df$grp

  # `m` is used as a row mask. Coercing to logical makes a 0/1 numeric
  # indicator select the flagged rows; used directly as a subscript, 0/1
  # values would be read as positions (dropping 0s and repeating row 1).
  is_miss <- as.logical(df$m)

  # Fitted success probabilities (inverse logit of the linear predictor).
  prob <- as.vector(1 / (1 + exp(-(x %*% beta))))

  # Probabilities are floored before taking logs to avoid log(0).
  epsilon <- 1e-10

  # Bernoulli log-likelihood summed over rows not flagged by `m`.
  row_loglik <- log(pmax(prob, epsilon)) * y +
    log(pmax(1 - prob, epsilon)) * (1 - y)
  loglike_obs <- sum(row_loglik * (!is_miss))

  # Rows flagged by `m`: within each `grp`, sum likelihood * theta[patternID]
  # over the rows of the group, then take the log and add up over groups.
  if (any(is_miss, na.rm = TRUE)) {
    weighted_lik <- prob^y * (1 - prob)^(1 - y) * theta[df$patternID]
    loglike_miss <- sum(
      log(tapply(weighted_lik[is_miss], grp[is_miss], sum))
    )
  } else {
    loglike_miss <- 0
  }

  # Firth-type penalty: 0.5 * log det(X' W X) with W = diag(prob * (1 - prob)).
  if (isTRUE(as.logical(biasCorr))) {
    w <- prob * (1 - prob)
    # x * w scales row i of x by w[i], so this equals t(x) %*% diag(w) %*% x
    # without materialising the n x n diagonal matrix.
    fisher <- crossprod(x * w, x)
    # The log-determinant is taken directly rather than via log(det(.)),
    # which can overflow or underflow. A non-positive determinant yields
    # NaN, as log(det(.)) would, and is caught by the finiteness check below.
    log_det <- determinant(fisher, logarithm = TRUE)
    log_det_fisher <- if (isTRUE(log_det$sign > 0)) {
      as.numeric(log_det$modulus)
    } else {
      NaN
    }
    loglike_obs <- loglike_obs + 0.5 * log_det_fisher
  }

  llk <- loglike_obs + loglike_miss

  # Signal an unusable parameter value with Inf so a minimiser moves away.
  if (!is.finite(llk)) {
    return(Inf)
  }

  -llk
}

