#' Polynomial Method for CSSEM
#'
#' @description
#' Implement the polynomial method for computing conditional standard errors of
#' measurement for scale scores (CSSEM). A polynomial regression of scale scores
#' on raw scores is fit for degrees 1 through \code{K}; for each degree \code{k},
#' the transformation derivative is used to map raw-score CSEM values to
#' scale-score CSSEM values.
#'
#' @param csemx A data frame or matrix containing raw scores and their CSEM on
#'   the raw-score metric. It must have at least the following numeric columns:
#'   \itemize{
#'     \item \code{x}: raw scores,
#'     \item \code{csem}: conditional standard errors of measurement on the
#'       raw-score metric.
#'   }
#' @param ct A data frame or matrix containing the score conversion table. It
#'   must have at least the following numeric columns:
#'   \itemize{
#'     \item \code{x}: raw scores (matching those in \code{csemx}),
#'     \item \code{ss}: scale scores corresponding to each raw score.
#'   }
#' @param K Integer. Highest polynomial degree to fit. Defaults to \code{10}.
#' @param gra Logical. If \code{TRUE}, a plot of the fitted polynomial curve
#'   and the observed conversion points is produced for each degree \code{k}.
#
#' @details
#' At the beginning of the function, \code{csemx} and \code{ct} are merged by
#' the \code{x} column (inner join) to create an internal data frame . Only
#' rows with \code{x} values present in both inputs are
#' used. The polynomial model is then fit to \code{ss ~ poly(x, k, raw = TRUE)}
#' for \code{k = 1, \dots, K}.
#'
#' @return A list with two components:
#' \describe{
#'   \item{rsquared}{A matrix with one column containing the R-squared values
#'     from polynomial fits of degree \code{k = 1, \dots, K}, where
#'     \code{K} is the largest successfully fitted degree.}
#'   \item{cssempoly}{A data frame containing the merged data
#'     (\code{x}, \code{csem}, \code{ss}) and, for each degree \code{k},
#'     the additional columns:
#'     \itemize{
#'       \item \code{fx_k1}, \code{fx_k2}, \dots: transformation derivatives
#'         \eqn{f'_k(x)} for each raw score,
#'       \item \code{ss_k1}, \code{ss_k2}, \dots: fitted (rounded) scale scores
#'         from the polynomial of degree \code{k},
#'       \item \code{cssem_k1}, \code{cssem_k2}, \dots: CSSEM values on
#'         the scale-score metric, computed as \eqn{f'_k(x)\,\mathrm{CSEM}_x}.
#'     }}
#' }
#'
#' @examples
#' data(ct.u)
#' cssem_polynomial(as.data.frame(csem_lord(40)), ct.u, K = 4, gra = TRUE)
#'
#' @import ggplot2
#' @importFrom rlang .data
#'
#' @export
cssem_polynomial <- function(csemx, ct, K = 10, gra = TRUE) {

  # ---- check csemx ----------------------------------------------------------
  if (!is.data.frame(csemx) && !is.matrix(csemx)) {
    stop("`csemx` must be a data frame or matrix.")
  }
  csemx <- as.data.frame(csemx)

  req_csemx <- c("x", "csem")
  miss_csemx <- setdiff(req_csemx, names(csemx))
  if (length(miss_csemx) > 0L) {
    stop("The following required columns are missing from `csemx`: ",
         paste(miss_csemx, collapse = ", "))
  }
  for (nm in req_csemx) {
    if (!is.numeric(csemx[[nm]])) {
      stop("Column `", nm, "` in `csemx` must be numeric.")
    }
  }

  # Store original row count BEFORE merging
  n_raw <- nrow(csemx)

  # ---- check ct -------------------------------------------------------------
  if (!is.data.frame(ct) && !is.matrix(ct)) {
    stop("`ct` must be a data frame or matrix.")
  }
  ct <- as.data.frame(ct)

  req_ct <- c("x", "ss")
  miss_ct <- setdiff(req_ct, names(ct))
  if (length(miss_ct) > 0L) {
    stop("The following required columns are missing from `ct`: ",
         paste(miss_ct, collapse = ", "))
  }
  for (nm in req_ct) {
    if (!is.numeric(ct[[nm]])) {
      stop("Column `", nm, "` in `ct` must be numeric.")
    }
  }

  # ---- merge by x to form cssemDat ------------------------------------------
  cssemDat <- merge(csemx, ct, by = "x", all = FALSE)

  if (n_raw != nrow(cssemDat)) {
    warning(
      "Number of rows reduced after merging by `x`. Only raw scores ",
      "present in both `csemx` and `ct` are used."
    )
  }

  # keep only required + any extra columns
  required_cols <- c("x", "csem", "ss")
  # ensure required are numeric
  for (nm in required_cols) {
    if (!is.numeric(cssemDat[[nm]])) {
      stop("Column `", nm, "` in merged data must be numeric.")
    }
  }

  # drop rows with missing values in required columns
  keep <- stats::complete.cases(cssemDat[, required_cols])
  if (!all(keep)) {
    cssemDat <- cssemDat[keep, , drop = FALSE]
  }

  if (!is.numeric(K) || length(K) != 1L || is.na(K)) {
    stop("`K` must be a single numeric value.")
  }
  K <- as.integer(K)
  if (K < 1L) {
    stop("`K` must be at least 1.")
  }

  gra <- isTRUE(gra)

  x_vals <- cssemDat$x
  csem_x <- cssemDat$csem
  n <- nrow(cssemDat)

  # storage for R^2
  r2_vec <- rep(NA_real_, K)
  k_last <- 0L

  ## ---- iterate over polynomial degrees k = 1..K -----------------------------
  for (k in seq_len(K)) {

    # fit polynomial of degree k: ss ~ poly(x, k, raw = TRUE)
    fit_k <- stats::lm(ss ~ stats::poly(x, degree = k, raw = TRUE),
                       data = cssemDat)

    summ_k <- summary(fit_k)
    r2_vec[k] <- summ_k$r.squared

    # regression coefficients: intercept + k polynomial terms
    coef_k <- stats::coef(fit_k)
    # coef_k[1] = intercept, coef_k[2:(k+1)] = beta_1..beta_k
    if (length(coef_k) < (k + 1L) || any(is.na(coef_k[1:(k + 1L)]))) {
      message("The maximum k accepted is ", k - 1L, ".")
      break
    }

    k_last <- k

    # optional plot -----------------------------------------------------------
    if (gra) {
      prd <- data.frame(
        x = seq(from = min(x_vals), to = max(x_vals), length.out = 100L)
      )
      prd$predictedSS <- stats::predict(fit_k, newdata = prd)

      p <- ggplot2::ggplot(prd, ggplot2::aes(x = .data$x, y = .data$predictedSS)) +
        ggplot2::theme_bw() +
        ggplot2::geom_line() +
        ggplot2::geom_point(
          data = cssemDat,
          ggplot2::aes(x = .data$x, y = .data$ss)
        ) +
        ggplot2::scale_y_continuous(name = "Scale Score") +
        ggplot2::scale_x_continuous(name = "Raw Score") +
        ggplot2::ggtitle(
          paste0("Polynomial Method Fitted Line with k = ", k)
        ) +
        ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))

      print(p)
    }

    ## ---- compute transformation derivative f'(x) ---------------------------
    # Model:   SS_hat = beta0 + beta1 * x + ... + beta_k * x^k
    # Deriv: f'(x)  = beta1 + 2*beta2 * x + ... + k * beta_k * x^{k-1}
    beta <- coef_k[-1L]          # length k
    deg  <- length(beta)         # should equal k

    # matrix of x^(j-1), j = 1..k  => exponents 0:(k-1)
    pow_mat_deriv <- outer(x_vals, 0:(deg - 1L), `^`)
    # coefficients j * beta_j, j = 1..k
    deriv_coef <- beta * seq_len(deg)
    # f'(x) for each observed x
    fx <- as.numeric(pow_mat_deriv %*% deriv_coef)

    ## ---- compute fitted scale scores from polynomial -----------------------
    # SS_hat = beta0 + sum_{j=1}^k beta_j x^j
    pow_mat_fit <- outer(x_vals, seq_len(deg), `^`)
    SS_hat <- as.numeric(coef_k[1L] + pow_mat_fit %*% beta)
    SS_hat_round <- round(SS_hat)

    ## ---- compute CSSEM on scale metric -------------------------------------
    cssem_poly <- fx * csem_x

    # check for negative transformation coefficients
    neg_idx <- fx < 0
    if (any(neg_idx)) {
      message(
        "Negative transformation coefficient exists when k = ", k,
        ". The corresponding CSSEM will be set to NA."
      )
      cssem_poly[neg_idx] <- NA_real_
    }

    # attach columns for this k
    cssemDat[[paste0("fx_k", k)]]        <- fx
    cssemDat[[paste0("ss_k", k)]]        <- SS_hat_round
    cssemDat[[paste0("cssem_k", k)]] <- cssem_poly
  }

  ## ---- prepare output ------------------------------------------------------
  if (k_last == 0L) {
    warning("No polynomial model was successfully fitted (k_last = 0).")
    RSq_mat <- matrix(NA_real_, ncol = 1L, dimnames = list(NULL, "R2"))
  } else {
    RSq_mat <- matrix(r2_vec[seq_len(k_last)], ncol = 1L)
    colnames(RSq_mat) <- "R2"
  }

  return(list(
    rsquared  = RSq_mat,
    cssempoly = cssemDat
  ))
}
