#' @title Split and Coalesce Duplicated Records
#' @description Deduplicates datasets by retaining the most complete and
#'   informative records. Identifies duplicated entries based on a specified
#'   key column, calculates a completeness score for each row, and compares
#'   values within each group of duplicates. When the differences between a
#'   duplicate and the most complete record of its group reach a user-defined
#'   threshold, the record is split off under a new ID; otherwise it is
#'   coalesced into a single, most complete entry. Useful for cleaning survey
#'   or administrative data where duplicated IDs may reflect minor data entry
#'   inconsistencies.
#' @details This function:
#'   1. Computes a completeness percentage (share of non-missing data columns,
#'      rounded to one decimal) for each record.
#'   2. Sorts records by key and descending completeness; the first record of
#'      each key group is the base record for that group.
#'   3. For every other record of a duplicated key, computes the proportion of
#'      comparable fields (non-missing in both the record and the base record)
#'      whose values differ from the base record.
#'     - Records whose proportion is greater than or equal to `diff_cutoff`
#'       are split off with a new ID of the form `<key>-0001`, `<key>-0002`,
#'       ...
#'     - All other records are merged, taking for each column the first
#'       non-NA value in completeness order.
#'
#'   A record with no comparable fields has a difference proportion of 0, so
#'   it is only split when `diff_cutoff` is 0 or less. Any grouping on `df`
#'   is dropped before processing.
#'
#'   An error is raised when `df` is not a data frame, `key_col` is not a
#'   single existing column name, `diff_cutoff` is not a single non-missing
#'   number, or `df` already contains one of the columns the function creates
#'   (`.row_id`, `completeness_pct`, `new_id`, `final_id`).
#'
#' @param df A data frame or tibble.
#' @param key_col Character. Name of the column to identify duplicates.
#' @param diff_cutoff Numeric between 0 and 1. Proportion of comparable fields
#'   that must differ for a duplicated record to be split into its own ID.
#'   Differences below this cutoff are treated as acceptable and those rows
#'   stay merged under the original key. Defaults to 0.5 (50 percent).
#'
#' @return A named list with three data frames:
#' \describe{
#'   \item{\code{duplicates_df}}{All rows whose key occurs more than once,
#'     ordered by key and descending completeness. Contains the original
#'     columns plus \code{.row_id} (original row position) and
#'     \code{completeness_pct}.}
#'   \item{\code{split_df}}{Rows split off because they reached
#'     \code{diff_cutoff}. Contains the original columns plus
#'     \code{final_id}, the newly assigned ID.}
#'   \item{\code{coalesced_df}}{The final deduplicated data, one row per
#'     \code{final_id} (a character column holding either the original key or
#'     a split ID) followed by the coalesced data columns. The original key
#'     column is not included.}
#' }
#'
#' @examples
#' # Create a small sample with real duplicates
#' df <- data.frame(
#'   id    = c(1, 1, 2, 2, 3, 4, 4),
#'   value = c(10, 10, NA, 20, 5, 3, 3),
#'   tag   = c("A", "A", NA, "B", "C", "X", NA),
#'   stringsAsFactors = FALSE
#' )
#'
#' # Run pickmax with default diff_cutoff (50%)
#' res <- pickmax(df, key_col = "id", diff_cutoff = 0.5)
#'
#' # Show the duplicates flagged
#' print(res$duplicates_df)
#'
#' # Show records that got split per diff_cutoff
#' print(res$split_df)
#'
#' # Show final cleaned dataset
#' print(res$coalesced_df)
#'
#' @importFrom magrittr %>%
#' @importFrom dplyr mutate filter arrange select row_number group_by first desc semi_join left_join across all_of ungroup summarise coalesce n
#' @importFrom rlang .data
#' @export
pickmax <- function(df, key_col = "id", diff_cutoff = 0.5) {
  # ---- Input validation ----------------------------------------------------
  # Done with base R only so that bad input fails immediately with a clear
  # message instead of an obscure error from deep inside a dplyr verb.
  if (!is.data.frame(df)) {
    stop("`df` must be a data frame or tibble.", call. = FALSE)
  }
  if (!is.character(key_col) || length(key_col) != 1L || is.na(key_col)) {
    stop("`key_col` must be a single column name (character string).",
         call. = FALSE)
  }
  if (!key_col %in% names(df)) {
    stop("`key_col` '", key_col, "' is not a column of `df`.", call. = FALSE)
  }
  if (!is.numeric(diff_cutoff) || length(diff_cutoff) != 1L ||
      is.na(diff_cutoff)) {
    stop("`diff_cutoff` must be a single non-missing number.", call. = FALSE)
  }
  # These columns are created by the function and appear in its outputs; an
  # input column with the same name would be overwritten or break the joins.
  reserved <- c(".row_id", "completeness_pct", "new_id", "final_id")
  clashes <- intersect(names(df), reserved)
  if (length(clashes) > 0L) {
    stop("`df` contains reserved column name(s): ",
         paste(clashes, collapse = ", "), ".", call. = FALSE)
  }

  # Row positions must be unique across the whole table, so any grouping on
  # the input is dropped first.
  df <- ungroup(df)

  data_cols <- setdiff(names(df), key_col)
  num_vars  <- length(data_cols)

  # ---- Completeness score --------------------------------------------------
  # Vectorised count of non-missing data fields per row. With no data columns
  # the score is defined as 0 rather than the NaN that 0 / 0 would produce.
  n_present <- unname(rowSums(!is.na(select(df, all_of(data_cols)))))
  completeness <- if (num_vars > 0L) {
    round(n_present / num_vars * 100, 1)
  } else {
    rep(0, nrow(df))
  }

  # Assigned with `[[<-` (not mutate) so that a user column that happens to
  # share a local variable's name cannot mask it.
  df2 <- df
  df2[[".row_id"]] <- seq_len(nrow(df))
  df2[["completeness_pct"]] <- completeness

  # ---- Duplicates ----------------------------------------------------------
  key_values <- df2[[key_col]]
  dup_flags <- duplicated(key_values) | duplicated(key_values, fromLast = TRUE)

  # `!!` injects the local value, bypassing data masking.
  duplicates_df <- df2 %>%
    filter(!!dup_flags) %>%
    arrange(.data[[key_col]], desc(.data$completeness_pct))

  df_sorted <- df2 %>%
    arrange(.data[[key_col]], desc(.data$completeness_pct))

  # ---- Compare each duplicate with the base record of its group ------------
  # Helper columns carry a `.pm_` prefix so they cannot overwrite a user data
  # column that `across(all_of(data_cols))` is about to read.
  comparisons <- df_sorted %>%
    group_by(across(all_of(key_col))) %>%
    filter(n() > 1L) %>%
    mutate(
      .pm_is_base = .data$.row_id == first(.data$.row_id),
      # Fields that are non-missing in both this row and the base row.
      .pm_comparable = rowSums(
        across(all_of(data_cols),
               ~ !is.na(.x) & !is.na(first(.x))),
        na.rm = TRUE
      ),
      # Comparable fields whose value differs from the base row.
      .pm_differing = rowSums(
        across(all_of(data_cols),
               ~ !is.na(.x) & !is.na(first(.x)) & .x != first(.x)),
        na.rm = TRUE
      ),
      .pm_diff_pct = ifelse(.data$.pm_comparable > 0,
                            .data$.pm_differing / .data$.pm_comparable,
                            0)
    ) %>%
    ungroup()

  # ---- Assign new IDs to records that differ too much ----------------------
  splits <- comparisons %>%
    filter(!.data$.pm_is_base, .data$.pm_diff_pct >= !!diff_cutoff) %>%
    group_by(across(all_of(key_col))) %>%
    mutate(
      new_id = paste0(.data[[key_col]], "-", sprintf("%04d", row_number()))
    ) %>%
    ungroup() %>%
    select(all_of(c(".row_id", "new_id")))

  helper_cols <- c(".row_id", "completeness_pct", "new_id")
  with_new_id <- left_join(df_sorted, splits, by = ".row_id")

  # Split rows are exactly those that received a new ID in the join.
  split_df <- with_new_id %>%
    filter(!is.na(.data$new_id)) %>%
    mutate(final_id = .data$new_id) %>%
    select(-all_of(helper_cols))

  df_final <- with_new_id %>%
    mutate(
      final_id = coalesce(.data$new_id, as.character(.data[[key_col]]))
    ) %>%
    select(-all_of(helper_cols))

  # ---- Coalesce ------------------------------------------------------------
  # Rows are still in descending-completeness order within each key, so the
  # first non-NA value of a column comes from the most complete record.
  first_non_na <- function(x) {
    idx <- which(!is.na(x))
    if (length(idx) > 0L) x[idx[1L]] else x[1L]
  }

  coalesced_df <- df_final %>%
    group_by(across(all_of("final_id"))) %>%
    summarise(across(all_of(data_cols), first_non_na), .groups = "drop")

  list(
    duplicates_df = duplicates_df,
    split_df = split_df,
    coalesced_df = coalesced_df
  )
}

