#!/usr/bin/env bash
# =============================================================================
# PONG2 CLI - Production-Ready Pipeline
# Version: 1.0.0
# Usage: pong2 <command> [options]
# Commands: train | impute
# =============================================================================
set -euo pipefail

# =============================================================================
# GLOBAL CONFIGURATION
# =============================================================================
readonly VERSION="1.0.0"

# ── Colors ────────────────────────────────────────────────────────────────────
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'

# ── Logging ───────────────────────────────────────────────────────────────────
log_info()    { echo -e "${GREEN}[INFO]${NC}  $*"; }
log_warn()    { echo -e "${YELLOW}[WARN]${NC}  $*"; }
log_error()   { echo -e "${RED}[ERROR]${NC} $*" >&2; }
log_step()    { echo -e "\n${BOLD}${BLUE}▶ $*${NC}"; }
die()         { log_error "$*"; exit 1; }
timestamp()   { date '+%Y-%m-%d %H:%M:%S'; }

# ── Validate required tools ───────────────────────────────────────────────────
RSCRIPT="$(command -v Rscript)" || die "Rscript not found in PATH"
PONG2_root="$(Rscript -e 'cat(system.file(package="PONG2"))' 2>/dev/null)" \
  || die "PONG2 R package not installed"

# =============================================================================
# DEFAULT PARAMETERS
# =============================================================================
COMMAND=""
VCF_FILE=""         # Pre-phased VCF for minimac4 pre-imputation
BIM_FILE=""
KIR_FILE=""
OUTPUT_DIR=""
LOCUS=""
ASSEMBLY="hg19"
NCLASSIFIER="Null"
THREADS=4
KIRSPLIT="Null"
KIRMAF="Null"
MAC="Null"
FILTER=0.005
LOCUS_REGION="Null"
FORCE="false"
FILL_MISSING="false"
MODEL_PATH="Null"
THRESHOLD=0.5
MODEL_DIR=""

# =============================================================================
# KIR REGION POSITIONS
# =============================================================================
pos() {
  case "$1" in
    hg19) echo "55000000 55400000 19" ;;
    hg38) echo "54000000 55000000 chr19" ;;
    *)    die "Unknown assembly: $1. Use hg19 or hg38" ;;
  esac
}

# =============================================================================
# HELP
# =============================================================================
show_help() {
  local topic="${1:-}"
  local help_script
  help_script="$("$RSCRIPT" -e "cat(system.file('scripts', 'help.sh', package='PONG2'))")"
  bash "$help_script" "$topic"
}

show_version() {
  echo "PONG2 version $VERSION"
  "$RSCRIPT" -e "cat('R package version:', as.character(packageVersion('PONG2')), '\n')"
}

# =============================================================================
# VALIDATION HELPERS
# =============================================================================
check_file() {
  local file="$1" label="$2"
  [[ -f "$file" ]] || die "$label not found: $file"
}

check_plink_bfile() {
  local bfile="$1"
  check_file "${bfile}.bed" "PLINK .bed"
  check_file "${bfile}.bim" "PLINK .bim"
  check_file "${bfile}.fam" "PLINK .fam"
}

check_plink2() {
  plink2 --version 2>/dev/null | grep -q "2.0" \
    || die "plink2 v2.0+ required but not found"
}

# =============================================================================
# SHARED: PLINK PREPROCESSING
# =============================================================================
preprocess_plink() {
  local bfile="$1" outdir="$2" kir_file="$3"

  log_step "Preprocessing PLINK files"

  # Step 1: Sample keep file from KIR CSV (skip header, strip quotes)
  log_info "Creating sample keep file..."
  awk -F',' 'NR>1 {gsub(/"/, "", $1); print $1, $1}' \
    "$kir_file" > "$outdir/tmp/kir_sample.txt"

  local n_kir
  n_kir=$(wc -l < "$outdir/tmp/kir_sample.txt")
  log_info "KIR samples: $n_kir"

  # Step 2: Subset to KIR samples
  log_info "Subsetting PLINK to KIR samples..."
  plink2 \
    --bfile "$bfile" \
    --keep "$outdir/tmp/kir_sample.txt" \
    --silent \
    --make-bed \
    --out "$outdir/tmp/temp"

  local n_retained
  n_retained=$(wc -l < "$outdir/tmp/temp.fam")
  log_info "Samples retained: $n_retained / $n_kir"

  if [[ "$n_retained" -lt 10 ]]; then
    log_warn "KIR sample ID example:  $(head -1 "$outdir/tmp/kir_sample.txt")"
    log_warn "FAM sample ID example:  $(awk '{print $1, $2}' "$outdir/tmp/temp.fam" | head -1)"
    die "Too few samples retained ($n_retained). Check sample ID format."
  fi

  # Step 3: Remove duplicate SNPs
  log_info "Removing duplicate SNPs..."
  awk '{print $2}' "$outdir/tmp/temp.bim" | \
    sort | uniq -d > "$outdir/tmp/snp_duplicate.txt"

  local n_dups
  n_dups=$(wc -l < "$outdir/tmp/snp_duplicate.txt")
  log_info "Duplicate SNPs removed: $n_dups"

  # Step 4: Exclude duplicates
  plink2 \
    --bfile "$outdir/tmp/temp" \
    --exclude "$outdir/tmp/snp_duplicate.txt" \
    --silent \
    --make-bed \
    --out "$outdir/tmp/chr19"

  local n_variants
  n_variants=$(wc -l < "$outdir/tmp/chr19.bim")
  log_info "Final variants: $n_variants"
}

# =============================================================================
# COMMAND: TRAIN
# =============================================================================
train_model() {
  log_step "PONG2 Training Pipeline"

  # ── Validate inputs ──────────────────────────────────────────────────────
  [[ -n "$BIM_FILE"   ]] || die "--bfile required"
  [[ -n "$KIR_FILE"   ]] || die "--kfile required"
  [[ -n "$OUTPUT_DIR" ]] || die "--output required"
  [[ -n "$LOCUS"      ]] || die "--locus required"
  [[ -n "$ASSEMBLY"   ]] || die "--assembly required (hg19/hg38)"

  check_plink_bfile "$BIM_FILE"
  check_file "$KIR_FILE" "KIR file"
  check_plink2

  # ── Verify PONG2 train script ────────────────────────────────────────────
  local train_script
  train_script="$("$RSCRIPT" -e "cat(system.file('scripts', 'train.R', package='PONG2'))")"
  [[ -f "$train_script" ]] || die "train.R not found in PONG2 package"

  # ── Check dependency ─────────────────────────────────────────────────────
  "$PONG2_root/scripts/dependancy.sh" "$OUTPUT_DIR" "$ASSEMBLY" "plink2"

  # ── Setup directories ────────────────────────────────────────────────────
  mkdir -p "$OUTPUT_DIR/tmp"

  # ── Preprocess PLINK ─────────────────────────────────────────────────────
  preprocess_plink "$BIM_FILE" "$OUTPUT_DIR" "$KIR_FILE"

  # ── Print training summary ───────────────────────────────────────────────
  cat <<SUMMARY

$(echo -e "${BOLD}--- Training Summary ---${NC}")
  Locus:         $LOCUS
  Assembly:      $ASSEMBLY
  N Classifiers: $NCLASSIFIER
  Threads:       $THREADS
  KIR Split:     $KIRSPLIT
  KIR MAF:       $KIRMAF
  MAC:           $MAC
  Locus Region:  $LOCUS_REGION
------------------------
SUMMARY

  # ── Run R training script ─────────────────────────────────────────────────
  log_step "Training KIR model: $LOCUS"
  "$RSCRIPT" --vanilla "$train_script" \
    "$OUTPUT_DIR/tmp/chr19" \
    "$KIR_FILE" \
    "$LOCUS" \
    "$ASSEMBLY" \
    "$OUTPUT_DIR" \
    "$NCLASSIFIER" \
    "$THREADS" \
    "$KIRSPLIT" \
    "$KIRMAF" \
    "$MAC" \
    "$LOCUS_REGION"

  # ── Cleanup ───────────────────────────────────────────────────────────────
  log_step "Cleaning up temporary files..."
  rm -rf "$OUTPUT_DIR/tmp"

  log_info "Training complete. Model saved to: $OUTPUT_DIR/${LOCUS}_model.RData"
}

# =============================================================================
# COMMAND: TRAIN EVALUATION
# =============================================================================

evaluation() {
  log_step "PONG2 Model Evaluation"

  [[ -n "$MODEL_DIR" ]] || die "--model-dir required"
  [[ -n "$LOCUS"     ]] || die "--locus required"

  # Check model files exist
  check_file "${MODEL_DIR}/${LOCUS}_model.RData" "Model file"
  check_file "${MODEL_DIR}/${LOCUS}_test.RData"  "Test genotype file"
  check_file "${MODEL_DIR}/${LOCUS}_split.RData" "Split file"

  local evaluate_script
  evaluate_script="$("$RSCRIPT" -e "cat(system.file('scripts', 'evaluate.R', package='PONG2'))")"
  [[ -f "$evaluate_script" ]] || die "evaluate.R not found in PONG2 package"

  "$RSCRIPT" --vanilla "$evaluate_script" \
    "$MODEL_DIR" \
    "$LOCUS" \
    "$THRESHOLD"
}

# =============================================================================
# COMMAND: IMPUTE
# =============================================================================
imputation() {
  log_step "PONG2 Imputation Pipeline"

  # ── Validate inputs ──────────────────────────────────────────────────────
  [[ -n "$OUTPUT_DIR" ]] || die "--output required"
  [[ -n "$LOCUS"      ]] || die "--locus required"
  [[ -n "$ASSEMBLY"   ]] || die "--assembly required (hg19/hg38)"
  check_plink2

  # ── Validate custom model if supplied ────────────────────────────────────
  if [[ "$MODEL_PATH" != "Null" ]]; then
    check_file "$MODEL_PATH" "Custom model file"
    log_info "Using custom model: $MODEL_PATH"
  fi

  # ── Route: fill-missing uses VCF only; normal imputation uses PLINK ───────
  if [[ "$FILL_MISSING" == "true" ]]; then
    [[ -n "$VCF_FILE" ]] || die \
      "--vcf is required with --fill-missing.\n" \
      "  PLINK files cannot hold phased haplotype data required by minimac4.\n" \
      "  Please supply a pre-phased VCF (e.g. from Eagle2):\n" \
      "    pong2 impute --vcf phased.vcf.gz --fill-missing ..."
    check_file "$VCF_FILE" "Pre-phased VCF"
  else
    [[ -n "$BIM_FILE" ]] || die "--bfile required"
    check_plink_bfile "$BIM_FILE"
  fi

  # ── Check dependency ─────────────────────────────────────────────────────
  "$PONG2_root/scripts/dependancy.sh" "$OUTPUT_DIR" "$ASSEMBLY" "plink2"

  # ── Locate scripts ───────────────────────────────────────────────────────
  local predict_script missingness_script
  predict_script="$("$RSCRIPT" -e "cat(system.file('scripts', 'predict.R', package='PONG2'))")"
  missingness_script="$("$RSCRIPT" -e "cat(system.file('scripts', 'snpmissingness.R', package='PONG2'))")"

  [[ -f "$predict_script"     ]] || die "predict.R not found in PONG2 package"
  [[ -f "$missingness_script" ]] || die "snpmissingness.R not found in PONG2 package"

  # ── Setup ────────────────────────────────────────────────────────────────
  mkdir -p "$OUTPUT_DIR/tmp"
  read -r start end chr <<< "$(pos "$ASSEMBLY")"

  # ── Extract KIR region ───────────────────────────────────────────────────
  local filename

  if [[ "$FILL_MISSING" == "true" ]]; then
    filename=$(basename "${VCF_FILE%.vcf.gz}")
    log_step "Extracting KIR region from VCF: ${chr}:${start}-${end}"

    local input_vcf="$VCF_FILE"
    if [[ "$VCF_FILE" != *.gz ]]; then
      log_info "Compressing VCF with bgzip..."
      bgzip -c "$VCF_FILE" > "$OUTPUT_DIR/tmp/$filename.vcf.gz"
      input_vcf="$OUTPUT_DIR/tmp/$filename.vcf.gz"
    fi
    [[ -f "${input_vcf}.tbi" ]] || tabix -f "$input_vcf"

    local vcf_chr
    vcf_chr=$(zcat "$input_vcf" | grep -v "^#" | head -1 | awk '{print $1}')
    if [[ "$ASSEMBLY" == "hg38" && "$vcf_chr" != chr* ]]; then
      log_warn "Fixing missing chr prefix for hg38..."
      zcat "$input_vcf" | \
        awk 'BEGIN{OFS="\t"} /^#/{print; next} {$1="chr"$1; print}' | \
        bgzip > "$OUTPUT_DIR/tmp/$filename.fixed.vcf.gz"
      tabix -f "$OUTPUT_DIR/tmp/$filename.fixed.vcf.gz"
      input_vcf="$OUTPUT_DIR/tmp/$filename.fixed.vcf.gz"
    elif [[ "$ASSEMBLY" == "hg19" && "$vcf_chr" == chr* ]]; then
      log_warn "Removing chr prefix for hg19..."
      zcat "$input_vcf" | \
        awk 'BEGIN{OFS="\t"} /^#/{print; next} {sub(/^chr/,"",$1); print}' | \
        bgzip > "$OUTPUT_DIR/tmp/$filename.fixed.vcf.gz"
      tabix -f "$OUTPUT_DIR/tmp/$filename.fixed.vcf.gz"
      input_vcf="$OUTPUT_DIR/tmp/$filename.fixed.vcf.gz"
    fi

    log_info "Converting VCF to PLINK for SNP matching check..."
    plink2 \
      --vcf "$input_vcf" \
      --chr "$chr" \
      --from-bp "$start" \
      --to-bp "$end" \
      --make-bed \
      --out "$OUTPUT_DIR/tmp/$filename" \
      --silent

  else
    filename=$(basename "$BIM_FILE")
    log_step "Extracting KIR region from PLINK: ${chr}:${start}-${end}"

    plink2 \
      --bfile "$BIM_FILE" \
      --chr "$chr" \
      --from-bp "$start" \
      --to-bp "$end" \
      --make-bed \
      --out "$OUTPUT_DIR/tmp/$filename" \
      --silent

    if [[ "$ASSEMBLY" == "hg38" ]]; then
      sed -i 's/^/chr/' "$OUTPUT_DIR/tmp/$filename.bim"
    fi
  fi

  # ── SNP missingness check ─────────────────────────────────────────────────
  log_step "Checking SNP overlap with KIR model..."

  local matching
  matching=$("$RSCRIPT" --vanilla "$missingness_script" \
    "$OUTPUT_DIR/tmp/$filename" \
    "$OUTPUT_DIR" \
    "$ASSEMBLY" \
    "$LOCUS" \
    "$FILTER" \
    "$PONG2_root" \
    "$MODEL_PATH")

  local matching_pct
  matching_pct=$(awk "BEGIN {printf \"%.2f\", $matching * 100}")
  log_info "SNP match rate: ${matching_pct}%"

  # ── Route based on match rate ─────────────────────────────────────────────
  if awk -v m="$matching" 'BEGIN { exit (m < 0.6) ? 0 : 1 }'; then

    log_warn "Match rate ${matching_pct}% is below 50% threshold"

    if [[ "$FILL_MISSING" == "true" ]]; then
      log_step "Pre-imputing missing SNPs with minimac4..."
      "$PONG2_root/scripts/dependancy.sh" "$OUTPUT_DIR" "$ASSEMBLY" "minimac4"
      _run_minimac4_from_vcf "$input_vcf" "$filename"
      kir_imputation "$OUTPUT_DIR/tmp/imputed.$filename"

    elif [[ "$FORCE" == "true" ]]; then
      log_warn "Proceeding despite low match rate (--force enabled)"
      kir_imputation "$OUTPUT_DIR/tmp/$filename"

    else
      log_error "Match rate too low: ${matching_pct}%"
      echo -e "Options:"
      echo -e "  ${YELLOW}--fill-missing${NC}  Impute missing SNPs with minimac4"
      echo -e "  ${YELLOW}--force${NC}         Proceed anyway (not recommended)"
      exit 1
    fi

  else
    log_info "Match rate sufficient: ${matching_pct}% — proceeding with imputation"
    kir_imputation "$OUTPUT_DIR/tmp/$filename"
  fi
}

# =============================================================================
# HELPER: MINIMAC4 PRE-IMPUTATION (from pre-phased VCF)
# =============================================================================
_run_minimac4_from_vcf() {
  local input_vcf="$1"
  local filename="$2"

  log_info "Running minimac4 on pre-phased VCF: $input_vcf"
  _minimac4_impute "$input_vcf" "$filename"
  _imputed_vcf_to_plink "$filename"
}

# =============================================================================
# HELPER: SHARED MINIMAC4 EXECUTION
# =============================================================================
_minimac4_impute() {
  local input_vcf="$1"
  local filename="$2"

  if [[ ! -f "$OUTPUT_DIR/tmp/imputed.$filename.vcf.gz" ]]; then
    log_info "Running minimac4 imputation..."
    run_minimac4_416 \
      "$PONG2_root/extdata/$ASSEMBLY/"*.msav \
      "$input_vcf" \
      --region "${chr}:${start}-${end}" \
      -o "$OUTPUT_DIR/tmp/tmp.imputed.$filename.vcf.gz" \
      -c 1000000 \
      -t "$THREADS" \
      -e "$OUTPUT_DIR/tmp/empirical.imputed_$filename.vcf.gz" \
      --format DS

    mv "$OUTPUT_DIR/tmp/tmp.imputed.$filename.vcf.gz" \
       "$OUTPUT_DIR/tmp/imputed.$filename.vcf.gz"
    log_info "Minimac4 imputation complete"
  else
    log_info "Imputed VCF already exists — skipping minimac4"
  fi
}

# =============================================================================
# HELPER: CONVERT IMPUTED VCF TO PLINK
# =============================================================================
_imputed_vcf_to_plink() {
  local filename="$1"

  log_info "Converting imputed VCF to PLINK format..."
  plink2 \
    --vcf "$OUTPUT_DIR/tmp/imputed.$filename.vcf.gz" dosage=DS \
    --import-dosage-certainty 0.8 \
    --make-pgen \
    --out "$OUTPUT_DIR/tmp/imputed.$filename" \
    --silent

  plink2 \
    --pfile "$OUTPUT_DIR/tmp/imputed.$filename" \
    --make-bed \
    --out "$OUTPUT_DIR/tmp/imputed.$filename" \
    --silent

  log_info "PLINK conversion complete"
}

# =============================================================================
# HELPER: MINIMAC4 VERSION CHECK
# =============================================================================
run_minimac4_416() {
  local paths=()
  while IFS= read -r p; do paths+=("$p"); done < <(which -a minimac4 2>/dev/null || true)
  paths+=("$HOME/bin/minimac4" "/usr/local/bin/minimac4")

  for path in "${paths[@]}"; do
    if [[ -x "$path" ]] && "$path" --version 2>/dev/null | grep -q "4.1.6"; then
      log_info "Using minimac4: $path"
      "$path" "$@"
      return $?
    fi
  done
  die "minimac4 v4.1.6 not found"
}

# =============================================================================
# HELPER: KIR PREDICTION
# =============================================================================
kir_imputation() {
  local plink_file="$1"

  log_step "Running KIR prediction: $LOCUS"

  # Standardize SNP IDs to chr:pos format
  awk '{print $1, $1":"$4, $3, $4, $5, $6}' \
    "${plink_file}.bim" > "${plink_file}.mod.bim"

  # Find and remove duplicate SNP IDs
  awk '{print $2}' "${plink_file}.mod.bim" | \
    sort | uniq -d > "${plink_file}.snp_dup.txt"

  mv "${plink_file}.mod.bim" "${plink_file}.bim"

  plink2 \
    --bfile "$plink_file" \
    --exclude "${plink_file}.snp_dup.txt" \
    --make-bed \
    --out "${plink_file}.nodup" \
    --silent

  # Copy final plink files to output
  cp "${plink_file}.nodup."* "$OUTPUT_DIR/"

  # Run R prediction — pass MODEL_PATH as 7th argument
  "$RSCRIPT" --vanilla "$predict_script" \
    "${plink_file}.nodup" \
    "$OUTPUT_DIR" \
    "$LOCUS" \
    "$ASSEMBLY" \
    "$FILTER" \
    "$THREADS" \
    "$MODEL_PATH"

  # Cleanup
  log_step "Cleaning up temporary files..."
  rm -rf "$OUTPUT_DIR/tmp/"

  log_info "Imputation complete. Results saved to: $OUTPUT_DIR"
}

# =============================================================================
# ARGUMENT PARSING
# =============================================================================
args=("$@")
for arg in "${args[@]}"; do
  case "$arg" in
    -*) continue ;;
    *)  COMMAND="$arg"; break ;;
  esac
done

if (( ${#args[@]} > 0 )); then
  set -- "${args[@]}"
fi

while [[ $# -gt 0 ]]; do
  case "$1" in
    -i|--bfile)        BIM_FILE="$2";      shift 2 ;;
    --vcf)             VCF_FILE="$2";      shift 2 ;;
    -k|--kfile)        KIR_FILE="$2";      shift 2 ;;
    -o|--output)       OUTPUT_DIR="$2";    shift 2 ;;
    -l|--locus)        LOCUS="$2";         shift 2 ;;
    -a|--assembly)     ASSEMBLY="$2";      shift 2 ;;
    --filter)          FILTER="$2";        shift 2 ;;
    --split)           KIRSPLIT="$2";      shift 2 ;;
    -r|--region)       LOCUS_REGION="$2";  shift 2 ;;
    --nclassifier)     NCLASSIFIER="$2";   shift 2 ;;
    --kirmaf)          KIRMAF="$2";        shift 2 ;;
    --mac)             MAC="$2";           shift 2 ;;
    -t|--threads)      THREADS="$2";       shift 2 ;;
    -m|--model)        MODEL_PATH="$2";    shift 2 ;;
    -f|--force)        FORCE="true";       shift 1 ;;
    --fill-missing)    FILL_MISSING="true";shift 1 ;;
    --model-dir)       MODEL_DIR="$2";     shift 2 ;;
    --threshold)       THRESHOLD="$2";     shift 2 ;;
    -v|--version)      show_version;       exit 0  ;;
    -h|--help)
      shift
      show_help "${1:-}"
      exit 0
      ;;
    --) shift; break ;;
    -*) die "Unknown option: $1" ;;
    *)
      [[ "$1" != "$COMMAND" ]] && die "Unexpected argument: $1"
      shift
      ;;
  esac
done

# =============================================================================
# COMMAND DISPATCH
# =============================================================================
case "$COMMAND" in
  train)
    train_model
    ;;
  evaluate)
  evaluation
   ;;
  impute)
    imputation
    ;;
  version)
    show_version
    ;;
  ""|help)
    show_help
    exit 0
    ;;
  *)
    die "Unknown command: '$COMMAND'. Use: train | impute"
    ;;
esac
