## ----setup--------------------------------------------------------------------
library(clinCompare)

## ----compare-datasets---------------------------------------------------------
# Baseline extract: three subjects with age, sex and race.
baseline <- data.frame(
  USUBJID = paste0("SUBJ0", 1:3),
  AGE = c(45, 52, 38),
  SEX = c("M", "F", "M"),
  RACE = c("WHITE", "WHITE", "ASIAN"),
  stringsAsFactors = FALSE
)

# Updated extract: identical to the baseline except for two cells --
# AGE of the second subject (52 -> 53) and SEX of the third (M -> F).
updated <- baseline
updated$AGE[2] <- 53
updated$SEX[3] <- "F"

# Run the comparison and print the returned object.
result <- compare_datasets(baseline, updated)
result

## ----drill-into-result--------------------------------------------------------
# `result` is the object returned by compare_datasets() in the previous
# chunk; the lines below pull individual pieces out of it with `$`.

# Per-column difference counts
result$observation_comparison$discrepancies

# Row-level details for a specific variable (SEX is one of the two columns
# that differ between `baseline` and `updated`)
result$observation_comparison$details$SEX

## ----compare-variables--------------------------------------------------------
# Two frames with partly overlapping columns: SEX appears only in df_a,
# WEIGHT only in df_b, and AGE is double in df_a but integer in df_b.
df_a <- data.frame(
  USUBJID = paste0("SUBJ0", 1:2),
  AGE = c(45, 52),
  SEX = c("M", "F"),
  stringsAsFactors = FALSE
)

df_b <- data.frame(
  USUBJID = df_a$USUBJID,
  AGE = as.integer(df_a$AGE),
  WEIGHT = c(75.5, 80.2),
  stringsAsFactors = FALSE
)

# Compare the variable (column) structure of the two frames.
compare_variables(df_a, df_b)

## ----compare-observations-----------------------------------------------------
# Reference frame: three rows keyed by ID.
df1 <- data.frame(
  ID = c(1, 2, 3),
  SCORE = c(80, 90, 70),
  stringsAsFactors = FALSE
)

# Second frame: same rows, with the SCORE of the second row changed
# from 90 to 95.
df2 <- df1
df2$SCORE[2] <- 95

# Compare the two frames value by value.
compare_observations(df1, df2)

## ----clean-dataset------------------------------------------------------------
# Input with inconsistent casing ("Alice" vs "alice") and a repeated
# "Bob" row.
messy <- data.frame(
  NAME = c("Alice", "alice", "Bob", "Bob"),
  SCORE = rep(c(100, 85), each = 2),
  stringsAsFactors = FALSE
)

# Request duplicate removal and upper-casing in a single call.
clean_dataset(
  messy,
  remove_duplicates = TRUE,
  convert_to_case = "upper"
)

## ----prepare-datasets---------------------------------------------------------
# Two frames holding the same three regions in different row orders.
# stringsAsFactors = FALSE is set explicitly, as in the other examples in
# this file, so REGION stays a character column on R versions before 4.0.0
# (where the data.frame() default converted strings to factors).
df_unsorted1 <- data.frame(
  REGION = c("West", "East", "North"),
  SALES  = c(150, 200, 180),
  stringsAsFactors = FALSE
)

df_unsorted2 <- data.frame(
  REGION = c("East", "North", "West"),
  SALES  = c(210, 185, 160),
  stringsAsFactors = FALSE
)

# Pass both frames through prepare_datasets() with REGION as the sort
# column, then show the two elements of the returned object.
prepped <- prepare_datasets(df_unsorted1, df_unsorted2, sort_columns = "REGION")
prepped$df1
prepped$df2

## ----compare-by-group---------------------------------------------------------
# Two versions of a four-subject table spread over two sites. The second
# version changes AGE for S02 (52 -> 53) and S04 (61 -> 62).
# stringsAsFactors = FALSE is set explicitly, as in the other examples in
# this file, so SITEID and SUBJID stay character columns on R versions
# before 4.0.0 (where the data.frame() default converted strings to factors).
site_data_v1 <- data.frame(
  SITEID = c("SITE01", "SITE01", "SITE02", "SITE02"),
  SUBJID = c("S01", "S02", "S03", "S04"),
  AGE    = c(45, 52, 38, 61),
  stringsAsFactors = FALSE
)

site_data_v2 <- data.frame(
  SITEID = c("SITE01", "SITE01", "SITE02", "SITE02"),
  SUBJID = c("S01", "S02", "S03", "S04"),
  AGE    = c(45, 53, 38, 62),
  stringsAsFactors = FALSE
)

# Compare the two versions with SITEID as the grouping variable and list
# the names of the returned object.
by_site <- compare_by_group(site_data_v1, site_data_v2, group_vars = "SITEID")
names(by_site)

## ----detect-domain------------------------------------------------------------
# Small demographics-style table: study and subject identifiers plus age,
# sex, race and treatment arm code/label for three subjects.
dm_data <- data.frame(
  STUDYID = rep("STUDY01", times = 3),
  USUBJID = paste0("SUBJ0", 1:3),
  AGE = c(45, 62, 51),
  SEX = c("M", "F", "M"),
  RACE = c("WHITE", "BLACK", "ASIAN"),
  ARMCD = c("TRT", "PBO", "TRT"),
  ARM = c("Treatment", "Placebo", "Treatment"),
  stringsAsFactors = FALSE
)

# Ask the package which CDISC domain this table looks like.
detect_cdisc_domain(dm_data)

## ----cdisc-compare------------------------------------------------------------
# First version of the demographics table: identifiers, age, sex, race,
# treatment arm code/label and an RFSTDTC date string per subject.
dm_v1 <- data.frame(
  STUDYID = rep("STUDY01", times = 3),
  USUBJID = paste0("SUBJ0", 1:3),
  AGE = c(45, 62, 51),
  SEX = c("M", "F", "M"),
  RACE = c("WHITE", "BLACK", "ASIAN"),
  ARMCD = c("TRT", "PBO", "TRT"),
  ARM = c("Treatment", "Placebo", "Treatment"),
  RFSTDTC = paste0("2024-01-", 15:17),
  stringsAsFactors = FALSE
)

# Second version: a copy of the first with a single change, the AGE of
# the third subject (51 -> 52).
dm_v2 <- dm_v1
dm_v2$AGE[3] <- 52

# Compare the two versions, naming the domain and standard explicitly,
# and print the returned object.
cdisc_result <- cdisc_compare(dm_v1, dm_v2, domain = "DM", standard = "SDTM")
cdisc_result

## ----validate-cdisc-----------------------------------------------------------
# Validate the first demographics version as SDTM domain "DM". The result
# is stored in `validation`; this chunk does not print it.
validation <- validate_cdisc(
  dm_v1,
  domain = "DM",
  standard = "SDTM"
)

## ----get-all-diffs------------------------------------------------------------
# Collect the differences from the CDISC comparison; the surrounding
# parentheses make the assigned value print as well.
(diffs <- get_all_differences(cdisc_result))

## ----export-report------------------------------------------------------------
# Both reports are written into the session's temporary directory; the
# output files differ only in their extension.

# HTML version
export_report(
  cdisc_result,
  file.path(tempdir(), "dm_report.html")
)

# Plain-text version
export_report(
  cdisc_result,
  file.path(tempdir(), "dm_report.txt")
)

## ----export-excel, eval=FALSE-------------------------------------------------
# # Excel workbook with Summary, Variable Diffs, Value Diffs, and CDISC tabs
# export_report(cdisc_result, file.path(tempdir(), "dm_report.xlsx"))

## ----batch-compare, eval=FALSE------------------------------------------------
# results <- compare_submission(
#   base_dir    = "submission_v1/",
#   compare_dir = "submission_v2/",
#   output_file = "submission_diff.xlsx"
# )

