% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/joinless.R
\name{joinless}
\alias{joinless}
\title{Infer relationship types between variables in two datasets using sampling}
\usage{
joinless(
  x,
  y,
  x_vars = NULL,
  y_vars = NULL,
  conf = 0.95,
  error = 0.05,
  n_x = NULL,
  n_y = NULL,
  max_vars = 20,
  ignore = character(0),
  missingness_tol = 0.1,
  type_coerce = TRUE,
  seed = NULL,
  verbose = FALSE,
  info = FALSE
)
}
\arguments{
\item{x, y}{Data frames.
Input datasets to be compared.}

\item{x_vars, y_vars}{Character vectors specifying the column names to compare.
If \code{NULL}, the function selects up to \code{max_vars} variables from each dataset.}

\item{conf}{Numeric. Confidence level used to compute automatic sample sizes
(default: \code{0.95}).}

\item{error}{Numeric. Margin of error used in sample size calculation
(default: \code{0.05}).}

\item{n_x, n_y}{Optional fixed sample sizes for \code{x} and \code{y}.
If \code{NULL} (the default), sample sizes are computed automatically based on
population size, \code{conf}, and \code{error}.}

\item{max_vars}{Integer. Maximum number of variables to compare per dataset.
Defaults to \code{20}.}

\item{ignore}{Character vector of relation types to exclude from the output
(for example, \code{"unrelated"}).
By default, no types are excluded.}

\item{missingness_tol}{Numeric. Maximum tolerated proportion of
missing/problematic values within a variable (default: \code{0.1}).
Problematic values include: \code{NA}, \code{NaN}, \code{NULL}, \code{Inf}, \code{-Inf}, empty strings,
and whitespace-only strings.}

\item{type_coerce}{Logical. If \code{TRUE} (default), attempts to coerce variables
to a common type (typically character) when domains differ.
If \code{FALSE}, type mismatches produce an \code{"error_type"} result.}

\item{seed}{Optional integer. Random seed to make the sampling reproducible.}

\item{verbose}{Logical. If \code{TRUE}, prints progress messages during execution.}

\item{info}{Logical. If \code{FALSE} (default), returns only the inferred
relationship type for each variable pair.
If \code{TRUE}, returns additional diagnostics such as match rate,
missingness rates, inferred types, and notes.}
}
\value{
A data frame summarizing the inferred relationship between every
variable pair.
If \code{info = FALSE}, the output contains:
\itemize{
\item \code{x_var}: variable name in \code{x}
\item \code{y_var}: variable name in \code{y}
\item \code{relation_type}: inferred relationship
}

If \code{info = TRUE}, additional columns include:
\itemize{
\item \code{n_used}: sample size used
\item \code{match_rate}: proportion of sampled values from \code{x} found in \code{y}
\item \code{null_rate_x}, \code{null_rate_y}: missingness/problematic rates
\item \code{type_x}, \code{type_y}: underlying storage types
\item \code{notes}: diagnostic messages
}
}
\description{
This function compares selected variables from two data frames and infers
their relational structure (e.g., one-to-one, many-to-one).
It uses random sampling, with sample sizes either computed automatically or
supplied by the user, to estimate match behavior, uniqueness patterns, and
missingness characteristics.
The goal is to help diagnose potential join keys or detect unrelated fields
without performing full-table comparisons.
}
\details{
Relationship inference is determined using:
\itemize{
\item \strong{Match rate}: proportion of sampled values from \code{x} that are found in \code{y}
\item \strong{Key uniqueness}: frequency distribution of non-missing values
}

Based on these measures, together with the missingness and type checks,
each variable pair is assigned one of the following relationship types:
\itemize{
\item \code{"one-one"}
\item \code{"many-one"}
\item \code{"one-many"}
\item \code{"many-many"}
\item \code{"unrelated"} (very low or zero match rate)
\item \code{"null"} (missingness above tolerance)
\item \code{"error_type"} (incompatible types and coercion disabled)
}
}
\examples{
df1 <- data.frame(
  id    = 1:5,
  value = 1:5
)

df2 <- data.frame(
  id    = 3:7,
  value = 3:7
)

joinless(df1, df2, x_vars = "id", y_vars = "id")
joinless(df1, df2, conf = 0.99, error = 0.02, info = TRUE)
joinless(df1, df2, ignore = "unrelated")

}
