% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/deadwood.R
\name{deadwood}
\alias{deadwood}
\alias{deadwood.default}
\alias{deadwood.dist}
\alias{deadwood.mstclust}
\alias{deadwood.mst}
\title{Deadwood: Outlier Detection via Trimming of Mutual Reachability Minimum Spanning Trees}
\usage{
deadwood(d, ...)

\method{deadwood}{default}(
  d,
  M = 5L,
  contamination = NA_real_,
  max_debris_size = NA_real_,
  max_contamination = 0.5,
  ema_dt = 0.01,
  distance = c("euclidean", "l2", "manhattan", "cityblock", "l1", "cosine"),
  verbose = FALSE,
  ...
)

\method{deadwood}{dist}(
  d,
  M = 5L,
  contamination = NA_real_,
  max_debris_size = NA_real_,
  max_contamination = 0.5,
  ema_dt = 0.01,
  verbose = FALSE,
  ...
)

\method{deadwood}{mstclust}(
  d,
  contamination = NA_real_,
  max_debris_size = NA_real_,
  max_contamination = 0.5,
  ema_dt = 0.01,
  verbose = FALSE,
  ...
)

\method{deadwood}{mst}(
  d,
  contamination = NA_real_,
  max_debris_size = NA_real_,
  max_contamination = 0.5,
  ema_dt = 0.01,
  cut_edges = NULL,
  verbose = FALSE,
  ...
)
}
\arguments{
\item{d}{a numeric matrix with \eqn{n} rows and \eqn{p} columns
(or an object coercible to one, e.g., a data frame with numeric-like
columns), an object of class \code{dist} (see \code{\link[stats]{dist}}),
an object of class \code{mstclust} (see \pkg{genieclust}
and \pkg{lumbermark}),
or an object of class \code{mst} (see \code{\link{mst}})}

\item{...}{further arguments passed to \code{\link{mst}}}

\item{M}{smoothing factor; \eqn{M \leq 1} gives the selected \code{distance};
otherwise, the mutual reachability distance based on the \eqn{M}-th
nearest neighbours is used}

\item{contamination}{single numeric value or \code{NA};
the estimated (approximate) proportion of outliers in the dataset;
if \code{NA}, the contamination amount will be determined
by identifying the most significant elbow point of the curve
comprised of increasingly ordered tree edge weights
smoothened with an exponential moving average}

\item{max_debris_size}{single integer value or \code{NA};
the maximal size of the leftover connected components that
will be considered outliers; if \code{NA}, \eqn{\sqrt{n}} is assumed}

\item{max_contamination}{single numeric value;
maximal contamination level assumed when \code{contamination} is \code{NA}}

\item{ema_dt}{single numeric value;
controls the smoothing parameter \eqn{\alpha = 1-\exp(-dt)}
of the exponential moving average (in edge length elbow point detection),
\eqn{y_i = \alpha w_i + (1-\alpha) y_{i-1}}, \eqn{y_1 = w_1}}

\item{distance}{metric used in the case where \code{d} is a matrix; one of:
\code{"euclidean"} (synonym: \code{"l2"}),
\code{"manhattan"} (a.k.a. \code{"l1"} and \code{"cityblock"}),
\code{"cosine"}}

\item{verbose}{logical; whether to print diagnostic messages
and progress information}

\item{cut_edges}{numeric vector or \code{NULL};
\eqn{k-1} indexes of the tree edges whose omission leads to
\eqn{k} connected components (clusters), where the outliers are to
be sought independently; most frequently this is generated
via \pkg{genieclust} or \pkg{lumbermark}.}
}
\value{
A logical vector \code{y} of length \eqn{n}, where \code{y[i] == TRUE}
means that the \code{i}-th observation is deemed to be an outlier.

The \code{mst} attribute gives the computed minimum
spanning tree which can be reused in further calls to the functions
from \pkg{genieclust}, \pkg{lumbermark}, and \pkg{deadwood}.
The \code{cut_edges} attribute gives the \code{cut_edges} passed as an argument.
The \code{contamination} attribute gives the detected contamination levels
in each cluster (which can be different from the observed proportion
of outliers detected).
}
\description{
Deadwood is an anomaly detection algorithm based on Mutual Reachability
Minimum Spanning Trees.  It trims protruding tree segments and marks small
debris as outliers.

More precisely, the use of a mutual reachability distance
pulls peripheral points farther away from each other.
Tree edges with weights beyond the detected elbow point
are removed. All the resulting connected components whose
sizes are smaller than a given threshold are deemed anomalous.
}
\details{
As with all distance-based methods (this includes k-means and DBSCAN as well),
applying data preprocessing and feature engineering techniques
(e.g., feature scaling, feature selection, dimensionality reduction)
might lead to more meaningful results.

If \code{d} is a numeric matrix or an object of class \code{dist},
\code{\link{mst}} will be called to compute an MST, which
generally takes at most \eqn{O(n^2)} time. However, by default,
for low-dimensional Euclidean spaces, a faster algorithm based on K-d trees
is selected automatically; see \code{\link[quitefastmst]{mst_euclid}} from
the \pkg{quitefastmst} package.

Once the spanning tree is determined (between \eqn{\Omega(n \log n)} and \eqn{O(n^2)} time),
the Deadwood algorithm runs in \eqn{O(n)} time.
Memory use is also \eqn{O(n)}.
}
\examples{
library("datasets")
data("iris")
X <- jitter(as.matrix(iris[1:2]))  # some data
is_outlier <- deadwood(X, M=5)
plot(X, col=c("#ff000066", "#55555555")[is_outlier+1],
    pch=c(16, 1)[is_outlier+1], asp=1, las=1)

}
\references{
M. Gagolewski, deadwood, in preparation, 2026, TODO

V. Satopaa, J. Albrecht, D. Irwin, B. Raghavan, Finding a "Kneedle"
in a haystack: Detecting knee points in system behavior,
In: 31st Intl. Conf. Distributed Computing Systems Workshops,
2011, 166-171, \doi{10.1109/ICDCSW.2011.20}

R.J.G.B. Campello, D. Moulavi, J. Sander,
Density-based clustering based on hierarchical density estimates,
Lecture Notes in Computer Science 7819, 2013, 160-172,
\doi{10.1007/978-3-642-37456-2_14}
}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski}
}
\seealso{
The official online manual of \pkg{deadwood} at \url{https://deadwood.gagolewski.com/}

}
