% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preprocess.R, R/split.R
\name{cleanse.data.frame}
\alias{cleanse.data.frame}
\alias{cleanse}
\title{Cleansing the dataset for classification modeling}
\usage{
\method{cleanse}{data.frame}(
  .data,
  uniq = TRUE,
  uniq_thres = 0.1,
  char = TRUE,
  missing = FALSE,
  verbose = TRUE,
  ...
)

cleanse(.data, ...)
}
\arguments{
\item{.data}{a data.frame or a \code{\link{tbl_df}}.}

\item{uniq}{logical. Set whether to remove variables that have only one unique value.}

\item{uniq_thres}{numeric. Set the threshold for removing variables: a variable is removed when its ratio of unique values (number of unique values / number of observations) is greater than this value.}

\item{char}{logical. Set whether to convert character variables to factors.}

\item{missing}{logical. Set whether to remove variables that include missing values.}

\item{verbose}{logical. Set whether to echo information to the console at runtime.}

\item{...}{further arguments passed to or from other methods.}
}
\value{
An object of class data.frame or train_df. The return value is an object of the same type as the .data argument.
}
\description{
\code{cleanse()} cleanses the dataset for classification modeling.
}
\details{
This function is useful when fitting a classification model.
It does the following:
It removes variables that have only one unique value. It also removes character or categorical variables whose number of unique values is high relative to the number of observations; such a variable usually corresponds to an identifier. Finally, it converts character variables to factors.
}
\examples{
# Build a 1000-row sample dataset.
n_obs <- 1000

# id: one distinct string per row (five random letters followed by the row number)
set.seed(123L)
row_id <- vapply(
  seq_len(n_obs),
  function(i) paste0(c(sample(letters, 5), i), collapse = ""),
  character(1)
)

# year: a single constant value, recycled to every row
year_chr <- "2018"

# count: integers drawn from 1 to 10
set.seed(123L)
count_int <- sample(1:10, size = n_obs, replace = TRUE)

# alpha: lowercase letters
set.seed(123L)
alpha_chr <- sample(letters, size = n_obs, replace = TRUE)

# flag: "Y"/"N" drawn with probabilities 0.1/0.9
set.seed(123L)
flag_chr <- sample(c("Y", "N"), size = n_obs, prob = c(0.1, 0.9), replace = TRUE)

dat <- data.frame(
  id = row_id,
  year = year_chr,
  count = count_int,
  alpha = alpha_chr,
  flag = flag_chr,
  stringsAsFactors = FALSE
)

# Structure of the raw dataset
str(dat)

# Cleanse with the default settings and inspect the result
cleansed <- cleanse(dat)
str(cleansed)

# uniq = FALSE: do not remove variables that have only one unique value
cleansed <- cleanse(dat, uniq = FALSE)
str(cleansed)

# uniq_thres = 0.3: use a higher threshold for the ratio of unique values
cleansed <- cleanse(dat, uniq_thres = 0.3)
str(cleansed)

# char = FALSE: do not convert character variables to factors
cleansed <- cleanse(dat, char = FALSE)
str(cleansed)

}
