#' Train Multiple Regression Models and Produce Model Predictions
#'
#' @description
#' Train multiple base learners and generate prediction matrices for use in the Meta Fuzzy
#' Function framework.
#'
#' @param target A character string specifying the name of the response variable in the data frame.
#' This variable is excluded from the predictor set and used as the ground truth for training and
#' evaluation.
#' @param data A data frame containing the predictor variables and the target variable. All columns
#' except target are treated as predictors.
#' @param ntest An integer indicating the number of observations allocated to the test set. This
#' subset is completely held out from model training and validation and is used for final
#' performance assessment.
#' @param nvalid An integer specifying the number of observations assigned to the validation set.
#' Predictions on this subset are used to construct Meta Fuzzy Functions and to tune
#' clustering-related hyperparameters.
#' @param seed An integer used to set the random seed for reproducibility.
#'
#' @details
#' Splits data into train/validation/test, then fits a suite of base learners and generates predictions
#' for validation and test. Predictions are returned as matrices with dimensions
#' \eqn{N_{valid} \times M} and \eqn{N_{test} \times M}, where \eqn{M} is the number of base learners.
#' These matrices are the standard input \eqn{x}{x} for \code{mff()} and \code{tune.mff()}.
#'
#' Base learners include linear regression, Lasso, Ridge, Elastic Net, Random Forest, XGBoost,
#' and LightGBM, as implemented by the package dependencies.
#'
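#' No hyperparameter search is performed within the \code{model.train} function: the
#' hyperparameters of Random Forest, XGBoost, and LightGBM are fixed a priori at commonly
#' used values. The only exception is the penalty parameter \eqn{\lambda} of Lasso, Ridge, and
#' Elastic Net, which is selected by 10-fold cross-validation (\code{lambda.min}).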
#'
#' Training base models is not a mandatory step to use the MFF framework. The \code{model.train}
#' function is provided as a convenience utility only. Users may independently train any number
#' of prediction methods using external workflows or software and directly supply their
#' predictions as inputs to the MFF.
#'
#' Accordingly, the \code{model.train} function can be completely skipped while still fully utilizing the
#' MFF framework with precomputed model outputs.
#'
#' @return A list containing:
#' \itemize{
#'   \item \code{pred_matrix_valid}: A numeric matrix of validation-set predictions, where each column
#'   corresponds to a base model.
#'   \item \code{pred_matrix_test}: A numeric matrix of test-set predictions generated by the same base
#'   models.
#'   \item \code{y_valid}: A numeric vector of true response values for the validation set.
#'   \item \code{y_test}: A numeric vector of true response values for the test set.
#' }
#'
#' @references
#' Breiman, L. (2001). Random forests. \emph{Machine Learning}, 45(1), 5-32.
#' \doi{10.1023/A:1010933404324}
#'
#' Chen, T., & Guestrin, C. (2016). XGBoost: A Scalable Tree Boosting System.
#' In \emph{Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
#' 785-794. \doi{10.1145/2939672.2939785}
#'
#' Chen, T., He, T., Benesty, M., et al. (2025). \emph{xgboost: Extreme Gradient Boosting}.
#' R package version 3.1.2.1. \url{https://CRAN.R-project.org/package=xgboost}
#'
#' Ke, G., Meng, Q., Finley, T., et al. (2017). LightGBM: A highly efficient gradient boosting decision tree.
#' In \emph{Proceedings of the 31st International Conference on Neural Information Processing Systems},
#' 3149-3157.
#'
#' Liaw, A., & Wiener, M. (2002). Classification and Regression by randomForest.
#' \emph{R News}, 2(3), 18-22. \url{https://CRAN.R-project.org/doc/Rnews/}
#'
#' Shi, Y., Ke, G., Soukhavong, D., et al. (2025). \emph{lightgbm: Light Gradient Boosting Machine}.
#' R package version 4.6.0. \url{https://CRAN.R-project.org/package=lightgbm}
#'
#' Tay, J. K., Narasimhan, B., & Hastie, T. (2023). Elastic Net Regularization Paths for All Generalized Linear Models.
#' \emph{Journal of Statistical Software}, 106(1), 1-31. \doi{10.18637/jss.v106.i01}
#'
#' @seealso
#' \code{\link{mff}} for the main framework application and
#' \code{\link{tune.mff}} for hyperparameter optimization.
#'
#' @examples
#'  boston <- MASS::Boston
#'   result <- model.train(
#'     target = "medv",
#'     data = boston,
#'     ntest = 50,
#'     nvalid = 50,
#'     seed = 123
#'   )
#'
#'   head(result$pred_matrix_valid)
#'   head(result$pred_matrix_test)
#'
#' @importFrom glmnet cv.glmnet glmnet
#' @importFrom randomForest randomForest
#' @importFrom xgboost xgb.train xgb.DMatrix
#' @importFrom lightgbm lgb.train lgb.Dataset
#' @importFrom stats lm predict model.matrix as.formula
#'
#' @export
model.train <- function(target, data, ntest, nvalid, seed = 123) {
  # ---- Input validation ----------------------------------------------------
  # Fail early with a clear message instead of an opaque error from sample(),
  # lm() or glmnet further down.
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }
  if (!is.character(target) || length(target) != 1L || is.na(target) ||
        !(target %in% names(data))) {
    stop("`target` must be a single column name present in `data`.",
         call. = FALSE)
  }
  is_count <- function(k) {
    is.numeric(k) && length(k) == 1L && !is.na(k) && k >= 0
  }
  if (!is_count(ntest) || !is_count(nvalid)) {
    stop("`ntest` and `nvalid` must be single non-negative numbers.",
         call. = FALSE)
  }
  n <- nrow(data)
  if (ntest + nvalid >= n) {
    stop("`ntest` + `nvalid` must be smaller than the number of rows in ",
         "`data` so that the training set is not empty.", call. = FALSE)
  }
  # model.matrix() silently drops rows with missing values while predict() on
  # lm / randomForest keeps them, which would misalign the prediction columns
  # and the returned response vectors.
  if (anyNA(data)) {
    stop("`data` must not contain missing values.", call. = FALSE)
  }

  # ---- Train / validation / test split --------------------------------------
  # The seed is set once; the split and every RNG-dependent learner below draw
  # from this stream in a fixed order.
  set.seed(seed)
  all_rows <- seq_len(n)

  valid_index <- sample(all_rows, size = nvalid)
  remaining <- setdiff(all_rows, valid_index)

  # Index into `remaining` explicitly: sample(x, ...) treats a length-one
  # numeric `x` as 1:x, which would pick rows outside `remaining`.
  test_index <- remaining[sample.int(length(remaining), size = ntest)]
  train_index <- setdiff(remaining, test_index)

  train_data <- data[train_index, ]
  valid_data <- data[valid_index, ]
  test_data <- data[test_index, ]

  # ---- Shared inputs --------------------------------------------------------
  # All columns except the target are predictors (no feature selection).
  model_formula <- as.formula(paste(target, "~ ."))

  # Build the design matrix once on the full data so that every split has the
  # same columns, then drop the intercept. drop = FALSE keeps a matrix even
  # when only one predictor column remains.
  X_all <- model.matrix(model_formula, data)[, -1, drop = FALSE]
  X_train <- X_all[train_index, , drop = FALSE]
  X_valid <- X_all[valid_index, , drop = FALSE]
  X_test <- X_all[test_index, , drop = FALSE]

  y_train <- train_data[[target]]
  y_valid <- valid_data[[target]]
  y_test <- test_data[[target]]

  # ---- Linear model ---------------------------------------------------------
  lm_model <- lm(model_formula, data = train_data)
  lm_pred_valid <- predict(lm_model, newdata = valid_data)
  lm_pred_test <- predict(lm_model, newdata = test_data)

  # ---- Lasso / Ridge / Elastic Net ------------------------------------------
  # `alpha` selects the penalty (1 = Lasso, 0 = Ridge, 0.5 = Elastic Net);
  # lambda is chosen by 10-fold cross-validation and predictions use
  # "lambda.min".
  fit_glmnet <- function(alpha) {
    cv_fit <- cv.glmnet(X_train, y_train, alpha = alpha, nfolds = 10)
    list(
      valid = predict(cv_fit, newx = X_valid, s = "lambda.min"),
      test = predict(cv_fit, newx = X_test, s = "lambda.min")
    )
  }
  lasso <- fit_glmnet(1)
  ridge <- fit_glmnet(0)
  elastic <- fit_glmnet(0.5)

  # ---- Random Forest --------------------------------------------------------
  rf_model <- randomForest(model_formula, data = train_data, ntree = 100)
  rf_pred_valid <- predict(rf_model, newdata = valid_data)
  rf_pred_test <- predict(rf_model, newdata = test_data)

  # ---- XGBoost --------------------------------------------------------------
  # Number of boosting rounds, shared by XGBoost and LightGBM.
  nrounds <- 200
  eta <- 0.1
  max_depth <- 6

  xgboost_dtrain <- xgb.DMatrix(X_train, label = y_train)
  xgboost_params <- list(
    objective = "reg:squarederror",
    eta = eta,
    max_depth = max_depth,
    eval_metric = "rmse"
  )
  xgboost_model <- xgb.train(
    xgboost_params,
    xgboost_dtrain,
    nrounds = nrounds,
    verbose = 0
  )
  xgboost_pred_valid <- predict(xgboost_model, xgb.DMatrix(X_valid))
  xgboost_pred_test <- predict(xgboost_model, xgb.DMatrix(X_test))

  # ---- LightGBM -------------------------------------------------------------
  learning_rate <- 0.05
  num_leaves <- 31

  lightgbm_dtrain <- lgb.Dataset(X_train, label = y_train)
  lightgbm_params <- list(
    objective = "regression",
    metric = "rmse",
    learning_rate = learning_rate,
    num_leaves = num_leaves,
    verbose = -1,
    force_row_wise = TRUE
  )
  lightgbm_model <- lgb.train(
    lightgbm_params,
    lightgbm_dtrain,
    nrounds = nrounds,
    verbose = -1
  )
  lightgbm_pred_valid <- predict(lightgbm_model, X_valid)
  lightgbm_pred_test <- predict(lightgbm_model, X_test)

  # ---- Prediction matrices (one column per base learner) --------------------
  pred_matrix_valid <- cbind(
    lm_pred_valid, lasso$valid, ridge$valid, elastic$valid,
    rf_pred_valid, xgboost_pred_valid, lightgbm_pred_valid
  )
  pred_matrix_test <- cbind(
    lm_pred_test, lasso$test, ridge$test, elastic$test,
    rf_pred_test, xgboost_pred_test, lightgbm_pred_test
  )

  model_names <- c(
    "LM", "Lasso", "Ridge", "ElasticNet", "RF", "XGBoost", "LightGBM"
  )
  colnames(pred_matrix_valid) <- model_names
  colnames(pred_matrix_test) <- model_names

  # ---- Output ---------------------------------------------------------------
  # Element order is kept as in earlier versions for positional access.
  list(
    pred_matrix_valid = pred_matrix_valid,
    pred_matrix_test = pred_matrix_test,
    y_test = as.numeric(y_test),
    y_valid = as.numeric(y_valid)
  )
}
