/*
 *
 * Copyright (C) 2022 Juan Domingo (Juan.Domingo@uv.es)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <Rcpp.h>
#include <memhelper.h>
#include <fastpam.h>

extern unsigned char DEB;

using namespace std;

//' ClassifAsDataFrame
//'
//' Returns the results of the classification returned by ApplyPAM as a R dataframe
//' 
//' The dataframe has three columns: PointName (name of each point), NNPointName (name of the point which is the center of the cluster to which PointName belongs to)
//' and NNDistance (distance between the points PointName and NNPointName).
//' Medoids are identified by the fact that PointName and NNPointName are equal, or equivalently, NNDistance is 0.
//'
//' @param L      The list returned by ApplyPAM with fields L$med and\cr
//'               L$clasif with the numbers of the medoids and the classification of each point
//' @param fdist  The binary file containing the symmetric matrix with the dissimilarities between points (usually, generated by 
//'               a call to CalcAndWriteDissimilarityMatrix).
//' @return Df    Dataframe with columns PointName, NNPointName and NNDistance. See Details for description.
//' @examples
//' # Synthetic problem: 10 random seeds with coordinates in [0..20]
//' # to which random values in [-0.1..0.1] are added
//' M<-matrix(0,100,500)
//' rownames(M)<-paste0("rn",c(1:100))
//' for (i in (1:10))
//' {
//'  p<-20*runif(500)
//'  Rf <- matrix(0.2*(runif(5000)-0.5),nrow=10)
//'  for (k in (1:10))
//'  {
//'   M[10*(i-1)+k,]=p+Rf[k,]
//'  }
//' }
//' tmpfile1=paste0(tempdir(),"/pamtest.bin")
//' JWriteBin(M,tmpfile1,dtype="float",dmtype="full")
//' tmpdisfile1=paste0(tempdir(),"/pamDL2.bin")
//' CalcAndWriteDissimilarityMatrix(tmpfile1,tmpdisfile1,distype="L2",restype="float",nthreads=0)
//' L <- ApplyPAM(tmpdisfile1,10,init_method="BUILD")
//' df <- ClassifAsDataFrame(L,tmpdisfile1)
//' df
//' # Identification of medoids:
//' which(df[,3]==0)
//' # Verification they are the same as in L (in different order)
//' L$med
//' @export
// [[Rcpp::export]]
Rcpp::DataFrame ClassifAsDataFrame(Rcpp::List L,std::string fdist)
{
 unsigned char mtype,ctype;
 MatrixType(fdist,mtype,ctype);
 if (mtype!=MTYPESYMMETRIC)
  Rcpp::stop("This function can operate only with binary symmetric matrices.\n");
 
 Rcpp::NumericVector med=L["med"];
 Rcpp::NumericVector clas=L["clasif"];
 
 std::vector<std::string> names;
 size_t n;
 Rcpp::NumericVector nn_distances;
 switch (ctype)
 {
  case FTYPE:
  { 
   SymmetricMatrix<float> D(fdist);
   names=D.GetRowNames();
   n=D.GetNRows();
   nn_distances = Rcpp::NumericVector(n);
   for (size_t cn=0; cn<n; cn++)
    nn_distances[cn]=D.Get(cn,med[clas[cn]-1]-1);
   break;
  }
  case DTYPE:
  { 
   SymmetricMatrix<double> D(fdist);
   names=D.GetRowNames();
   n=D.GetNRows();
   nn_distances = Rcpp::NumericVector(n);
   for (size_t cn=0; cn<n; cn++)
    nn_distances[cn]=D.Get(cn,med[clas[cn]-1]-1);
   break;
  }
  default: Rcpp::stop("This function can operate only with binary symmetric matrices of type float or double.\n");
           break;
 }
 
 Rcpp::StringVector point_names(n),point_nn_names(n);
 for (size_t cn=0; cn<n; cn++)
 {
    point_names[cn]=names[cn];
    point_nn_names[cn]=names[med[clas[cn]-1]-1];
 }
 
 Rcpp::DataFrame df = Rcpp::DataFrame::create( Rcpp::Named("PointName") = point_names, Rcpp::Named("NNPointName") = point_nn_names, Rcpp::Named("NNDistance") = nn_distances);
 
 return df;
}

unsigned char TestInitMethodArgument(string initmethod,Rcpp::Nullable<Rcpp::NumericVector> initial_med)
{
  unsigned char initmet=0;
  while ((initmet<NUM_INIT_METHODS) && (initmethod.find(init_method_names[initmet])==string::npos))
    initmet++;
  
  if (initmet>=NUM_INIT_METHODS)
  {
     ostringstream errst;
     errst << "Initialization method must be one of\n";
     
     for (unsigned char initmt=0; initmt<NUM_INIT_METHODS; initmt++)
      if (initmt == INIT_METHOD_PREVIOUS)
         errst << init_method_names[initmt] << " ";
      else
         errst << init_method_names[initmt] << " " << init_method_names[initmt] << "w ";
      
     Rcpp::stop(errst.str());
  }
  
  if (initmet==INIT_METHOD_PREVIOUS && initial_med.isNull())
     Rcpp::stop("You have asked for PREV initialization method but you have not provided the file with the initial medoids.\n");
     
  if (initmet!=INIT_METHOD_PREVIOUS && initial_med.isNotNull())
     Rcpp::stop("You have asked for an initialization method other than PREV but you have provided a file with initial medoids.\n");
 
  if (initmet==INIT_METHOD_PREVIOUS && !Rf_isVector(initial_med))
     Rcpp::stop("The argument you have passed as initial set of medoids is not a NumericVector (indeed, it is not a vector).\n");
  
  if (initmet==INIT_METHOD_PREVIOUS && !Rf_isNumeric(initial_med))
     Rcpp::stop("The argument you have passed as initial set of medoids is a Vector but not a NumericVector.\n");
  return initmet;
}
  
//' ApplyPAM
//'
//' A function to implement the Partitioning-around-medoids algorithm described in\cr 
//' Schubert, E. and Rousseeuw, P.J.: "Fast and eager k-medoids clustering: O(k) runtime improvement of the PAM, CLARA, and CLARANS algorithms."\cr
//' Information Systems, vol. 101, p. 101804, 2021.\cr
//' doi: https://doi.org/10.1016/j.is.2021.101804\cr
//' Notice that the actual values of the vectors (instances) are not needed. To recover them, look at the data matrix
//' used to generate the distance matrix.\cr
//' The number of instances, N, is not passed since dissimilarity matrix is NxN and therefore its size indicates the N value.
//' 
//' With respect to the returned value, L$med has as many components\cr
//' as requested medoids and L$clasif has as many components as instances.\cr
//' Medoids are expressed in L$med by its number in the array of points (row in the dissimilarity matrix) starting at 1 (R convention).\cr
//' L$clasif contains the number of the medoid (i.e.: the cluster) to which each instance has been assigned, according to their order in\cr
//' L$med (also from 1).\cr
//' This means that if L$clasif[p] is m, the point p belongs to the\cr
//' class grouped around medoid L$med[m].\cr
//' Moreover, if the dissimilarity matrix contains as metadata\cr
//' (row names) the cell names, the returned vector is a R-named vector with such names.
//' 
//' @param dissim_file  A string with the name of the binary file that contains the symmetric matrix of dissimilarities. Such matrix
//'                     should have been generated by CalcAndWriteDissimilarityMatrix and it must be a symmetric matrix.
//' @param k            A possitive integer (the desired number of medoids).
//' @param init_method  One of the strings 'PREV', 'BUILD' or 'LAB'. See meaning of initialization algorithms BUILD and LAB in the original paper.\cr
//'                     'PREV' should be used exclusively to start the second part of the algorithm (optimization) from a initial set of medoids generated by a former call.\cr
//'                     Default: BUILD.
//' @param initial_med  A vector with initial medoids to start optimization. It is to be used only by the 'PREV' method and it will have been obtained as the first
//'                     element (L$med) of the two-element list returned by a previous call to this function used in just-initialize mode (max_iter=0).\cr
//'                     Default: empty vector.
//' @param max_iter     The maximum number of allowed iterations. 0 means stop immediately after finding initial medoids.\cr
//'                     Default: 1000
//' @param nthreads     The number of used threads.\cr
//'                     -1 means don't use threads (serial implementation).\cr
//'                     0 means let the program choose according to the number of cores and of points.\cr
//'                     Any other number forces this number of threads. Choosing more than the number of available cores is allowed, but discouraged.\cr
//'                     Default: 0
//' @return L["med","clasif"] A list of two numeric vectors. See section Details for more information\cr                     
//' @examples
//' # Synthetic problem: 10 random seeds with coordinates in [0..20]
//' # to which random values in [-0.1..0.1] are added
//' M<-matrix(0,100,500)
//' rownames(M)<-paste0("rn",c(1:100))
//' for (i in (1:10))
//' {
//'  p<-20*runif(500)
//'  Rf <- matrix(0.2*(runif(5000)-0.5),nrow=10)
//'  for (k in (1:10))
//'  {
//'   M[10*(i-1)+k,]=p+Rf[k,]
//'  }
//' }
//' tmpfile1=paste0(tempdir(),"/pamtest.bin")
//' JWriteBin(M,tmpfile1,dtype="float",dmtype="full")
//' tmpdisfile1=paste0(tempdir(),"/pamDL2.bin")
//' CalcAndWriteDissimilarityMatrix(tmpfile1,tmpdisfile1,distype="L2",restype="float",nthreads=0)
//' L <- ApplyPAM(tmpdisfile1,10,init_method="BUILD")
//' # Final value of sum of distances to closest medoid
//' GetTD(L,tmpdisfile1)
//' # Medoids:
//' L$med
//' # Medoid in which each individual has been classified
//' n<-names(L$med)
//' n[L$clasif]
//' @export
// [[Rcpp::export]]
Rcpp::List ApplyPAM(std::string dissim_file, int k, 
                    std::string init_method="BUILD", Rcpp::Nullable<Rcpp::NumericVector> initial_med=R_NilValue, 
                    int max_iter=1000,int nthreads=0)
{
 if ((unsigned int)k>=MAX_MEDOIDS)
 {
     ostringstream errst;
     errst << "Asking for too many medoids. Maximum is " << MAX_MEDOIDS-1 << ".\n";
     Rcpp::stop(errst.str());
 }
 if ((unsigned int)max_iter>MAX_ITER)
 {
     ostringstream errst;
     errst << "Asking for too many limit iterations. Maximum is " << MAX_ITER-1 << ".\n";
     errst << "If you need more, change the constant MAX_ITER at fastpam.h and reinstall the package.\n";
     Rcpp::stop(errst.str());
 }

 unsigned char initmet=TestInitMethodArgument(init_method,initial_med);
 
 if (DEB & DEBPP)
 {
     Rcpp::Rcout << "Reading symmetric distance/dissimilarity matrix " << dissim_file << "\n";
     Rcpp::Rcout.flush();
 }
 
 unsigned char mtype,ctype,e,md;
 indextype nr,nc;
 MatrixType(dissim_file,mtype,ctype,e,md,nr,nc);;
 if (mtype!=MTYPESYMMETRIC)
  Rcpp::stop("This function can operate only with binary symmetric matrices.\n");
 if ((ctype!=FTYPE) && (ctype!=DTYPE))
  Rcpp::stop("This function can operate only with binary symmetric matrices with float or double elements.\n");
  
 MemoryWarnings(nr,(ctype==FTYPE ? sizeof(float) : sizeof(double)));
 
 unsigned int nt=ChooseNumThreads(nthreads);
 
 vector<indextype> v;
 vector<indextype> m;
 vector<string> pointnames;
 switch (ctype)
 {
  case FTYPE:
  { 
   SymmetricMatrix<float> D(dissim_file);
   
   Rcpp::checkUserInterrupt();
   if (!D.TestDistDisMat())
   {
     ostringstream errst;
     errst << "  Sorry, the matrix in file " << dissim_file << " is not a distance/dissimilarity matrix.\n";
     errst << "  It has either non-zero elements in the main diagonal or null or negative elements outside it.\n";
     errst << "  The PAM algorithm does not work with this type of matrices.\n";
     Rcpp::stop(errst.str());
   }
   if (DEB & DEBPP)
     Rcpp::Rcout << "  Matrix is a correct distance/dissimilarity matrix.\n";
 
   FastPAM<float> FP(&D,k,initmet,max_iter);
 
   indextype ncells=D.GetNRows();
   if (ncells<1000)
   {
    nt=1;
    if (DEB & DEBPP)
     Rcpp::Rcout << "Calculating with a single thread, since you have only " << ncells << " vectors and the overhead of using threads would be excessive.\n";
   }
   FP.Init(initial_med,nt);
 
   if (max_iter!=0)
    FP.Run(nt); 
   
   if (DEB & DEBPP)
   {
    Rcpp::Rcout << "Time summary ";
    if (nt==1)
     Rcpp::Rcout << " (serial implementation).\n";
    else
     Rcpp::Rcout << " (parallel implementation with " << nt << " threads).\n";
    Rcpp::Rcout << "   Initalization: " << FP.GetInTime() << " s (method " << init_method << ").\n";
    Rcpp::Rcout << "   Optimization:  " << FP.GetOptTime() << " s in " << FP.GetNumIter() << " iterations";
    if (FP.GetNumIter()!=0)
     Rcpp::Rcout << " (" << FP.GetOptTime()/double(FP.GetNumIter()) << " seconds/iteration).\n";
    else
     Rcpp::Rcout << ".\n";
     
    double tt=FP.GetInTime()+FP.GetOptTime();
    Rcpp::Rcout << "   Total time:    " << tt << " s (" << int(tt/60.0) << " minutes, " << tt-60.0*int(tt/60.0) << " seconds).\n";
   }
   
   v=FP.GetAssign();
   m=FP.GetMedoids();
   pointnames=D.GetRowNames();   
   break;
  }
  case DTYPE:
  { 
   SymmetricMatrix<double> D(dissim_file);
   Rcpp::checkUserInterrupt();
   if (!D.TestDistDisMat())
   {
     ostringstream errst;
     errst << "  Sorry, the matrix in file " << dissim_file << " is not a distance/dissimilarity matrix.\n";
     errst << "  It has either non-zero elements in the main diagonal or null or negative elements outside it.\n";
     errst << "  The PAM algorithm does not work with this type of matrices.\n";
     Rcpp::stop(errst.str());
   }
   if (DEB & DEBPP)
     Rcpp::Rcout << "  Matrix is a correct distance/dissimilarity matrix.\n";
 
   FastPAM<double> FP(&D,k,initmet,max_iter);
 
   indextype ncells=D.GetNRows();
   if (ncells<1000)
   {
    nt=1;
    if (DEB & DEBPP)
     Rcpp::Rcout << "Calculating with a single thread, since you have only " << ncells << " vectors and the overhead of using threads would be excessive.\n";
   }
   FP.Init(initial_med,nt);
 
   if (max_iter!=0)
    FP.Run(nt); 
   
   if (DEB & DEBPP)
   {
    Rcpp::Rcout << "Time summary ";
    if (nt==1)
     Rcpp::Rcout << " (serial implementation).\n";
    else
     Rcpp::Rcout << " (parallel implementation with " << nt << " threads).\n";
    Rcpp::Rcout << "   Initalization: " << FP.GetInTime() << " s (method " << init_method << ").\n";
    Rcpp::Rcout << "   Optimization:  " << FP.GetOptTime() << " s in " << FP.GetNumIter() << " iterations";
    if (FP.GetNumIter()!=0)
     Rcpp::Rcout << " (" << FP.GetOptTime()/double(FP.GetNumIter()) << " seconds/iteration).\n";
    else
     Rcpp::Rcout << ".\n";
     
    double tt=FP.GetInTime()+FP.GetOptTime();
    Rcpp::Rcout << "   Total time:    " << tt << " s (" << int(tt/60.0) << " minutes, " << tt-60.0*int(tt/60.0) << " seconds).\n";
   }
   
   v=FP.GetAssign();
   m=FP.GetMedoids();
   pointnames=D.GetRowNames();    
   break;
  }
  default: Rcpp::stop("This function can operate only with binary symmetric matrices of type float or double.\n");
           break;
 }
  
 Rcpp::NumericVector clasif(v.size());
 for (indextype i=0; i<(unsigned int)clasif.length(); i++)
  clasif[i]=v[i]+1;
   
 Rcpp::NumericVector med(m.size());
 for (indextype i=0; i<(unsigned int)med.length(); i++)
  med[i]=m[i]+1;
  
 // If the points have names, we want to keep those names, too, in the clustering result for both, medoids and classified points.
 if (pointnames.size()>0)
 {
  Rcpp::StringVector cnames(pointnames.size());
  for (indextype i=0; i<(unsigned int)clasif.length(); i++)
   cnames(i)=Rcpp::String(pointnames[i]);
  clasif.names() = cnames;
  
  Rcpp::StringVector mnames(m.size());
  for (indextype i=0; i<(unsigned int)med.length(); i++)
   mnames(i)=Rcpp::String(pointnames[m[i]]);
  med.names() = mnames;
 }
 v.clear();
 m.clear();
  
 Rcpp::List ret;
 ret["med"] = med;
 ret["clasif"] = clasif;

 return ret;
}

//' GetTD
//'
//' Function that takes a PAM classification (as returned by ApplyPAM) and the dissimilarity matrix and returns the value of the TD function
//' (sum of dissimilarities between each point and its closest medoid, divided by the number of points).
//' This function is mainly for debugging/internal use.
//'
//' @param L            A list of two numeric vectors, L["med","clasif"], as returned by ApplyPAM (please, consult the help of ApplyPAM for details)
//' @param dissim_file  A string with the name of the binary file that contains the symmetric matrix of dissimilarities. Such matrix
//'                     should have been generated by CalcAndWriteDissimilarityMatrix.
//' @return TD          The value of the TD function.
//' @examples
//' # Synthetic problem: 10 random seeds with coordinates in [0..20]
//' # to which random values in [-0.1..0.1] are added
//' M<-matrix(0,100,500)
//' rownames(M)<-paste0("rn",c(1:100))
//' for (i in (1:10))
//' {
//'  p<-20*runif(500)
//'  Rf <- matrix(0.2*(runif(5000)-0.5),nrow=10)
//'  for (k in (1:10))
//'  {
//'   M[10*(i-1)+k,]=p+Rf[k,]
//'  }
//' }
//' tmpfile1=paste0(tempdir(),"/pamtest.bin")
//' tmpdisfile1=paste0(tempdir(),"/pamDL2.bin")
//' JWriteBin(M,tmpfile1,dtype="float",dmtype="full")
//' CalcAndWriteDissimilarityMatrix(tmpfile1,tmpdisfile1,distype="L2",restype="float",nthreads=0)
//' L <- ApplyPAM(tmpdisfile1,10,init_method="BUILD")
//' # Final value of sum of distances to closest medoid
//' GetTD(L,tmpdisfile1)
//' @export
// [[Rcpp::export]]
double GetTD(Rcpp::List L,std::string dissim_file)
{
 unsigned char mtype,ctype;
 MatrixType(dissim_file,mtype,ctype);
 if (mtype!=MTYPESYMMETRIC)
  Rcpp::stop("This function can operate only with binary symmetric matrices.\n");
  
 Rcpp::NumericVector med=L["med"];
 Rcpp::NumericVector clas=L["clasif"];
 
 double TD=0.0;
 
 switch (ctype)
 {
  case FTYPE:
  { 
   SymmetricMatrix<float> D(dissim_file);
 
   for (indextype k=0;k<(unsigned int)clas.length();k++)
    TD += double(D.Get(k,med[clas[k]-1]-1));
   break;
  }
  case DTYPE:
  { 
   SymmetricMatrix<double> D(dissim_file);
   
   for (indextype k=0;k<(unsigned int)clas.length();k++)
    TD += double(D.Get(k,med[clas[k]-1]-1));
   break;
  }
  default: Rcpp::stop("This function can operate only with binary symmetric matrices of type float or double.\n");
           break;
 }
 return(TD/double(clas.length()));
}


