// Copyright (C) 1997-1999  Adrian Trapletti
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Library General Public
// License as published by the Free Software Foundation; either
// version 2 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Library General Public License for more details.
//
// You should have received a copy of the GNU Library General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

//
// Feedforward neural network package
//
 

#ifndef _NEURO_HH_
#define _NEURO_HH_


#include <iostream.h>
#include "linal.hh"
#include "intset.hh"


class estim_proc;


// ---------------------------------------------------------------------------
// Integer codes used to configure a network and its training.
// ---------------------------------------------------------------------------

// Output-unit activation function.
// linear
const int LIN = 0;
// softmax
const int SOFT = 1;

// Hidden-unit activation function.
// logistic sigmoid
const int SIG = 0;
// tanh
const int TAN = 1;

// Objective function to be minimized.
// sum-of-squares
const int SSE = 0;
// cross-entropy
const int ENTROPY = 1;
// mean absolute deviation
const int MAD = 2;
// generalized sum-of-squares
const int GSSE = 3;

// Regularizer (penalty term added to the objective function).
// no regularizer
const int NOR = 0;
// weight decay
const int WDR = 1;
// least absolute shrinkage and selection operator
const int LASSO = 2;

// Estimation (optimization) procedure.
// simple gradient descent with momentum term
const int GRDDSC = 0;
// steepest descent
const int STPDSC = 1;
// conjugate gradient
const int FRPRMN = 2;
// quasi-Newton
const int DFPMIN = 3;
// simulated annealing
const int SANN = 4;
// Numerical Recipes simulated annealing
const int NRSANN = 5;


// Feedforward neural network with a single hidden layer and, optionally,
// direct ("shortcut") connections from the input units to the output units.
class ffnet
{
  // Stream I/O.  According to the original note, operator>> deallocates the
  // net and then reads it from the stream.
  friend istream& operator>> (istream& s, ffnet& net);
  friend ostream& operator<< (ostream& s, const ffnet& net);

private:
  long nin;    // number of input units
  long nhid;   // number of hidden units
  long nout;   // number of output units
  int hid_t;   // activation function of the hidden units
  int out_t;   // activation function of the output units
  int shortc;  // shortcut connections?  (presumably non-zero means yes -- TODO confirm)
  vec w;       // all connection weights, laid out as described below

  // Layout of w, indices 1 .. (nin+1)*nhid + (nhid+1)*nout + nin*nout
  //
  // NONLINEAR PART
  // --------------
  //   w(1)                                   bias of             hidden unit 1
  //   w(2)                                   input unit 1     -> hidden unit 1
  //   w(3)                                   input unit 2     -> hidden unit 1
  //   ...
  //   w(nin+1)                               input unit nin   -> hidden unit 1
  //   w(nin+2)                               bias of             hidden unit 2
  //   ...
  //   w(2*nin+2)                             input unit nin   -> hidden unit 2
  //   ...
  //   w((nin+1)*nhid)                        input unit nin   -> hidden unit nhid
  //   w((nin+1)*nhid+1)                      bias of             output unit 1
  //   w((nin+1)*nhid+2)                      hidden unit 1    -> output unit 1
  //   w((nin+1)*nhid+3)                      hidden unit 2    -> output unit 1
  //   ...
  //   w((nin+1)*nhid+nhid+1)                 hidden unit nhid -> output unit 1
  //   ...
  //   w((nin+1)*nhid+(nhid+1)*nout)          hidden unit nhid -> output unit nout
  //
  // LINEAR PART
  // -----------
  //   w((nin+1)*nhid+(nhid+1)*nout+1)        input unit 1     -> output unit 1
  //   w((nin+1)*nhid+(nhid+1)*nout+2)        input unit 2     -> output unit 1
  //   ...
  //   w((nin+1)*nhid+(nhid+1)*nout+nin)      input unit nin   -> output unit 1
  //   ...
  //   w((nin+1)*nhid+(nhid+1)*nout+nin*nout) input unit nin   -> output unit nout
  //
  // REMARKS
  // -------
  //   - With shortcuts the weight vector is the NONLINEAR PART followed by
  //     the LINEAR PART.
  //   - Without shortcuts only the NONLINEAR PART is used.
  //   - With shortcuts and zero hidden units the weight vector consists of
  //     the output-unit biases (the NONLINEAR PART evaluated with nhid set
  //     to zero) followed by the LINEAR PART.

public:
  // Build a net with the given numbers of input, hidden and output units,
  // the hidden/output activation types and the shortcut flag.  By default
  // the weights are initialized randomly from the standard Normal
  // distribution.
  ffnet (long input,
         long hidden,
         long output,
         int hid_type,
         int out_type,
         int shortcut);

  // Copy constructor.
  ffnet (const ffnet& net);

  // Set the weight vector.
  ffnet& set_weights (const vec& weight);

  // Assignment.
  ffnet& operator= (const ffnet& net);

  // Prediction for a single input observation x.
  vec predict (const vec& x) const;

  // Prediction for an array of input observations; each row of x is one
  // observation.
  mat predict (const mat& x) const;

  // Fit the net to a data set.  The fitting procedure is flexible and lets
  // the user combine several optimization choices:
  //
  // 1. Any set of weights can be held fixed during training.
  //
  // 2. The objective function is one of sum-of-squares (SSE), mean absolute
  //    deviation (MAD), generalized sum-of-squares (GSSE) or cross-entropy
  //    (ENTROPY).  ENTROPY works together with the softmax output.  The
  //    original notes say "the two former" need the linear output, which is
  //    ambiguous for a list of three; presumably SSE, MAD and GSSE all
  //    require the linear output -- TODO confirm against the implementation.
  //
  // 3. The objective function may contain an additive penalty term for the
  //    model complexity.  This so-called regularizer is either weight decay
  //    (WDR) or the least absolute shrinkage and selection operator (LASSO).
  //    Its weight relative to the objective function is set by a control
  //    parameter.
  //
  // 4. Several optimization procedures are available to minimize the
  //    overall objective function.  For an overview see
  //      "NRC":  W.H. Press, S.A. Teukolsky, W.T. Vetterling, and
  //              B.P. Flannery (1995): Numerical Recipes in C: The Art of
  //              Scientific Computing, 2nd Edition, Cambridge, MA,
  //              Chapter 10;
  //      "NNPR": C.M. Bishop (1995): Neural Networks for Pattern
  //              Recognition, Clarendon Press, Oxford, Chapter 7.
  //    Provided are simple gradient descent with momentum term (GRDDSC) and
  //    steepest descent (STPDSC), see NNPR, 263-274; conjugate gradient
  //    (FRPRMN) and BFGS quasi-Newton (DFPMIN), see NRC, 420-430; simulated
  //    annealing (SANN) and Numerical Recipes simulated annealing (NRSANN),
  //    see NRC, 444-455.
  //    SANN uses an algorithm similar to D. Mitra, F. Romeo, and
  //    A. Sangiovanni-Vincentelli (1986): Convergence and Finite-Time
  //    Behaviour of Simulated Annealing, Adv. Appl. Prob. 18, 747-771.  The
  //    next candidate state is drawn from a Gaussian distribution whose
  //    mean is the current state and whose variance-covariance matrix is
  //    the identity matrix times the current temperature.  The cooling
  //    schedules of SANN and NRSANN are of the type suggested in Mitra et
  //    al.; see also C.J.P. Belisle (1992): Convergence Theorems for a
  //    Class of Simulated Annealing Algorithms on R^d, J. Appl. Prob. 29,
  //    885-895, and M. Locatelli (1996): Convergence Properties of
  //    Simulated Annealing for Continuous Global Optimization, J. Appl.
  //    Prob. 33, 1127-1140.
  //
  // 5. The procedures can be used under different strategies.  Batch based
  //    training is the classical deterministic approach: the whole data set
  //    defines the objective function, which is then minimized by some
  //    optimization procedure.  In epoch based training only a part of the
  //    data set, the epoch set, defines the current objective function.
  //    After the optimization procedure has performed a number of
  //    iterations on that function it stops; a new epoch set is drawn by
  //    sampling randomly without replacement from the whole data set, and
  //    the procedure starts again until convergence.  This is a stochastic
  //    optimization strategy; GRDDSC, STPDSC, FRPRMN and DFPMIN can be used
  //    in this mode.
  //
  // Input:
  //   x                   input observations, one observation per row
  //   y                   target observations
  //   fixed_weights       set of weights held fixed during training
  //   objective_function  objective function
  //   regularizer         type of regularizer
  //   reg_control         weight of the regularizer relative to the
  //                       objective function
  //   estim               container holding the settings of the estimation
  //                       procedure
  // The meaning of the returned double is not stated in this header
  // (presumably the final objective function value -- TODO confirm).
  double train (const mat& x,
                const mat& y,
                const intset& fixed_weights,
                int objective_function,
                int regularizer,
                double reg_control,
                const estim_proc& estim);

  // Same as the overload taking fixed_weights, but without fixing any
  // weights.
  double train (const mat& x,
                const mat& y,
                int objective_function,
                int regularizer,
                double reg_control,
                const estim_proc& estim);

  // Hessian matrix of the given objective function at the current weight
  // vector.
  mat hess (const mat& x,
            const mat& y,
            const intset& fixed_weights,
            int objective_function,
            int regularizer,
            double reg_control);

  // Hessian matrix, but without fixing any weights.
  mat hess (const mat& x,
            const mat& y,
            int objective_function,
            int regularizer,
            double reg_control);

  // Get the weight vector (returned by value).
  vec get_weights () const
  {
    return w;
  }

  ~ffnet ()
  {
  }
};


class estim_proc  // container for the information about an estimation procedure
{
  friend class ffnet;  // ffnet is granted direct access to the private settings

private:
  // The init() definitions are not part of this header; judging by the
  // names, each member is presumably set from the init() argument given in
  // parentheses -- TODO confirm against the implementation.
  int type;     // estimation procedure            (ty)
  int trace;    // output during estimation        (tr)
  int itmax;    // iteration limit                 (itm)
  int kmax;     // iterations at each temperature  (km)
  int epoch;    // size of the epoch set           (ep)
  int itepoch;  // iterations per epoch set        (itep)
  double tol;   // convergence tolerance           (to)
  double eta;   // learning rate                   (et)
  double alpha; // momentum rate                   (al)
  double ti;    // starting temperature            (t)

public:
  void init (int ty, int tr, int itm, double to);
  /* Initializer for FRPRMN and DFPMIN in batch mode.
     Input: type ty of estimation procedure; trace tr, where NO means no output
     during estimation, PRINT means output to stdout, and RPLOT means output via
     R callback (this works only with the R interface; NO, PRINT and RPLOT are
     not defined in this header); the maximum number of iterations itm the
     estimation procedure performs; the tolerance to: for FRPRMN it is the
     convergence tolerance on the objective function value, for DFPMIN it is the
     convergence tolerance on zeroing the gradient, c.f. "NRC, 420-430". */
  void init (int ty, int tr, int itm, double to, int ep, int itep);
  /* Initializer for FRPRMN and DFPMIN in epoch based mode.
     Input: type ty of estimation procedure; trace tr as for the batch mode
     initializer; the number of times a new epoch set is chosen (listed in the
     position of itm); the tolerance to: for FRPRMN it is the convergence
     tolerance on the objective function value, for DFPMIN it is the convergence
     tolerance on zeroing the gradient, c.f. "NRC, 420-430"; the size of the
     epoch set ep; the number of iterations itep a given procedure computes for
     a given epoch set. */
  void init (int ty, int tr, int itm, double to, double et, double al);
  /* Initializer for GRDDSC in batch mode.
     Input: type ty of estimation procedure; trace tr as for the FRPRMN/DFPMIN
     batch mode initializer; the maximum number of iterations itm the estimation
     procedure performs; to is the convergence tolerance on the objective
     function value; et is the learning rate; al is the momentum rate. */
  void init (int ty, int tr, int itm, double to, double et, double al, int ep, int itep);
  /* Initializer for GRDDSC in epoch based mode. The arguments are described at
     the GRDDSC batch mode initializer (ty, tr, itm, to, et, al) and at the
     FRPRMN/DFPMIN epoch based initializer (ep, itep). */
  void init (int ty, int tr, int itm, int km, double t);
  /* Initializer for SANN and NRSANN.
     Input: type ty of estimation procedure; trace tr as for the FRPRMN/DFPMIN
     batch mode initializer; the number of total iterations itm (an iteration is
     not defined in the usual sense: one iteration equals one function
     evaluation here); the number of iterations at each temperature km; the
     starting temperature t. */

  /* Default constructor. Every setting is zero-initialized so that an object
     on which no init() overload has been called yet holds well-defined values
     instead of indeterminate ones. The initializer list follows the member
     declaration order. */
  estim_proc ()
    : type (0), trace (0), itmax (0), kmax (0), epoch (0), itepoch (0),
      tol (0.0), eta (0.0), alpha (0.0), ti (0.0)
  {}
  ~estim_proc () {}
};


#endif  // _NEURO_HH_














