/* bfsplit.cc
 *
 * Copyright (C) 2009-2011 CNRS
 *
 * This file is part of SiLiX.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include<cstdlib>
#include<exception>
#include<fstream>
#include<iostream>
#include <limits>
#include<map>
#include<sstream>
#include<string>
#include<vector>
#include<boost/program_options.hpp>
using namespace std;
namespace po = boost::program_options;

namespace {

// Owns a heap-allocated array of output file streams and releases it when
// it goes out of scope, so that every return path of main() closes the
// files. Stream objects cannot be copied, which rules out a pre-C++11
// std::vector of streams.
class OutputStreams {
public:
  explicit OutputStreams(size_t count) : streams_(new ofstream[count]) {}
  ~OutputStreams() { delete [] streams_; }
  ofstream& operator[](size_t index) { return streams_[index]; }
private:
  OutputStreams(const OutputStreams&);            // not copyable
  OutputStreams& operator=(const OutputStreams&); // not assignable
  ofstream* streams_;
};

// Splits PATH into the file name without its extension (PREFIX) and the
// extension including its leading dot (EXTENSION, left empty when the file
// name contains no dot). Only the last path component is searched, so a
// dot in a directory name (e.g. "./data/file") is never taken for an
// extension.
void splitFileName(const string& path, string& prefix, string& extension)
{
  const size_t slashpos = path.rfind('/');
  const string filename =
    (slashpos == string::npos) ? path : path.substr(slashpos + 1);
  const size_t dotpos = filename.rfind('.');
  if (dotpos == string::npos) {
    prefix = filename;
    extension.clear();
  }
  else {
    prefix = filename.substr(0, dotpos);
    extension = filename.substr(dotpos);
  }
}

} // namespace

// Family splitter entry point.
//
// Expects two positional arguments: a .fasta/.fa or .net file, and a nodes
// file whose lines start with a family id followed by a sequence id.
// Every family holding at least --min sequences gets its own output file
// ODIR/PREFIX_FAMID.EXT containing the records of the first file that
// belong to it.
//
// Returns 0 on success (or after printing the help) and 1 on a usage or
// file error.
int main(int argc, char** argv)
{
  // BEGIN PROGRAM OPTIONS //

  // Options allowed only on the command line
  po::options_description generic("Generic options");
  generic.add_options()
    ("help,h", "Display this information");

  // Options shown to the user
  int min;
  string odir, oprefix;
  po::options_description config("Configuration");
  config.add_options()
    ("odir,o",
     po::value<string>(&odir)->default_value("."),
     "DIRECTORY for output files (current directory by default)")
    ("min,n", po::value<int>(&min)->default_value(0),
     "Minimum size allowed")
    ("prefix,p",
     po::value<string>(&oprefix)->default_value(""),
     "Prefix for output files (chosen automatically by default)");

  // Hidden options: accepted but not listed in the help. They receive
  // the positional arguments.
  po::options_description hidden("Hidden options");
  hidden.add_options()
    ("input-file", po::value< vector<string> >(), "input file");

  po::options_description cmdline_options;
  cmdline_options.add(generic).add(config).add(hidden);
  po::options_description visible("");
  visible.add(generic).add(config);
  po::positional_options_description pd;
  pd.add("input-file", 2);
  po::variables_map vm;
  // Parsing throws on unknown options, bad values or too many positional
  // arguments; report it instead of letting the exception terminate us.
  try {
    po::store(po::command_line_parser(argc, argv).
              options(cmdline_options).positional(pd).run(), vm);
    po::notify(vm);
  }
  catch (const std::exception& e) {
    cerr<<"Error : "<<e.what()<<endl;
    return 1;
  }

  if (vm.count("help")) {
    stringstream infostream;
    infostream<<argv[0]<<" [OPTION...] FILE.NET/FASTA FILE.FNODES ";
    infostream<<" - Family spliter"<<endl<<endl
              <<"From nodes prefixed by a family id,"<<endl
              <<"and from .net or .fasta files,"
              <<"split into multiple .net or .fasta files associated to each family."<<endl;

    cout<<"General use:"<<endl<<"  "<<infostream.str()<<endl;
    cout << visible << "\n";
    return 0;
  }
  if (!vm.count("input-file")
      || vm["input-file"].as< vector<string> >().size()!=2){
    cerr<<"Error : uncorrect number of command line arguments"<<endl;
    return 1;
  }
  const vector<string> inputfiles = vm["input-file"].as< vector<string> >();
  // END PROGRAM OPTIONS //

  // extension choice: validated before any file is read or created, so
  // that an unsupported input leaves nothing behind
  string prefix;
  string extension;
  splitFileName(inputfiles[0], prefix, extension);
  const bool isfasta = (extension==".fasta")||(extension==".fa");
  const bool isnet = (extension==".net");
  if (!isfasta && !isnet){
    cerr<<"Error: authorized file extensions are .fasta/.fa or .net only."<<endl;
    return 1;
  }
  if (oprefix.length()){ // if given by user
    prefix = oprefix;
  }

  // families number and size
  cerr<<"Treating "<<inputfiles[1]<<endl;
  ifstream f1(inputfiles[1].c_str());
  if (!f1.is_open()){
    cerr<<"Impossible to open "<<inputfiles[1]<<endl;
    return 1;
  }
  string line;
  map<string, int> dico;            // family id -> number of sequences
  map<string, string> seq2fam;      // sequence id -> family id
  while (getline(f1, line)){
    istringstream linestream(line);
    string famid;
    string seq;
    if (!(linestream>>famid))
      continue;                     // blank line: nothing to record
    if (!(linestream>>seq)){
      cerr<<"Skipping malformed line \""<<line<<"\" in "<<inputfiles[1]<<endl;
      continue;
    }
    ++dico[famid];
    seq2fam[seq] = famid;
  }
  f1.close();

  // selecting families: only those with at least `min` sequences get an
  // output slot; the others are simply absent from fam2int
  map<string, int> fam2int;         // family id -> index of its output stream
  int nbfam = 0;
  for (map<string, int>::const_iterator it=dico.begin(); it!=dico.end(); ++it){
    if (it->second>=min)
      fam2int[it->first] = nbfam++;
  }

  // opening the input before creating any output file
  cerr<<"Treating "<<inputfiles[0]<<endl;
  ifstream f0(inputfiles[0].c_str());
  if (!f0.is_open()){
    cerr<<"Impossible to open "<<inputfiles[0]<<endl;
    return 1;
  }

  // creating files handles
  OutputStreams vout(static_cast<size_t>(nbfam));
  for (map<string, int>::const_iterator it=fam2int.begin(); it!=fam2int.end(); ++it){
    const string outname = odir+"/"+prefix+"_"+it->first+extension;
    ofstream& out = vout[static_cast<size_t>(it->second)];
    out.open(outname.c_str(), ios_base::out);
    if (!out.is_open()){
      // e.g. missing output directory or too many files open at once
      cerr<<"Impossible to open "<<outname<<endl;
      return 1;
    }
  }

  // processing
  if (isfasta){
    // Splitting fasta file: a '>' header line selects the output stream
    // used for itself and for the sequence lines that follow it
    int iout = -1;
    while (getline(f0, line)){
      if (line.empty())
        continue;                   // blank lines carry no data
      if (line[0]=='>'){
        iout = -1;
        istringstream linestream(line);
        string seq;
        linestream>>seq;
        seq = seq.substr(1);        // drop the leading '>'
        const map<string, string>::const_iterator it = seq2fam.find(seq);
        if (it==seq2fam.end()){
          cerr<<"Unknown sequence id "<<seq<<" in Fasta file"<<endl;
        }
        else{
          const map<string, int>::const_iterator slot = fam2int.find(it->second);
          if (slot!=fam2int.end())
            iout = slot->second;
        }
      }
      if (iout>=0)
        vout[static_cast<size_t>(iout)]<<line<<'\n';
    }
  }
  else{
    // Splitting network file: each line goes to the family of its first id
    while (getline(f0, line)){
      istringstream linestream(line);
      string seq1;
      if (!(linestream>>seq1))
        continue;                   // blank line
      const map<string, string>::const_iterator it1 = seq2fam.find(seq1);
      if (it1==seq2fam.end()){
        cerr<<"Unknown sequence id "<<seq1<<" in Net file"<<endl;
        continue;
      }
      const map<string, int>::const_iterator slot = fam2int.find(it1->second);
      if (slot!=fam2int.end())
        vout[static_cast<size_t>(slot->second)]<<line<<'\n';
    }
  }

  // output files are flushed and closed when vout goes out of scope
  return 0;
}
