Ordenar lista de atributos extraídos del texto

[ permalink ] [ download ]
/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include "lex.yy.cc"  // FlexLexer

#include <iostream>   // cout, endl, 
#include <string>     // string, 
#include <fstream>    // ifstream, is_open(), rdbuf(), 
                      // filebuf, pubseekoff(), pubseekpos(), sgetn
  	 	      // ofstream, write()
#include <sstream>    // ostringstream
#include <cstdlib>    // atof, exit
#include <map>        // map, multimap
#include <utility>    // pair, make_pair

using namespace std;

/*
 * Reads a list of "<word> <number>" lines, orders the words by ascending
 * number and writes only the words, one per line, to the output file.
 *
 * Usage: <program> <words_in_file> <words_out_file>
 *
 * Input:  each line is split at its first space; the text before it is the
 *         word and the text after it is parsed with atof(). A line without
 *         a space is taken as a word with number 0. When a word appears
 *         more than once, only its first occurrence is kept.
 * Output: the words in ascending order of their number; words sharing the
 *         same number keep the alphabetical order of the intermediate map.
 *
 * Returns 0 on success and 1 on a wrong argument count or when either file
 * cannot be opened or written. Diagnostics are printed on standard output.
 */
int main (int argc, char* argv[])
{
    typedef std::map<std::string, double> WordCountMap;
    typedef std::multimap<double, std::string> CountWordMap;

    if (argc != 3)
    {
        std::cout << "ERROR: two parameters must be provided." << std::endl;
        return 1;
    }

    const std::string words_in_file_name = argv[1];
    const std::string words_out_file_name = argv[2];

    // load attributes into an stl map

    std::ifstream words_in_file(words_in_file_name.c_str());
    if (!words_in_file.is_open())
    {
        std::cout << "Error opening file: '" << words_in_file_name << "'" << std::endl;
        return 1;
    }

    WordCountMap corpus_word_counter;
    std::string line;

    // std::getline has no line-length limit and its result is true for a
    // final line that is not newline-terminated, so every entry is read
    // and the loop always ends at end of file.
    while (std::getline(words_in_file, line))
    {
        const std::string::size_type delimiter_pos = line.find(' ');

        std::string word = line;
        double word_number = 0.0;

        if (delimiter_pos != std::string::npos)
        {
            word = line.substr(0, delimiter_pos);
            word_number = std::atof(line.c_str() + delimiter_pos + 1);
        }

        // insert() leaves an existing entry untouched: first occurrence wins.
        corpus_word_counter.insert(std::make_pair(word, word_number));
    }

    words_in_file.close();

    // sort it: re-key the entries by their number

    CountWordMap sorted_corpus_word_counter;

    for (WordCountMap::const_iterator mapit = corpus_word_counter.begin();
         mapit != corpus_word_counter.end();
         ++mapit)
    {
        sorted_corpus_word_counter.insert(std::make_pair(mapit->second, mapit->first));
    }

    // write

    std::ofstream words_out_file(words_out_file_name.c_str());
    if (!words_out_file.is_open())
    {
        std::cout << "Error opening file: '" << words_out_file_name << "'" << std::endl;
        return 1;
    }

    for (CountWordMap::const_iterator mmapit = sorted_corpus_word_counter.begin();
         mmapit != sorted_corpus_word_counter.end();
         ++mmapit)
    {
        words_out_file << mmapit->second << '\n';
    }

    // close() flushes the buffered output; check the stream afterwards so a
    // failed write (e.g. a full disk) is reported instead of being ignored.
    words_out_file.close();
    if (words_out_file.fail())
    {
        std::cout << "Error writing file: '" << words_out_file_name << "'" << std::endl;
        return 1;
    }

    return 0;
} // main
hits counter