// home
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "lex.yy.cc" // FlexLexer
#include <cstdlib> // atof(), exit()
#include <fstream> // ifstream, is_open(), rdbuf(),
// filebuf, pubseekoff(), pubseekpos(), sgetn
// ofstream, write()
#include <iostream> // cout, endl,
#include <map> // map, multimap
#include <sstream> // ostringstream
#include <string> // string, getline()
#include <utility> // pair, make_pair()
using namespace std;
/**
 * Reads a word/count listing and writes the words ordered by ascending count.
 *
 * Usage: <program> <words_in_file> <words_out_file>
 *
 * Input: one entry per line, "<word> <count>". The word is everything up to
 * the first space; the remainder of the line is parsed as a floating-point
 * count with atof(). A line with no space is taken as a word with count 0.
 * Lines whose word is empty (blank lines, lines starting with a space) are
 * skipped. If a word appears more than once, the first count seen is kept.
 *
 * Output: one word per line, ordered by ascending count. Words with equal
 * counts are inserted in alphabetical order.
 *
 * @return 0 on success; 1 on a wrong argument count or if either file
 *         cannot be opened or the output cannot be written.
 */
int main (int argc, char* argv[])
{
    if (argc != 3)
    {
        std::cout << "ERROR: two parameters must be provided." << std::endl;
        return 1;
    }
    const std::string words_in_file_name = argv[1];
    const std::string words_out_file_name = argv[2];

    typedef std::map<std::string, double> WordCounts;
    typedef std::multimap<double, std::string> WordsByCount;

    // Load "<word> <count>" lines into a map keyed by word.
    std::ifstream words_in_file(words_in_file_name.c_str());
    if (!words_in_file.is_open())
    {
        std::cout << "Error opening file: '" << words_in_file_name << "'" << std::endl;
        return 1;
    }

    WordCounts corpus_word_counter;
    std::string line;
    // std::getline into a std::string has no line-length limit, and testing
    // the stream itself (rather than eof()) both terminates on any read
    // failure and still processes a final line that lacks a trailing newline.
    while (std::getline(words_in_file, line))
    {
        const std::string::size_type delimiter_pos = line.find(' ');
        const std::string word = line.substr(0, delimiter_pos);
        if (word.empty())
        {
            continue; // nothing usable on this line
        }

        // Only parse a count when a delimiter is actually present; otherwise
        // npos + 1 would wrap to 0 and the word itself would be parsed.
        double word_number = 0.0;
        if (delimiter_pos != std::string::npos)
        {
            word_number = std::atof(line.c_str() + delimiter_pos + 1);
        }

        // insert() leaves an existing entry untouched: first occurrence wins.
        corpus_word_counter.insert(std::make_pair(word, word_number));
    }
    words_in_file.close();

    // Re-key by count so that iteration yields ascending-count order.
    WordsByCount sorted_corpus_word_counter;
    for (WordCounts::const_iterator mapit = corpus_word_counter.begin();
         mapit != corpus_word_counter.end(); ++mapit)
    {
        sorted_corpus_word_counter.insert(std::make_pair(mapit->second, mapit->first));
    }

    // Write the words, one per line.
    std::ofstream words_out_file(words_out_file_name.c_str());
    if (!words_out_file.is_open())
    {
        std::cout << "Error opening file: '" << words_out_file_name << "'" << std::endl;
        return 1;
    }

    // NOTE: an earlier revision sketched a relative-frequency filter here
    // (keep words with 0.0002 < count / total < 0.01). It was disabled; if it
    // is revived, the corpus total must be accumulated while loading.
    for (WordsByCount::const_iterator mmapit = sorted_corpus_word_counter.begin();
         mmapit != sorted_corpus_word_counter.end(); ++mmapit)
    {
        words_out_file << mmapit->second << '\n';
    }

    // close() flushes; a failure here (e.g. disk full) must not pass silently.
    words_out_file.close();
    if (!words_out_file)
    {
        std::cout << "Error writing file: '" << words_out_file_name << "'" << std::endl;
        return 1;
    }
    return 0;
} // main