/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include "lex.yy.cc" // FlexLexer

#include <iostream>  // cout, endl
#include <string>    // string, getline()
#include <fstream>   // ifstream, ofstream, is_open()
#include <sstream>   // ostringstream
#include <cstdlib>   // atof()
#include <map>       // map, multimap
#include <utility>   // make_pair()

/**
 * Sorts a word-count list by count.
 *
 * Usage: <program> <words_in_file> <words_out_file>
 *
 * Each non-empty input line is split at its first space: the text before the
 * space is the word, the text after it is parsed as a floating-point count
 * (a line without a space is used both as the word and as the count text).
 * When a word appears on several lines only its first line is kept.
 *
 * The output file receives one word per line, in ascending order of count.
 * Words sharing a count are emitted in the order they were inserted into the
 * multimap, i.e. alphabetically (insertion order of equivalent keys is
 * guaranteed by the standard since C++11).
 *
 * @return 0 on success; 1 on a wrong argument count, when either file cannot
 *         be opened, or when writing the output fails.
 */
int main (int argc, char* argv[])
{
    if (argc != 3)
    {
        std::cout << "ERROR: two parameters must be provided." << std::endl;
        return 1;
    }
    const std::string words_in_file_name  = argv[1];
    const std::string words_out_file_name = argv[2];

    typedef std::map<std::string, double>      MAP;   // word  -> count
    typedef std::multimap<double, std::string> MMAP;  // count -> word

    // ---- load the word counts into a map --------------------------------
    std::ifstream words_in_file (words_in_file_name.c_str());
    if (! words_in_file.is_open())
    {
        std::cout << "Error opening file: '" << words_in_file_name << "'" << std::endl;
        return 1;
    }

    MAP corpus_word_counter;
    std::string line;
    // std::getline() into a std::string has no line-length limit, and testing
    // the stream itself (rather than eof()) still processes a final line that
    // is not terminated by a newline.
    while (std::getline(words_in_file, line))
    {
        if (line.empty())
        {
            continue;  // a blank line carries no word
        }

        const std::string::size_type delimiter_pos = line.find(' ');
        const std::string word = line.substr(0, delimiter_pos);
        // Without a delimiter the whole line doubles as the count text.
        const std::string count_text =
            (delimiter_pos == std::string::npos) ? line
                                                 : line.substr(delimiter_pos + 1);

        // insert() leaves an existing entry untouched: first occurrence wins.
        corpus_word_counter.insert(std::make_pair(word, std::atof(count_text.c_str())));
    }
    words_in_file.close();

    // ---- sort by count ---------------------------------------------------
    MMAP sorted_corpus_word_counter;
    for (MAP::const_iterator mapit = corpus_word_counter.begin();
         mapit != corpus_word_counter.end(); ++mapit)
    {
        sorted_corpus_word_counter.insert(std::make_pair(mapit->second, mapit->first));
    }

    // ---- write the words, lowest count first -----------------------------
    std::ofstream words_out_file (words_out_file_name.c_str());
    if (! words_out_file.is_open())
    {
        std::cout << "Error opening file: '" << words_out_file_name << "'" << std::endl;
        return 1;
    }

    // NOTE: a relative-frequency filter (0.0002 < count / total < 0.01) was
    // sketched here but is disabled; every word is written unconditionally.
    for (MMAP::const_iterator mmapit = sorted_corpus_word_counter.begin();
         mmapit != sorted_corpus_word_counter.end(); ++mmapit)
    {
        words_out_file << mmapit->second << '\n';  // '\n' avoids a flush per word
    }

    words_out_file.close();
    if (! words_out_file)
    {
        std::cout << "Error writing file: '" << words_out_file_name << "'" << std::endl;
        return 1;
    }

    return 0;
} // main