HTML to text: invoke a flex C++ parser.

[ permalink ] [ download ]
/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include "lex.yy.cc"  // FlexLexer

#include <dirent.h>   // DIR, opendir(), readdir(), closedir(), S_ISDIR
                      // dirent
#include <sys/stat.h> // stat()
#include <iostream>   // cout, endl, 
#include <string>     // string, 
#include <fstream>    // ifstream, is_open(), rdbuf(), 
                      // filebuf, pubseekoff(), pubseekpos(), sgetn
  	 	      // ofstream, write()

using namespace std;

int main (int argc, char* argv[])
{ 
    string html_directory;
    string txt_directory;
    string html_document_name;
    string txt_document_name; 

    switch (argc)	
    {
       	case 3 :
             html_directory = argv[1];
             txt_directory = argv[2];
	     if ( html_directory.at(html_directory.length()-1) != '/' )
             {
               html_directory.append("/");
             } 
	     if ( txt_directory.at(txt_directory.length()-1) != '/' )
             {
               txt_directory.append("/");
             } 
             break;
        default :
            cout << "ERROR: two directory parameters must be provided." << endl;
            return 1;
    }
	
    DIR *html_dir = opendir(html_directory.c_str());
		
    if(html_dir == NULL)
    {
        cout << "Error opening directory: '" << html_directory << "'"<< endl; 
        exit (1); 
    }
	
    struct dirent *entry;
    for(; NULL != (entry = readdir(html_dir)); )
    {
        struct stat st;

        // Skip dots
        if ( '.' == entry->d_name[0] && 
           ( '\0' == entry->d_name[1] || ( '.' == entry->d_name[1] && '\0' == entry->d_name[2]))
           )
        { 
            // do nothing
        }
        else
        {
            // process file
		  
            html_document_name = html_directory + entry->d_name;
  	    txt_document_name = txt_directory + entry->d_name;

	    txt_document_name.replace(txt_document_name.length()-4,'4',"txt");

            if(0 == stat(html_document_name.c_str(), &st)) // get file status
            {
	      if (!S_ISDIR(st.st_mode)) // skip directories
                {
                    cout << "processing: '" << html_document_name << "' ..." << endl;
                    cout << "writing: '"    << txt_document_name << "' ..." << endl;
	  
		    // setup for lex processing

		    filebuf fb_html;
		    fb_html.open (html_document_name.c_str(),ios::in);
  		    istream html_document(&fb_html);

		    filebuf fb_txt;
		    fb_txt.open (txt_document_name.c_str(),ios::out);
  		    ostream txt_document(&fb_txt);

		    FlexLexer *lexer = new yyFlexLexer(&html_document,&txt_document);
		    lexer->yylex();

  		    fb_txt.close();
  		    fb_html.close();


	        }  // skip directories
            } // get file status
			  
        } // process file
	  
    } // for
	
    closedir(html_dir);	
	
    return 0;
} // main
hits counter