/* html2txt
 *	Copyright (C) 1998, 1999,2000 Free Software Foundation, Inc.
 *     Copyright (C) DG9EP 1997-2000
 *
 * html2txt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * html2txt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *--------------------------------------------------------------------------
 * Optionen: (s. usage() ).
 *--------------------------------------------------------------------------
 * todo:
 *  - -p -> More/Page Modus
 *  - Ausgabe in cp437
 *  - Autodetect preferred codepage
 *  - Zeilenumbruch bei <n> Zeichen
 *  - Batchmode:     html2txt -b *.h*
 *  - Merge all HTML-Files, which are hyperlinked together into one TXT-File
 *--------------------------------------------------------------------------
 *  Changelog: <p>
 *<ul>
 * <li>??.01.97  first revision, in Pascal for DOS
 * <li>?????.97  Ported to OS/2
 * <li>?????.97  Published in han-radio-packet-radio-net
 * <li>?????.97  ported to C
 * <li>09.03.97  DOS published in Simtel; all published in compunerve
 * <li>27.06.97  port to win32
 * <li>29.06.97  * New versioncount, started with 1.2
 *               * Complete rewrite of charset and &; support
 *               * Comments will be suppressed now
 *               * little fixes
 * <li>06.07.97  * &copy; und &reg; works again
 *               * options: -p -w -d  (1.3)
 *               * Tabellen werden nun nicht breiter als Bildschirm
 * <li>02.08.97  * literales & wird nun hoffentlich richtig behandelt
 *               * </Hn> machte keinen Zeilenumbruch mehr. Fixed.
 *               * Unterstreichungen nun ein Zeichen laenger
 *               * Versuche fuer Linksammlung
 * <li>18.01.00  Unter GPL gestellt.
 *</ul>
 *
 */


/* Release date and version number; both are pasted into the csVERS banner. */
#define VERDATE "2.Aug.97"
#define VER "1.32"


#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef __WATCOMC__
#include <conio.h>
#endif


/* Pick the platform name shown in the banner from the compiler's
 * predefined target macros; falls back to "OS:??" when none is set. */
#ifdef __OS2__
#define OS "OS/2"
#elif __DOS__
#define OS "DOS"
#elif __NT__
#define OS "NT"
#else
#define OS "OS:??"
#endif

/* Banner string, assembled at compile time from VER, OS and VERDATE. */
#define csVERS "*** HTML2TXT ('HTML to text')  Version "VER" ("OS"), "VERDATE" (c)dg9ep ***"

/* CRLF is not referenced anywhere in the code visible in this file. */
#define CRLF    '\n'
/* Right margin: once nCharInLine + nStartSpalte exceeds this value, the
 * next blank in normal text is turned into a line break. */
#define cnRAND  58

/* Pre-C99 boolean substitute. */
#define bool int

/* Input and output streams; main() defaults them to stdin/stdout. */
static FILE *fIn,*fOut;

/* nStartSpalte: number of blanks written after every line break (indent).
 * nCharInLine : characters emitted on the current output line.
 * nRead/nWrite: character counters reported by main() at exit.
 * nStrich     : when > 0, length of the dash underline that writeFOut()
 *               draws at the next line break. */
static long  nStartSpalte, nCharInLine, nRead, nWrite, nStrich ;
/* Characters consumed so far inside the current tag (nIn) and inside the
 * current '&' entity (nSo). */
static long  nIn, nSo;
/* Last character handled in normal text; used to collapse blanks. */
static char  cLast;
/* Non-zero between <PRE> and </PRE>: whitespace is passed through as is. */
static bool  fPre;
/* Count of consecutive writeFOut("\n") calls; output is suppressed once
 * it reaches 3. */
static long  nCRLF = 0;
/* Output line counter; only consulted for the -p paging feature. */
static long  nLine = 1;
/* Incremented at every </A>; not read anywhere in the visible code. */
static long  nRef  = 1;
/* Number of consecutive '-' seen while inside an HTML comment. */
static int   nHyp  = 0 ;
/* sSond: scratch buffer collecting the current entity or tag text, later
 *        overwritten with its replacement text.
 * sRef : attribute text of the most recent <A ...> tag. */
static char  sSond[355],sRef[355];
/* Command line options: more = pause every 25 lines (-p),
 * charANSI = non-zero means write bytes untranslated (-w). */
static struct {
    int more;
    int charANSI;
} opt;
/* Parser state: normal text, inside <...>, inside &...;, inside <!-- -->. */
static enum  t_state { cNorm, cInTag, cSond, cInComment } state ;


/* Forward declarations for the functions defined after main().
 * doOneChar() must be declared here: main() calls it before its
 * definition, and without a prototype the call is an implicit
 * declaration that conflicts with the later `void doOneChar(char)`.
 * NOTE(review): translate() has no definition in this file. */
void writeFOut( char *s);
void doOneChar( char c );
char translate( unsigned char c );

/* Number of named Latin-1 entities in szTab (codes 192..255). */
#define szsize 64

/*
 * HTML entity names for the ISO 8859-1 characters 192..255.
 * Entry i names character code 192 + i, so a match at index i is
 * converted by adding 192.  Entity names are case-sensitive, hence
 * "AElig" (not "Aelig"); 215 and 247 are the multiplication and
 * division signs ("times" / "divide").
 */
static const char * const szTab[szsize] = {
  /*192 */ "Agrave"  ,   /*193 */ "Aacute"  ,
  /*194 */ "Acirc"   ,   /*195 */ "Atilde"  ,
  /*196 */ "Auml"    ,   /*197 */ "Aring"   ,
  /*198 */ "AElig"   ,   /*199 */ "Ccedil"  ,
  /*200 */ "Egrave"  ,   /*201 */ "Eacute"  ,
  /*202 */ "Ecirc"   ,   /*203 */ "Euml"    ,
  /*204 */ "Igrave"  ,   /*205 */ "Iacute"  ,
  /*206 */ "Icirc"   ,   /*207 */ "Iuml"    ,
  /*208 */ "ETH"     ,   /*209 */ "Ntilde"  ,
  /*210 */ "Ograve"  ,   /*211 */ "Oacute"  ,
  /*212 */ "Ocirc"   ,   /*213 */ "Otilde"  ,
  /*214 */ "Ouml"    ,   /*215 */ "times"   ,
  /*216 */ "Oslash"  ,   /*217 */ "Ugrave"  ,
  /*218 */ "Uacute"  ,   /*219 */ "Ucirc"   ,
  /*220 */ "Uuml"    ,   /*221 */ "Yacute"  ,
  /*222 */ "THORN"   ,   /*223 */ "szlig"   ,
  /*224 */ "agrave"  ,   /*225 */ "aacute"  ,
  /*226 */ "acirc"   ,   /*227 */ "atilde"  ,
  /*228 */ "auml"    ,   /*229 */ "aring"   ,
  /*230 */ "aelig"   ,   /*231 */ "ccedil"  ,
  /*232 */ "egrave"  ,   /*233 */ "eacute"  ,
  /*234 */ "ecirc"   ,   /*235 */ "euml"    ,
  /*236 */ "igrave"  ,   /*237 */ "iacute"  ,
  /*238 */ "icirc"   ,   /*239 */ "iuml"    ,
  /*240 */ "eth"     ,   /*241 */ "ntilde"  ,
  /*242 */ "ograve"  ,   /*243 */ "oacute"  ,
  /*244 */ "ocirc"   ,   /*245 */ "otilde"  ,
  /*246 */ "ouml"    ,   /*247 */ "divide"  ,
  /*248 */ "oslash"  ,   /*249 */ "ugrave"  ,
  /*250 */ "uacute"  ,   /*251 */ "ucirc"   ,
  /*252 */ "uuml"    ,   /*253 */ "yacute"  ,
  /*254 */ "thorn"   ,   /*255 */ "yuml"
};

/*
 * Output translation table, indexed by the byte value to be written
 * (0..255); writeFOut() applies it when opt.charANSI is zero.
 * The first 128 entries map every code to itself, except 0x00 and
 * 0x09 (tab), which both become 0x20 (blank).  The last 128 entries
 * hold the replacement code for each byte 0x80..0xFF.
 */
static unsigned char
iso2ascii[] = { /* ISO 8859-1 Latin Alphabet 1 to IBM Code Page 850 (International) */
0x20, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   0x08, 0x20, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,   0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,   0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,   0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,   0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,   0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,

/* 0x80 .. 0xFF */
186, 205, 201, 187, 200, 188, 204, 185,  203, 202, 206, 223, 220, 219, 254, 242,
179, 196, 218, 191, 192, 217, 195, 180,  194, 193, 197, 176, 177, 178, 213, 159,
255, 173, 189, 156, 207, 190, 221, 245,  249, 184, 166, 174, 170, 240, 169, 238,
248, 241, 253, 252, 239, 230, 244, 250,  247, 251, 167, 175, 172, 171, 243, 168,
183, 181, 182, 199, 142, 143, 146, 128,  212, 144, 210, 211, 222, 214, 215, 216,
209, 165, 227, 224, 226, 229, 153, 158,  157, 235, 233, 234, 154, 237, 232, 225,
133, 160, 131, 198, 132, 134, 145, 135,  138, 130, 136, 137, 141, 161, 140, 139,
208, 164, 149, 162, 147, 228, 148, 246,  155, 151, 163, 150, 129, 236, 231, 152
};

/*===================================================================*/
/* Prints the command line help to stderr; the argument is the program
 * name shown in the "usage:" line (main() passes argv[0]). */
void usage (char*);
/*===================================================================*/

/*
 * Program entry point.
 *
 * Command line:  html2txt [-dpw] [ infileHTML [outfileTXT] ]
 * Without file names the program filters stdin to stdout.  Every input
 * character is fed through doOneChar(); a footer naming the source and
 * the program version is appended to the output.
 *
 * Returns 0 on success, 1 if the input cannot be opened, 2 if the
 * output cannot be created, 255 on a usage error.
 */
int
main(int argc, char **argv)
{
    int   rc, i;
    char  *sOut, *sIn;
    int   nArg=1;

#ifdef deutsch
   fprintf(stderr,"\n\n"csVERS"\nWandelt HTML Dateien in Texte (ANSI oder ASCII) um\n");
#else
   fprintf(stderr,"\n\n"csVERS"\nConverts HTML files into text (ANSI or ASCII) files\n");
#endif

    fIn = stdin;
    fOut = stdout;
    sIn = "(stdin)";
    sOut = "(stdout)";

    /* Evaluate options and file names */
    for( i=1; i <argc; i++) {
        if( argv[i][0]=='-' ) {
            const char *pOpt;

            /* Several option letters may be bundled behind one '-'.
             * Each case ends with its own break: -d must not run into
             * -w (which would select the opposite charset) or -p. */
            for( pOpt = argv[i]+1; *pOpt != '\0'; pOpt++ ) {
                switch( *pOpt ) {
                    case 'd' : opt.charANSI = 0;
                               break;
                    case 'w' : opt.charANSI = 1;
                               break;
                    case 'p' : opt.more = 1;
                               break;
                    default  : /* includes 'h' and '?' */
                               usage(argv[0]);
                               return 255;
                }
            }
        } else {
            /* First plain argument is the input, second the output;
             * a NULL stream marks "still has to be opened". */
            if( nArg == 1 )      { sIn  = argv[i]; fIn = NULL; }
            else if( nArg == 2 ) { sOut = argv[i]; fOut= NULL; }
            else                 { usage(argv[0]); return 255; }
            nArg++;
        }
    }

    /* Paging only makes sense when writing to stdout */
    if( fOut == NULL )
        opt.more = 0;

    fprintf(stderr,"%s ---> %s\n",sIn,sOut);

    if( fIn == NULL ) {
        fIn = fopen(sIn,"r");
        if( fIn == NULL ) {
#ifdef deutsch
            fprintf(stderr, "Fehler beim ™ffnen von %s rc=%d\n",sIn,errno);
#else
            fprintf(stderr, "Error opening %s rc=%d\n",sIn,errno);
#endif
            return 1;
        }
    }

    if( fOut == NULL ) {
        fOut = fopen(sOut,"w");
        if( fOut == NULL ) {
#ifdef deutsch
            fprintf(stderr, "Fehler beim Erzeugen von %s  rc=%d\n",sOut,errno);
#else
            fprintf(stderr, "Error creating %s  rc=%d\n",sOut,errno);
#endif
            /* do not leak the input file we already opened */
            if( fIn != stdin )
                fclose(fIn);
            return 2;
        }
    }

    /* Reset the converter state */
    state = cNorm;
    nStrich = 0;
    nStartSpalte = 1;
    nCharInLine = 0;
    cLast = 0;
    nRead = nWrite = 0;
    fPre = 0;

    while( (rc=fgetc(fIn)) != EOF ) {
       nRead++;
       doOneChar( (char)rc );
    }

    fprintf(fOut,"\n---------------------------------------------------------------------------\n"
                   "Generated from %s \n"
                   "by %s \n"
                   "---------------------------------------------------------------------------\n"
                   , sIn, csVERS );
    fclose(fIn);
    fclose(fOut);
#ifdef deutsch
    fprintf(stderr,"%ld Zeichen gelesen\n"
                   "%ld Zeichen geschrieben\n\n",  nRead, nWrite );
#else
    fprintf(stderr,"%ld character read\n"
                   "%ld character written\n\n",  nRead, nWrite );
#endif
    return 0;
}


/*
 * Feed one input character into the HTML-to-text state machine.
 *
 * Depending on the global `state` the character is
 *   cNorm      - ordinary text: whitespace is collapsed (unless inside
 *                <PRE>), lines are wrapped at the cnRAND margin, '&' and
 *                '<' switch to the entity / tag states;
 *   cSond      - part of an "&name;" or "&#nnn;" entity collected in sSond;
 *   cInTag     - part of a "<tag args>" collected in sSond;
 *   cInComment - inside "<!-- ... -->", discarded.
 * All output goes through writeFOut().  The function calls itself to
 * replay characters that turned out not to belong to an entity.
 */
void
doOneChar( char c )
{
    char  sHack[2];

    switch (state) {

       case cNorm :
           switch (c) {
              case '&' : state = cSond;  nSo = 0; break;
              case '<' : state = cInTag; nIn = 0; break;
              default  : {
                  bool fDo = 1;

                  if( !fPre ) {
                      /* tabs and line ends count as blanks; a line end
                       * directly after a blank is dropped */
                      if( c == '\t' )  c = ' ';
                      if( c == '\n' ) {
                          if( cLast != ' ' )
                              c = ' ';
                          else
                              fDo = 0;
                      }
                  }

                  if( (nCharInLine+nStartSpalte > cnRAND) && (c == ' ') && !fPre ) {
                      /* past the margin: break the line at this blank.
                       * A writable buffer is passed because writeFOut()
                       * takes a non-const pointer. */
                      char sNl[] = "\n";
                      writeFOut(sNl);
                  } else {
                      /* never emit a blank after a blank or a line end */
                      if( (c == ' ') && ((cLast == ' ') || (cLast == '\n')) )
                          fDo = 0;
                      if( fDo ) {
                          sHack[0] = c;
                          sHack[1] = '\0';
                          writeFOut(sHack);
                          nCharInLine++;
                      }
                  }
                  cLast = c;
                  break;
              }
           } /*switch*/
           break; /* case cNorm */

       case cSond :
            nSo++;
            if( nSo == 1 )  sSond[0] = '\0';

            /* An entity ends at ';', at a blank, at the next '&' or
             * after 7 collected characters. */
            if( (c == '&') || (c == ' ') || (c == ';') || (nSo > 7) ) {
                int fFound = 1;

                if( sSond[0] == '#' ) {
                    /* numeric reference; only single-byte codes can be
                     * represented, everything else is echoed literally */
                    int nCode = atoi( &sSond[1] );

                    if( nCode == 153 )
                        strcpy(sSond,"(TM)");
                    else if( (nCode > 0) && (nCode < 256) ) {
                        sSond[0] = (char)nCode;
                        sSond[1] = '\0';
                    } else
                        fFound = 0;
                }
                else if( !strcmp(sSond, "lt"  ) )  strcpy( sSond,"<");
                else if( !strcmp(sSond, "gt"  ) )  strcpy( sSond,">");
                else if( !strcmp(sSond, "quot") )  strcpy( sSond,"\"");
                else if( !strcmp(sSond, "amp" ) )  strcpy( sSond,"&");
                else if( !strcmp(sSond, "nbsp") )  strcpy( sSond," ");
                else if( !strcmp(sSond, "copy") )  strcpy( sSond,"(c)");
                else if( !strcmp(sSond, "reg" ) )  strcpy( sSond,"(R)");
                else {
                    int i;

                    /* named Latin-1 character: table index + 192 */
                    fFound = 0;
                    for( i=0; i<szsize; i++) {
                        if( !strcmp(sSond, szTab[i]) ) {
                            sSond[0] = (char)(i+192);
                            sSond[1] = '\0';
                            fFound = 1;
                            break;
                        }
                    }
                }

                state = cNorm;
                if( fFound ) {
                    writeFOut(sSond);
                    cLast = sSond[0];
                    /* Only ';' belongs to the entity.  Any other
                     * terminating character is ordinary input and must
                     * not be swallowed. */
                    if( c != ';' )
                        doOneChar(c);
                } else {
                    /* Not an entity after all: write everything back
                     * literally.  The text is copied first because the
                     * replay may reuse sSond. */
                    char   sAmp[] = "&";
                    char   sTmp[sizeof(sSond) + 1];
                    size_t k, nLen;

                    writeFOut(sAmp);
                    sprintf(sTmp,"%s%c",sSond,c);
                    nLen = strlen(sTmp);
                    for( k=0; k<nLen; k++)
                        doOneChar(sTmp[k]) ; /* recursion */
                }
            } else {
                sHack[0] = c; sHack[1] = '\0';
                strcat(sSond,sHack);
            }
       break;


     case cInTag:
        nIn++;
        if( nIn == 1 ) {sSond[0] = '\0'; nStrich = 0; }

        /* "<!--" opens a comment */
        if( (nIn == 4) && !strcmp(sSond, "!--") ) {
            state=cInComment;
            nHyp=0;
        }
        /* The tag ends at '>' or when the collecting buffer is full */
        if( (c == '>') || (nIn >= (long)sizeof(sSond)) ) {
            char *pc;
            char  sArgs[sizeof(sSond)];   /* can hold anything sSond holds */

            /* split "NAME args" at the first blank */
            sArgs[0] = '\0';
            pc = strchr(sSond,' ');
            if( pc != NULL ) {
                *pc = '\0';
                strcpy(sArgs,pc+1);
            }
            /* tag names are compared in upper case */
            for( pc = sSond; *pc != '\0'; pc++ )
                *pc = (char)toupper( (unsigned char)*pc );

            if(      !strcmp(sSond, "BR") )     strcpy(sSond,"\n");
            else if( !strcmp(sSond, "P") )      strcpy(sSond,"\n\n");

            else if( !strcmp(sSond, "LI") )     strcpy(sSond,"\n* ");
            else if( !strcmp(sSond, "/UL") )    strcpy(sSond,"\n");
            else if( !strcmp(sSond, "/OL") )    strcpy(sSond,"\n");

            else if( !strcmp(sSond, "DL") )     strcpy(sSond,"\n");
            else if( !strcmp(sSond, "/DL") )    { strcpy(sSond,"\n"); nStartSpalte = 0; }
            else if( !strcmp(sSond, "DT") )     { strcpy(sSond,"\n* ");  nStartSpalte = 0; }
            else if( !strcmp(sSond, "DD") )     { strcpy(sSond,"\n "); nStartSpalte = 8; }

            else if( !strcmp(sSond, "PRE"))     { strcpy(sSond,"\n"); fPre = 1; }
            else if( !strcmp(sSond, "/PRE"))    { strcpy(sSond,"\n"); fPre = 0; }

            else if( !strcmp(sSond, "TR") )     { strcpy(sSond,"\n"); nStartSpalte = 0;}
            else if( !strcmp(sSond, "TD") )     { strcpy(sSond,"  "); }
            else if( !strcmp(sSond, "/TABLE") ) { strcpy(sSond,"\n"); nStartSpalte = 0;}

            else if( !strcmp(sSond, "A") ) {
                /* remember the attributes (href=...) of the link;
                 * nothing is printed for the tag itself */
                strcpy(sRef,sArgs);
                sSond[0] = '\0';
            }
            else if( !strcmp(sSond, "/A") ) {
                /* end of link: count it and forget the reference */
                sSond[0] = '\0';
                nRef++;
                sRef[0] = '\0';
            }

            else if( !strcmp(sSond, "HR" ) ) {
                 strcpy(sSond,"\n---------------------------------------------------------------------\n");
            }
            else if(  !strcmp(sSond,"H1")
                   || !strcmp(sSond,"H2")
                   || !strcmp(sSond,"H3")
                   || !strcmp(sSond,"H4")
                   || !strcmp(sSond,"H5")
                   || !strcmp(sSond,"H6") ) {
                       strcpy(sSond,"\n\n");
            }
            else if( ((sSond[0]=='/') && (sSond[1]=='H') && (sSond[2]>='0') && (sSond[2]<='6'))
                     || !strcmp(sSond,"/TITLE") ) {
                     /* underline the heading: writeFOut() draws nStrich
                      * dashes at the next line break */
                     strcpy(sSond,"\n\n");
                     nStrich = nCharInLine+1;
            }
            else if( !strcmp(sSond,"/TD") ) {
                     /* pad the cell with (nCharInLine % 10) blanks */
                     long nPad = nCharInLine % 10;

                     memset(sSond, ' ', (size_t)nPad);
                     sSond[nPad] = '\0';
                     nStartSpalte = nCharInLine + nPad;
            }
            else {
                     /* unknown tag: print nothing */
                     sSond[0] = '\0';
            }
            writeFOut(sSond);
            state = cNorm;
       } else {
            sHack[0] = c; sHack[1] = '\0';
            strcat(sSond,sHack);
       }
       break; /* InTag */

     case cInComment:
       /* A comment ends at the first '>' preceded by at least two
        * hyphens, so "--->" closes it as well. */
       if( (nHyp >= 2) && (c=='>') ) {
           state = cNorm;
       } else if( c=='-' ) {
           if( nHyp < 2 )  nHyp++;    /* saturate, more is irrelevant */
       } else {
           nHyp = 0;
       }
       break;

    }  /*switch state*/
}



void
writeFOut( char *s)
{
    char sSond[255];

    /* Strich malen */
    if( s[0] == '\n' ) {
       nCharInLine = 0;
       if( nStrich > 0 ) {
           /* Eine Zeile mit Strichen erzeugen */
           if( nStrich > 80 )  nStrich = 75;
           { int i=0; char* p = sSond;
              for(;i<nStrich;i++,p++)
                 *p = '-';
           }
           sSond[nStrich] = '\0';
           fprintf(fOut, "\n%s", sSond);
           nLine++;
           nWrite += nStrich;
           nStrich = 0;
       }
       if( !strcmp(s, "\n" ) ) nCRLF++;
    } else nCRLF = 0;

    if( nCRLF < 3 ) { int i;
        if( opt.charANSI ) {
            for(i=0;i<strlen(s);i++)
               if( s[i] == '\n' )
                   nLine++;
        } else {
            for(i=0;i<strlen(s);i++)
               if( ( s[i] = iso2ascii[(int)s[i]] ) == '\n' )
                   nLine++;
        }
        fputs(s,fOut);
        nWrite += strlen(s);
    }

    /* Einrckung durchfhren */
    if( s[0] == '\n' ) {
       if( nStartSpalte >0 ) {
          int i;
          for(i=0;i<nStartSpalte;i++)
             fputc(' ',fOut);
       }
#ifdef __WATCOMC__
       if( opt.more ) {
           if( (nLine % 25) == 0 ) {
               getch();
           }
       }
#endif
    }
}


/*
 * Print the command line help to stderr.
 * s is the program name inserted into the "usage:" line.
 */
void
usage (char *s)
{
    fputs("\n", stderr);
    fprintf(stderr, "usage: %s [-dpw?h] [ infileHTML [outfileTXT] ]\n", s);
    fputs("\n"
          " -p   stops every 25 line and waits for keystroke\n"
          " -d   uses charset CP 850 (DOS) for output (Default)\n"
          " -w   uses charset ISO-8859-1 (Windows) for output\n"
          "\n",
          stderr);
}


