/* html2txt
* Copyright (C) 1998, 1999,2000 Free Software Foundation, Inc.
* Copyright (C) DG9EP 1997-2000
*
* html2txt is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* html2txt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*--------------------------------------------------------------------------
* Optionen: (s. usage() ).
*--------------------------------------------------------------------------
* todo:
* - -p -> More/Page Modus
* - Ausgabe in cp437
* - Autodetect preferred codepage
* - Zeilenumbruch bei Zeichen
* - Batchmode: html2txt -b *.h*
* - Merge all HTML-Files, which are hyperlinked together into one TXT-File
*--------------------------------------------------------------------------
* Changelog:
*
* - ??.01.97 first revision, in Pascal for DOS
*
- ?????.97 Ported to OS/2
*
- ?????.97 Published in han-radio-packet-radio-net
*
- ?????.97 ported to C
*
- 09.03.97 DOS published in Simtel; all published in compunerve
*
- 27.06.97 port to win32
*
- 29.06.97 * New versioncount, started with 1.2
* * Complete rewrite of charset and &; support
* * Comments will be suppressed now
* * little fixes
*
- 06.07.97 * © und ® works again
* * options: -p -w -d (1.3)
* * Tabellen werden nun nicht breiter als Bildschirm
*
- 02.08.97 * literales & wird nun hoffentlich richtig behandelt
* * machte keinen Zeilenumbruch mehr. Fixed.
* * Unterstreichungen nun ein Zeichen länger
* * Versuche für Linksammlung
*
- 18.01.00 Unter GPL gestellt.
*
*
*/
/* Release date and version number; both are pasted into the csVERS banner. */
#define VERDATE "2.Aug.97"
#define VER "1.32"
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef __WATCOMC__
#include
#endif
/*
 * Choose a short operating-system label for the version banner from the
 * platform macros __OS2__, __DOS__ and __NT__; a placeholder is used when
 * none of them is set.
 */
#ifdef __OS2__
#define OS "OS/2"
#elif __DOS__
#define OS "DOS"
#elif __NT__
#define OS "NT"
#else
#define OS "OS:??"
#endif
/* Banner line, assembled at compile time from VER, OS and VERDATE. */
#define csVERS "*** HTML2TXT ('HTML to text') Version "VER" ("OS"), "VERDATE" (c)dg9ep ***"
/* Line terminator character. */
#define CRLF '\n'
/*
 * Wrap column: doOneChar turns a blank into a line break once
 * nCharInLine + nStartSpalte exceeds this value (outside PRE sections).
 */
#define cnRAND 58
/* Boolean values are stored as plain int in this file. */
#define bool int
/* Input and output streams; main points them at stdin and stdout by default. */
static FILE *fIn,*fOut;
/*
 * nStartSpalte: start column of the current line; added to nCharInLine for
 *               the wrap test and set by several tag handlers.
 * nCharInLine:  characters written on the current output line.
 * nRead:        characters read from the input (counted in main).
 * nWrite:       reported by main as the number of characters written;
 *               presumably counted in writeFOut - TODO confirm.
 * nStrich:      length of a pending underline, set when a heading or the
 *               title is closed and consumed by writeFOut.
 */
static long nStartSpalte, nCharInLine, nRead, nWrite, nStrich ;
/*
 * nIn: reset to 0 when a tag is opened.
 * nSo: number of characters seen since the ampersand that opened an entity.
 */
static long nIn, nSo;
/* Last character handled in the cNorm state (used to collapse blanks). */
static char cLast;
/* Nonzero between the PRE and /PRE tags; disables whitespace folding and wrapping. */
static bool fPre;
/* NOTE(review): no use of nCRLF or nLine is visible in the nearby code - confirm. */
static long nCRLF = 0;
static long nLine = 1;
/* Incremented each time a closing /A tag is processed. */
static long nRef = 1;
/* Number of consecutive hyphens seen while in the cInComment state. */
static int nHyp = 0 ;
/*
 * sSond collects the text of the entity or tag being parsed and is then
 * overwritten with its replacement text; sRef receives a copy of the
 * arguments of the most recent A tag.
 */
static char sSond[355],sRef[355];
/*
 * Command-line options. main clears more when fOut is NULL; charANSI
 * presumably selects ANSI instead of ASCII output - TODO confirm.
 */
static struct {
int more;
int charANSI;
} opt;
/* Parser state: plain text, inside a tag, inside an entity, inside a comment. */
static enum t_state { cNorm, cInTag, cSond, cInComment } state ;
/* Forward declarations for helpers that are called before they are defined. */
void writeFOut( char *s);
char translate( unsigned char c );
/*
 * doOneChar is called from main ahead of its definition. Without this
 * prototype the call is an implicit declaration, which conflicts with the
 * later void definition taking a char parameter.
 */
void doOneChar( char c );
/*
 * Names of the HTML character entities for the ISO 8859-1 code points
 * 192..255, in code-point order; the number in front of each entry is its
 * code point. Entity names are case-sensitive, so the spelling here must
 * match the HTML specification exactly (for example AElig, not Aelig).
 * Presumably the lookup code adds 192 to the matching index - TODO confirm.
 */
#define szsize 64
static char *szTab[szsize] = {
/*192 */ "Agrave" , /*193 */ "Aacute" ,
/*194 */ "Acirc" , /*195 */ "Atilde" ,
/*196 */ "Auml" , /*197 */ "Aring" ,
/*198 */ "AElig" , /*199 */ "Ccedil" ,
/*200 */ "Egrave" , /*201 */ "Eacute" ,
/*202 */ "Ecirc" , /*203 */ "Euml" ,
/*204 */ "Igrave" , /*205 */ "Iacute" ,
/*206 */ "Icirc" , /*207 */ "Iuml" ,
/*208 */ "ETH" , /*209 */ "Ntilde" ,
/*210 */ "Ograve" , /*211 */ "Oacute" ,
/*212 */ "Ocirc" , /*213 */ "Otilde" ,
/*214 */ "Ouml" , /*215 */ "times" ,
/*216 */ "Oslash" , /*217 */ "Ugrave" ,
/*218 */ "Uacute" , /*219 */ "Ucirc" ,
/*220 */ "Uuml" , /*221 */ "Yacute" ,
/*222 */ "THORN" , /*223 */ "szlig" ,
/*224 */ "agrave" , /*225 */ "aacute" ,
/*226 */ "acirc" , /*227 */ "atilde" ,
/*228 */ "auml" , /*229 */ "aring" ,
/*230 */ "aelig" , /*231 */ "ccedil" ,
/*232 */ "egrave" , /*233 */ "eacute" ,
/*234 */ "ecirc" , /*235 */ "euml" ,
/*236 */ "igrave" , /*237 */ "iacute" ,
/*238 */ "icirc" , /*239 */ "iuml" ,
/*240 */ "eth" , /*241 */ "ntilde" ,
/*242 */ "ograve" , /*243 */ "oacute" ,
/*244 */ "ocirc" , /*245 */ "otilde" ,
/*246 */ "ouml" , /*247 */ "divide" ,
/*248 */ "oslash" , /*249 */ "ugrave" ,
/*250 */ "uacute" , /*251 */ "ucirc" ,
/*252 */ "uuml" , /*253 */ "yacute" ,
/*254 */ "thorn" , /*255 */ "yuml"
};
/*
 * Byte translation table with 256 entries, indexed by the input character
 * code. Codes 0x01-0x7f map to themselves, except 0x09 (tab), which maps to
 * a blank; code 0x00 also maps to a blank. The upper half holds the
 * replacement codes for the target code page named in the comment below.
 * Presumably consulted by translate() - TODO confirm, its body is not
 * visible here.
 */
static unsigned char
iso2ascii[] = { /* ISO 8859-1 Latin Alphabet 1 to IBM Code Page 850 (International) */
0x20, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x20, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
186, 205, 201, 187, 200, 188, 204, 185, 203, 202, 206, 223, 220, 219, 254, 242,
179, 196, 218, 191, 192, 217, 195, 180, 194, 193, 197, 176, 177, 178, 213, 159,
255, 173, 189, 156, 207, 190, 221, 245, 249, 184, 166, 174, 170, 240, 169, 238,
248, 241, 253, 252, 239, 230, 244, 250, 247, 251, 167, 175, 172, 171, 243, 168,
183, 181, 182, 199, 142, 143, 146, 128, 212, 144, 210, 211, 222, 214, 215, 216,
209, 165, 227, 224, 226, 229, 153, 158, 157, 235, 233, 234, 154, 237, 232, 225,
133, 160, 131, 198, 132, 134, 145, 135, 138, 130, 136, 137, 141, 161, 140, 139,
208, 164, 149, 162, 147, 228, 148, 246, 155, 151, 163, 150, 129, 236, 231, 152
};
/*===================================================================*/
/*
 * Forward declaration. main calls usage with argv[0] and then returns 255;
 * presumably it prints the command-line help - the definition is not in
 * this part of the file.
 */
void usage (char*);
/*===================================================================*/
/*
 * main - program entry point.
 *
 * Prints the version banner to stderr, selects the input and output streams,
 * passes every input character to doOneChar, appends a trailer naming the
 * source and the converter to the output, closes both streams and reports
 * the character counts on stderr. User messages are German when the macro
 * deutsch is defined and English otherwise.
 *
 * Returns 0 on success, 1 if the input file cannot be opened, 2 if the
 * output file cannot be created, and 255 after usage() has been called.
 */
int
main(int argc, char **argv)
{
char c;
int rc, i,j ;
char *sOut, *sIn;
int nArg=1;
#ifdef deutsch
fprintf(stderr,"\n\n"csVERS"\nWandelt HTML Dateien in Texte (ANSI oder ASCII) um\n");
#else
fprintf(stderr,"\n\n"csVERS"\nConverts HTML files into text (ANSI or ASCII) files\n");
#endif
/* Defaults: act as a filter from stdin to stdout. */
fIn = stdin;
fOut = stdout;
sIn = "(stdin)";
sOut = "(stdout)";
/* Evaluate the command-line options. */
/* NOTE(review): the loop header below looks truncated in this copy of the file; confirm against a pristine revision before changing it. */
for( i=1; i = 3) { usage(argv[0]); return 255; }
nArg++;
}
}
if( fOut == NULL )
opt.more = 0;
fprintf(stderr,"%s ---> %s\n",sIn,sOut);
/* A NULL stream pointer means the stream still has to be opened from the stored file name. */
if( fIn == NULL )
if( (fIn = fopen(sIn,"r")) == NULL ) {
fprintf(stderr,
#ifdef deutsch
"Fehler beim ™ffnen von %s rc=%d\n",sIn,errno);
#else
"Error opening %s rc=%d\n",sIn,errno);
#endif
return 1;
}
if( fOut == NULL )
if( (fOut = fopen(sOut,"w")) == NULL ) {
#ifdef deutsch
fprintf(stderr, "Fehler beim Erzeugen von %s rc=%d\n",sOut,errno);
#else
fprintf(stderr, "Error creating %s rc=%d\n",sOut,errno);
#endif
return 2;
}
/* Reset the converter state before the first character is processed. */
state = cNorm;
nStrich = 0;
nStartSpalte = 1;
nCharInLine = 0;
cLast = 0;
nRead = nWrite = 0;
fPre = 0;
/* Main loop: one call to doOneChar per input character until end of file. */
while( (rc=fgetc(fIn)) != EOF ) {
c = rc;
nRead++;
doOneChar( c );
} /*while*/
/* Trailer written to the output after the converted text. */
fprintf(fOut,"\n---------------------------------------------------------------------------\n" \
"Generated from %s \n" \
"by %s \n" \
"---------------------------------------------------------------------------\n"
, sIn, csVERS );
fclose(fIn);
fclose(fOut);
#ifdef deutsch
fprintf(stderr,"%ld Zeichen gelesen\n" \
"%ld Zeichen geschrieben\n\n", nRead, nWrite );
#else
fprintf(stderr,"%ld character read\n" \
"%ld character written\n\n", nRead, nWrite );
#endif
return 0;
}
/*
 * doOneChar - feed one input character through the HTML state machine.
 *
 * c: the character that was just read from the input.
 *
 * The action depends on the global variable state:
 *   cNorm      - an ampersand switches to cSond and a less-than sign to
 *                cInTag; every other character is whitespace-folded (unless
 *                fPre is set), wrapped at cnRAND and handed to writeFOut.
 *   cSond      - collects an entity in sSond until an ampersand, a blank, a
 *                semicolon or more than 7 characters arrive, then replaces
 *                it with its text.
 *   cInTag     - collects the tag text in sSond; at the end of the tag the
 *                name is upper-cased and mapped to replacement text that is
 *                written with writeFOut, then the state returns to cNorm.
 *   cInComment - returns to cNorm when a greater-than sign follows exactly
 *                two counted hyphens.
 *
 * Nothing is returned; all results go through writeFOut and the file-scope
 * state variables.
 */
void
doOneChar( char c )
{
char sHack[3];
bool fDo;
switch (state) {
case cNorm :
switch (c) {
case '&' : state = cSond; nSo = 0; break;
case '<' : state = cInTag; nIn = 0; break;
default : fDo = 1;
/* Outside PRE: tabs become blanks, and a newline becomes a blank unless the previous character already was one. */
if( !fPre ) {
if( c=='\t' ) c=' ';
if( c==10 ) c='\n';
if( c=='\n' ) {
if( cLast != ' ' )
c = ' ';
else
fDo = 0;
}
}
/* Past the wrap column a blank is replaced by a line break. */
if( (nCharInLine+nStartSpalte > cnRAND) && (c==' ') && !fPre ) {
writeFOut("\n"); /* wrap the line */
} else {
/* Drop a blank that follows a blank or a newline. */
if( (c==' ') && (cLast==' ') ) fDo = 0;
if( (c==' ') && (cLast=='\n') ) fDo = 0;
if( fDo ) {
sHack[0] = c;
sHack[1] = '\0';
writeFOut(sHack);
nCharInLine++;
}
}
cLast = c;
} /*switch*/
break; /* case cNorm */
case cSond :
nSo++;
/* First character after the ampersand: start with an empty entity buffer. */
if( nSo == 1 ) sSond[0] = '\0';
/* Terminator reached, or the entity has grown longer than 7 characters. */
if( (c == '&') || (c == ' ') || (c == ';') || (nSo>7) ) {
int i;
int fFound = 0;
/* Numeric reference: code 153 becomes (TM), any other code becomes that single character. */
if( sSond[0] =='#' ) {
i = atoi( &sSond[1] );
if( i==153 ) strcpy(sSond,"(TM)");
else {
sSond[0] = (char)i;
sSond[1] = '\0';
}
fFound = 1;
} else {
/* Named entities with a fixed replacement; all other names fall through to a loop over i. */
fFound = 1;
if( !strcmp(sSond, "lt" ) ) strcpy( sSond,"<");
else if( !strcmp(sSond, "gt" ) ) strcpy( sSond,">");
else if( !strcmp(sSond, "quot") ) strcpy( sSond,"\"");
else if( !strcmp(sSond, "amp" ) ) strcpy( sSond,"&");
else if( !strcmp(sSond, "nbsp") ) strcpy( sSond," ");
else if( !strcmp(sSond, "copy") ) strcpy( sSond,"(c)");
else if( !strcmp(sSond, "reg" ) ) strcpy( sSond,"(R)");
else {
fFound = 0;
/* NOTE(review): the next line looks truncated in this copy of the file - the rest of the entity lookup and the start of the cInTag branch seem to be missing; confirm against a pristine revision. */
for( i=0; i') || (nIn >= sizeof(sSond)) ) {
char *pc, sArgs[200];
pc = strstr(sSond," ");
if( pc != NULL ) {
*pc='\0';
strcpy(sArgs,++pc);
}
strupr(sSond);
if( !strcmp(sSond, "BR") ) strcpy(sSond,"\n");
else if( !strcmp(sSond, "P") ) strcpy(sSond,"\n\n");
else if( !strcmp(sSond, "LI") ) strcpy(sSond,"\n* ");
else if( !strcmp(sSond, "/UL") ) strcpy(sSond,"\n");
else if( !strcmp(sSond, "/OL") ) strcpy(sSond,"\n");
else if( !strcmp(sSond, "DL") ) strcpy(sSond,"\n");
else if( !strcmp(sSond, "/DL") ) { strcpy(sSond,"\n"); nStartSpalte = 0; }
else if( !strcmp(sSond, "DT") ) { strcpy(sSond,"\n* "); nStartSpalte = 0; }
else if( !strcmp(sSond, "DD") ) { strcpy(sSond,"\n "); nStartSpalte = 8; }
else if( !strcmp(sSond, "PRE")) { strcpy(sSond,"\n"); fPre = 1; }
else if( !strcmp(sSond, "/PRE")) { strcpy(sSond,"\n"); fPre = 0; }
else if( !strcmp(sSond, "TR") ) { strcpy(sSond,"\n"); nStartSpalte = 0;}
else if( !strcmp(sSond, "TD") ) { strcpy(sSond," "); }
else if( !strcmp(sSond, "/TABLE") ) { strcpy(sSond,"\n"); nStartSpalte = 0;}
else if( !strcmp(sSond, "A") ) {
/* href= analysieren */
/* test auf # */
strcpy(sRef,sArgs);
strcpy(sSond,"");
}
else if( !strcmp(sSond, "/A") ) {
/* suchen von sRef im Speicher */
/* nRef,sRef */
/* sprintf(sSond," [%d] ",nRef); */
sSond[0] = '\0';
nRef++;
sRef[0] = '\0';
}
else if( !strcmp(sSond, "HR" ) ) {
strcpy(sSond,"\n---------------------------------------------------------------------\n");
}
else if( !strcmp(sSond,"H1")
|| !strcmp(sSond,"H2")
|| !strcmp(sSond,"H3")
|| !strcmp(sSond,"H4")
|| !strcmp(sSond,"H5")
|| !strcmp(sSond,"H6") ) {
strcpy(sSond,"\n\n");
}
else if( ((sSond[0]=='/') && (sSond[1]=='H') && (sSond[2]>='0')&& (sSond[2]<='6'))
|| !strcmp(sSond,"/TITLE") ) {
/* Einen Strich unterm Titel */
strcpy(sSond,"\n\n");
nStrich = nCharInLine+1;
}
else if( !strcmp(sSond,"/TD") ){ int i;
sSond[0] = '\0';
for( i=1; i <= (nCharInLine % 10); i++)
strncat(sSond," ",1);
nStartSpalte = nCharInLine + strlen(sSond);
}
else {
/* strstr(sArgs,"ALT="); */
sSond[0] = '\0';
}
writeFOut(sSond);
state = cNorm;
} else {
sHack[0] = c; sHack[1] = '\0';
strcat(sSond,sHack);
}
break; /* InTag */
case cInComment:
if( (nHyp == 2) & (c=='>') ) {
state = cNorm;
} else {
if( c=='-' ) nHyp++;
else nHyp=0;
}
break;
} /*case*/
}
void
writeFOut( char *s)
{
char sSond[255];
/* Strich malen */
if( s[0] == '\n' ) {
nCharInLine = 0;
if( nStrich > 0 ) {
/* Eine Zeile mit Strichen erzeugen */
if( nStrich > 80 ) nStrich = 75;
{ int i=0; char* p = sSond;
for(;i0 ) {
int i;
for(i=0;i