/*
extract headers from HTML files
in format suitable for turning into permuted index
*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
maximum sizes for input line and for name in tag

MAX_LINE bounds how much of an input line is read at a time;
MAX_NAME bounds how much of an <a name=...> value is kept.
The corresponding arrays are declared one byte larger than these values.
*/
#define MAX_LINE 512
#define MAX_NAME 64
/*
functions
all return 0 for OK, 1 for errors
(as written, parse_line, print_line and print_header_problem
always return 0)
*/
// process one already-open input stream; first argument is the name used in output
int do_file( char *, FILE * ) ;
// scan one input line for tags and emit header text
int parse_line( char * ) ;
// print one index record for a tag and its text
int print_line( char *, char *) ;
// report a header tag problem for the named file on stderr
int print_header_problem( char * ) ;
// NOTE(review): no definition of sanity() is visible in this file
int sanity() ;
// print a message on stderr and exit with status 1; does not return
void die( char * ) ;
// program name as invoked (argv[0]), used as prefix on diagnostics
char *prog_name ;
// set from a "-N" option in main(), default 9; no other use is visible here
int max_level ;
// name of the file being processed, set by do_file()
char *current_file ;
/*
program entry point

usage: prog [-N] [file ...]

An optional first argument of the form "-N", N a single decimal digit,
sets max_level to N (default 9). Any other first argument beginning
with '-' is fatal ("unknown option").

With no file arguments, standard input is processed under the name
"STDIN". Otherwise each named file is opened and processed in turn;
a file that cannot be opened is reported and counted as an error, and
processing continues with the next file.

Returns 0 if every input was processed without error, 1 otherwise.
*/
int main(int argc, char* argv[])
{
    char *p ;
    int status ;
    FILE *fp ;

    prog_name = *argv ;
    argc--, argv++ ;

    max_level = 9 ;
    if( argc > 0 && *argv != NULL ) {
        p = *argv ;
        if( p[0] == '-' ) {
            // cast: <ctype.h> functions need a value in unsigned char range
            if( isdigit((unsigned char) p[1]) && p[2] == '\0' ) {
                // convert the digit character to its numeric value
                max_level = p[1] - '0' ;
                argc-- ;
                argv++ ;
            }
            else
                die("unknown option") ;
        }
    }

    status = 0 ;
    if( argc == 0 ) {
        status = do_file("STDIN", stdin) ;
    }
    else {
        while( argc-- > 0 ) {
            p = *argv++ ;
            if( p == NULL ) {
                fprintf(stderr, "%s: null filename pointer\n", prog_name) ;
                status++ ;
            }
            else if( (fp = fopen(p, "r")) == NULL ) {
                fprintf(stderr, "%s: cannot open file %s\n", prog_name, p) ;
                status++ ;
            }
            else {
                if( do_file(p, fp) != 0 )
                    status++ ;
                fclose(fp) ;
            }
            // keep diagnostics and output in step between files
            fflush(stderr) ;
            fflush(stdout) ;
        }
    }
    return( status ? 1 : 0 ) ;
}
/*
fatal error exit

Flushes pending standard output first so it is not lost or
interleaved after the diagnostic, writes "<prog_name>: <message>"
to stderr, then terminates the process with exit status 1.
Does not return.
*/
void die( char *message )
{
    const char *who = prog_name ;

    (void) fflush(stdout) ;
    (void) fprintf(stderr, "%s: %s\n", who, message) ;
    exit(1) ;
}
// per-level count of open headers, indexed by the digit N of <hN>;
// parse_line() increments on <hN> and decrements on </hN>,
// do_file() resets all entries at the start of each file
int header_flags[10] ;
// non-zero while parse_line() is between an opening and a closing header tag
int in_header ;
// input line buffer filled by fgets() in do_file()
char buffer[MAX_LINE+1] ;
// text following "name=" in the most recent <a name=...> tag seen
char label[MAX_NAME+1] ;
/*
process one input stream

file    name recorded in current_file and used in output and diagnostics
fp      stream to read, already open; not closed here

Resets the per-file parser state (in_header, label, header_flags),
then reads the stream in pieces of at most MAX_LINE-1 characters.
For each piece:
  - with no '<' or '>' at all, it is skipped (even inside a header);
  - with unequal numbers of '<' and '>', it is reported on stderr and
    skipped, unless it starts with the HTML comment opener "<!--";
  - otherwise it is handed to parse_line(), after which header_flags is
    checked: at most one header level may be open, and no count may be
    outside 0..1.

Returns 0 if nothing went wrong, 1 if any line was reported or any
check failed.
*/
int do_file( char *file, FILE *fp )
{
    int i, status, opens, closes, count, active ;
    char *line, *p ;

    status = 0 ;
    in_header = 0 ;
    label[0] = '\0' ;
    for( i = 0 ; i < 10 ; i++ )
        header_flags[i] = 0 ;
    current_file = file ;

    while( (line = fgets(buffer, MAX_LINE, fp)) != NULL ) {
        // count < and > characters in line
        opens = closes = 0 ;
        for( p = line ; *p ; p++ ) {
            if( *p == '<' )
                opens++ ;
            else if( *p == '>' )
                closes++ ;
        }

        // skip line if no < or >
        if( opens == 0 && closes == 0 )
            continue ;

        // report error for unequal count
        if( opens != closes ) {
            // Lines opening an HTML comment are exempt. The literal here
            // was previously empty, which made the test always true;
            // "<!--" is assumed to be the intended prefix.
            if( strncmp(line, "<!--", 4) != 0 ) {
                fflush(stdout) ;
                fprintf(stderr, "%s in file %s: unequal < > counts %d %d\n",
                    prog_name, file, opens, closes ) ;
                fprintf(stderr, "%s: %s\n", prog_name, line) ;
                fflush(stderr) ;
                status = 1 ;
            }
            continue ;
        }

        // parse lines containing tags
        if( parse_line(line) )
            status = 1 ;

        // check that header labelling is sane
        active = 0 ;
        for( i = 0 ; i < 10 ; i++ ) {
            count = header_flags[i] ;
            // count non-zero entries
            if( count != 0 )
                active++ ;
            // each level should be open 0 or 1 times
            if( count > 1 || count < 0 )
                status = 1 ;
        }
        // should be in 0 or 1 headers at a time
        if( active > 1 )
            status = 1 ;
    }
    return status ;
}
/*
return N if s is exactly "hN" or "HN" with N a decimal digit, else -1
*/
static int header_level( const char *s )
{
    if( (s[0] == 'h' || s[0] == 'H') &&
        isdigit((unsigned char) s[1]) && s[2] == '\0' )
        return s[1] - '0' ;
    return -1 ;
}

/*
scan one line for tags and emit header text

data    NUL-terminated line; modified in place (trailing CR/LF removed,
        the '>' closing each examined tag overwritten with NUL)

Tags acted on:
  <a name=...>   text after "name=" is saved in label (at most MAX_NAME chars)
  <br>           inside a header, prints a single space
  <hN>           starts a header: prints "file<TAB>label<TAB>HN<TAB>"
  </hN>          ends a header: prints a newline
All other tags are ignored. While in_header is set, characters outside
tags are copied to stdout, non-printable ones as spaces.

A '<' followed by another '<' before any '>' is dropped and scanning
restarts at the second '<'. A '<' with no closing '>' on the line is
treated as a tag running to the end of the line. "<>" is reported on
stderr as a null tag. Bad header nesting is reported on stderr but
processing continues.

Always returns 0.
*/
int parse_line( char *data )
{
    char *p, *q, *end, *next ;
    int level ;

    // trim off trailing returns or newlines; end is left on the terminator
    end = data + strlen(data) ;
    while( end > data && (end[-1] == '\012' || end[-1] == '\015') )
        *--end = '\0' ;

    p = data ;
    while( p < end ) {
        // text outside tags: copy it only while inside a header
        if( *p != '<' ) {
            if( in_header ) {
                // cast: <ctype.h> functions are undefined for negative char values
                if( isprint((unsigned char) *p) && *p != '\n' )
                    putchar(*p) ;
                else
                    putchar(' ') ;
            }
            p++ ;
            continue ;
        }

        // find tag delimiters
        for( q = p + 1 ; *q ; q++ )
            if( *q == '<' || *q == '>' )
                break ;

        // if we find another '<'
        // restart tag search from it
        if( *q == '<' ) {
            p = q ;
            continue ;
        }

        // where scanning resumes; never step past the terminator
        next = ( q < end ) ? q + 1 : end ;

        // "<>" is not interesting
        if( q == p + 1 ) {
            fflush(stdout) ;
            fprintf(stderr, "%s: null tag\n", prog_name) ;
            fprintf(stderr, "%s: line %s\n", prog_name, data) ;
            fflush(stderr) ;
            p = next ;
            continue ;
        }

        // ignore delimiters once found
        *q = '\0' ;
        p++ ;

        // p points to tag contents, null terminated
        switch( *p ) {
        // save contents of <a name=...> tags
        case 'a' :
        case 'A' :
            // && short-circuits, so nothing past the terminator is read
            if( p[1] == ' ' &&
                (p[2] == 'n' || p[2] == 'N') &&
                (p[3] == 'a' || p[3] == 'A') &&
                (p[4] == 'm' || p[4] == 'M') &&
                (p[5] == 'e' || p[5] == 'E') &&
                p[6] == '=' ) {
                strncpy(label, p + 7, MAX_NAME) ;
                // strncpy does not terminate when the source is too long
                label[MAX_NAME] = '\0' ;
            }
            break ;
        // <br> inside a header becomes a space
        case 'b' :
        case 'B' :
            if( in_header &&
                (p[1] == 'r' || p[1] == 'R') && p[2] == '\0' )
                putchar(' ') ;
            break ;
        // header tags
        case 'h' :
        case 'H' :
            level = header_level(p) ;
            if( level >= 0 ) {
                if( in_header )
                    fprintf(stderr, "%s: bad header nesting in %s\n",
                        prog_name, current_file) ;
                in_header = 1 ;
                header_flags[level]++ ;
                printf("%s\t%s\tH%d\t", current_file, label, level) ;
            }
            break ;
        // closing tags: only care about end-of-header
        case '/' :
            level = header_level(p + 1) ;
            if( level >= 0 ) {
                if( ! in_header )
                    fprintf(stderr, "%s: bad header nesting in %s\n",
                        prog_name, current_file) ;
                in_header = 0 ;
                header_flags[level]-- ;
                printf("\n") ;
            }
            break ;
        // uninteresting tag, look for next
        default :
            break ;
        }

        // tag done, point p beyond it
        p = next ;
    }
    return(0) ;
}
/*
print one index record on stdout

tag     tag name for the record
text    text associated with the tag

Output is four tab-separated fields followed by a trailing tab and a
newline: current file, current label, tag, text.
The format string previously emitted a literal "%s" and "s" and
dropped tag and text; it now has one conversion per argument.

Always returns 0.
*/
int print_line( char *tag, char *text)
{
    printf("%s\t%s\t%s\t%s\t\n", current_file, label, tag, text) ;
    return 0 ;
}
int print_header_problem( char *file )
{
int i ;
fflush(stdout) ;
fprintf(stderr, "%s: HEADER TAG PROBLEM in file %s\n", prog_name, file) ;
fprintf(stderr, "%s: counts", prog_name) ;
for ( i = 0 ; i < 10 ; i++ )
fprintf(stderr, "\t%d", i) ;
fprintf(stderr,"\n") ;
fflush(stderr) ;
return(0) ;
}