/* extract headers from HTML files in format suitable for turning into permuted index */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* maximum sizes for input line and for name in tag */
#define MAX_LINE 512
#define MAX_NAME 64

/* number of header levels tracked, i.e. <h0> .. <h9> */
#define HEADER_LEVELS 10

/* functions all return 0 for OK, 1 for errors */
int do_file( char *, FILE * ) ;
int parse_line( char * ) ;
int print_line( char *, char *) ;
int print_header_problem( char * ) ;
int sanity() ;
void die( char * ) ;

char *prog_name ;       /* argv[0]; prefix of every diagnostic message */
int max_level ;         /* value of the -<digit> option (default 9); only assigned in this file */
char *current_file ;    /* name of the file do_file() is currently processing */

/*
 * Usage: prog [-<digit>] [file ...]
 *
 * Each named file is run through do_file(); with no file arguments,
 * standard input is processed under the name "STDIN".
 *
 * Returns 1 if any file could not be opened or reported an error, else 0.
 */
int main(int argc, char *argv[])
{
    int status = 0;

    /* Some environments start a program with an empty argv; never hand a
     * NULL pointer to the "%s" conversions used by the diagnostics. */
    prog_name = (argc > 0 && argv[0] != NULL) ? argv[0] : "?";
    if (argc > 0) {
        argc--;
        argv++;
    }

    /* optional single-digit option, e.g. -3 */
    max_level = 9;
    if (argc > 0 && argv[0] != NULL && argv[0][0] == '-') {
        const char *opt = argv[0];

        if (isdigit((unsigned char)opt[1]) && opt[2] == '\0') {
            /* convert the digit character to its numeric value */
            max_level = opt[1] - '0';
            argc--;
            argv++;
        } else {
            die("unknown option");
        }
    }

    if (argc == 0) {
        status = do_file("STDIN", stdin);
    } else {
        while (argc-- > 0) {
            char *name = *argv++;
            FILE *fp;

            if (name == NULL) {
                fprintf(stderr, "%s: null filename pointer\n", prog_name);
                status++;
            } else if ((fp = fopen(name, "r")) == NULL) {
                fprintf(stderr, "%s: cannot open file %s\n", prog_name, name);
                status++;
            } else {
                if (do_file(name, fp) != 0)
                    status++;
                fclose(fp);
            }
            /* keep per-file output and diagnostics together */
            fflush(stderr);
            fflush(stdout);
        }
    }
    return status ? 1 : 0;
}

/* Print "<prog_name>: <message>" on stderr and exit with status 1. */
void die( char *message )
{
    fflush(stdout);
    fprintf(stderr, "%s: %s\n", prog_name, message);
    exit(1);
}

/* header_flags[n] is incremented on <hN> and decremented on </hN> */
int header_flags[HEADER_LEVELS] ;
/* non-zero while between a header start tag and a header end tag */
int in_header ;
/* current input line */
char buffer[MAX_LINE+1] ;
/* text following "name=" in the most recent <a name=...> tag */
char label[MAX_NAME+1] ;

/*
 * Process one open input stream.
 *
 * file  name used in output records and diagnostics
 * fp    stream to read; the caller keeps ownership and closes it
 *
 * Input is read in chunks of at most MAX_LINE-1 characters.  Chunks that
 * contain neither '<' nor '>' are skipped without being parsed.  Chunks
 * with balanced counts are handed to parse_line(), after which the header
 * bookkeeping in header_flags[] is checked.
 *
 * Returns 0 if no problem was found, 1 otherwise.
 */
int do_file( char *file, FILE *fp )
{
    int status = 0;
    int i;
    char *base;

    /* reset per-file state */
    in_header = 0;
    label[0] = '\0';
    for (i = 0; i < HEADER_LEVELS; i++)
        header_flags[i] = 0;
    current_file = file;

    while ((base = fgets(buffer, MAX_LINE, fp)) != NULL) {
        int opens = 0, closes = 0, active = 0;
        const char *p;

        /* count < and > characters in line */
        for (p = base; *p != '\0'; p++) {
            if (*p == '<')
                opens++;
            else if (*p == '>')
                closes++;
        }

        /* skip line if no < or > */
        if (opens == 0 && closes == 0)
            continue;

        /* unequal counts: the line is not parsed */
        if (opens != closes) {
            /* A line that opens ("<!--") or closes ("-->") an HTML comment
             * is legitimately unbalanced, so it is not reported.
             * NOTE(review): both prefixes are a reconstruction -- the
             * literal in the inherited source was empty; confirm. */
            if (strncmp(base, "<!--", 3) != 0 && strncmp(base, "-->", 3) != 0) {
                fflush(stdout);
                fprintf(stderr, "%s in file %s: unequal < > counts %d %d\n",
                        prog_name, file, opens, closes);
                fprintf(stderr, "%s: %s\n", prog_name, base);
                fflush(stderr);
                status = 1;
            }
            continue;
        }

        /* parse lines containing tags */
        if (parse_line(base))
            status = 1;

        /* check that header labelling is sane: at most one header level
         * may be open, and each counter must be 0 or 1 */
        for (i = 0; i < HEADER_LEVELS; i++) {
            int count = header_flags[i];

            if (count != 0)
                active++;
            if (count > 1 || count < 0)
                status = 1;
        }
        if (active > 1)
            status = 1;
    }
    return status;
}

/* Return 1 if tag starts with prefix, ignoring case in tag.
 * prefix must be given in lower case. */
static int has_prefix_nocase(const char *tag, const char *prefix)
{
    for (; *prefix != '\0'; tag++, prefix++) {
        if (tolower((unsigned char)*tag) != *prefix)
            return 0;
    }
    return 1;
}

/* Return the level 0-9 if tag is exactly 'h' or 'H' followed by one digit,
 * otherwise -1. */
static int header_level(const char *tag)
{
    if ((tag[0] == 'h' || tag[0] == 'H')
        && isdigit((unsigned char)tag[1]) && tag[2] == '\0')
        return tag[1] - '0';
    return -1;
}

/*
 * Scan one line for tags and write header text to stdout.
 *
 * data  NUL-terminated line; MODIFIED in place (trailing CR/LF removed and
 *       the closing '>' of each processed tag overwritten with NUL)
 *
 * Tags acted on:
 *   <a name=...>  text after "name=" is saved in label (at most MAX_NAME chars)
 *   <hN>          starts an output record: file, label, "H<N>", tab-separated
 *   </hN>         ends the record with a newline
 *   <br>          written as a space while inside a header
 * While inside a header, text outside tags is copied to stdout with
 * non-printable characters replaced by spaces.
 *
 * Returns 1 if a problem was reported on stderr (null tag or bad header
 * nesting), 0 otherwise.
 */
int parse_line( char *data )
{
    int status = 0;
    size_t len = strlen(data);
    char *end;
    char *p;

    /* trim off trailing returns or newlines */
    while (len > 0 && (data[len - 1] == '\012' || data[len - 1] == '\015'))
        data[--len] = '\0';
    end = data + len;

    p = data;
    while (p < end) {
        char *q;
        int level;

        if (*p != '<') {
            /* ordinary text: only echoed while inside a header */
            if (in_header) {
                unsigned char c = (unsigned char)*p;

                if (isprint(c) && c != '\n')
                    putchar(c);
                else
                    putchar(' ');
            }
            p++;
            continue;
        }

        /* find the end of the tag, or the start of another one */
        for (q = p + 1; *q != '\0' && *q != '<' && *q != '>'; q++)
            ;

        /* if we find another '<' restart tag search from it */
        if (*q == '<') {
            p = q;
            continue;
        }

        /* "<>" is not interesting */
        if (q == p + 1) {
            fflush(stdout);
            fprintf(stderr, "%s: null tag\n", prog_name);
            fprintf(stderr, "%s: line: %s\n", prog_name, data);
            fflush(stderr);
            status = 1;
            p = q + 1;
            continue;
        }

        /* drop the delimiters: p now points to the NUL-terminated tag contents */
        *q = '\0';
        p++;

        switch (*p) {
        /* save contents of name anchors */
        case 'a':
        case 'A':
            if (has_prefix_nocase(p, "a name="))
                snprintf(label, sizeof label, "%s", p + 7);
            break;

        /* line break inside a header becomes a space */
        case 'b':
        case 'B':
            if (in_header && (p[1] == 'r' || p[1] == 'R') && p[2] == '\0')
                putchar(' ');
            break;

        /* header start tags */
        case 'h':
        case 'H':
            level = header_level(p);
            if (level >= 0) {
                if (in_header) {
                    fprintf(stderr, "%s: bad header nesting in %s\n",
                            prog_name, current_file);
                    status = 1;
                }
                in_header = 1;
                header_flags[level]++;
                printf("%s\t%s\tH%d\t", current_file, label, level);
            }
            break;

        /* of the end tags, only end-of-header matters */
        case '/':
            level = header_level(p + 1);
            if (level >= 0) {
                if (!in_header) {
                    fprintf(stderr, "%s: bad header nesting in %s\n",
                            prog_name, current_file);
                    status = 1;
                }
                in_header = 0;
                header_flags[level]--;
                printf("\n");
            }
            break;

        /* uninteresting tag, look for next */
        default:
            break;
        }

        /* tag done, point p beyond it */
        p = q + 1;
    }
    return status;
}

/*
 * Write one record to stdout: current_file, label, tag and text, each
 * followed by a tab, then a newline.  Always returns 0.
 */
int print_line( char *tag, char *text)
{
    printf("%s\t%s\t%s\t%s\t\n", current_file, label, tag, text);
    return 0;
}

/*
 * Report a header bookkeeping problem for file on stderr, followed by the
 * current value of every header_flags[] counter.  Always returns 0.
 */
int print_header_problem( char *file )
{
    int i;

    fflush(stdout);
    fprintf(stderr, "%s: HEADER TAG PROBLEM in file %s\n", prog_name, file);
    fprintf(stderr, "%s: counts", prog_name);
    for (i = 0; i < HEADER_LEVELS; i++)
        fprintf(stderr, "\t%d", header_flags[i]);
    fprintf(stderr, "\n");
    fflush(stderr);
    return 0;
}