diff options
Diffstat (limited to 'doc/utils/html2four.c')
-rw-r--r-- | doc/utils/html2four.c | 298 |
1 files changed, 298 insertions, 0 deletions
/*
    Recovered from a "new file" diff:
        diff --git a/doc/utils/html2four.c b/doc/utils/html2four.c
        new file mode 100644
        index 000000000..fc1100d01
        @@ -0,0 +1,298 @@
*/

/*
    extract headers from HTML files
    in format suitable for turning into permuted index

    For every <hN> ... </hN> header found, one line is written to stdout:
        file <TAB> label <TAB> HN <TAB> header text
    where label is the text following "name=" in the most recent
    <a name=...> tag (copied verbatim, quotes included).
*/

#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/*
    maximum sizes for input line and for name in <a> tag
*/
#define MAX_LINE 512
#define MAX_NAME 64

/*
    functions
    all return 0 for OK, 1 for errors
*/
int do_file( char *, FILE * ) ;
int parse_line( char * ) ;
int print_line( char *, char *) ;
int print_header_problem( char * ) ;
int sanity() ;      /* declared only; no definition in this file */

void die( char * ) ;

char *prog_name ;       /* argv[0], used as prefix of every diagnostic */
int max_level ;         /* set from the -N option; not consulted by the code below */
char *current_file ;    /* name of the file being processed, for output lines */

/*
    Entry point.

    usage: html2four [-N] [file ...]

    An optional first argument of the form -N (a single digit) sets
    max_level.  Each remaining argument is opened and passed to do_file();
    with no file arguments stdin is processed under the name "STDIN".

    Returns 0 if every file was processed without error, 1 otherwise.
*/
int main(int argc, char* argv[])
{
    char *p ;
    int status = 0 ;
    FILE *fp ;

    /* argc may legally be 0; do not step past the end of argv then */
    if (argc > 0) {
        prog_name = *argv ;
        argc--, argv++ ;
    }
    else
        prog_name = "html2four" ;

    max_level = 9 ;
    if (argc && *argv) {
        p = *argv ;
        if (p[0] == '-') {
            if (isdigit((unsigned char) p[1]) && p[2] == '\0') {
                /* convert the digit character to its numeric value */
                max_level = p[1] - '0' ;
                argc-- ;
                argv++ ;
            }
            else
                die("unknown option") ;
        }
    }

    if (argc == 0) {
        status = do_file("STDIN", stdin) ;
    }
    else {
        while (argc--) {
            p = *argv++ ;
            if (p == NULL) {
                fprintf(stderr, "%s: null filename pointer\n", prog_name) ;
                status++ ;
            }
            else if ((fp = fopen(p, "r")) == NULL) {
                fprintf(stderr, "%s: cannot open file %s\n", prog_name, p) ;
                status++ ;
            }
            else {
                if (do_file(p, fp) != 0)
                    status++ ;
                fclose(fp) ;
            }
            fflush(stderr) ;
            fflush(stdout) ;
        }
    }
    return status ? 1 : 0 ;
}

/*
    Print "prog_name: message" on stderr and exit with status 1.
    stdout is flushed first so pending output precedes the message.
*/
void die( char *message )
{
    fflush(stdout) ;
    fprintf(stderr, "%s: %s\n", prog_name, message) ;
    exit(1) ;
}

int header_flags[10] ;      /* per level 0-9: number of currently open <hN> tags */
int in_header ;             /* non-zero while between <hN> and </hN> */

char buffer[MAX_LINE+1] ;   /* current input line */
char label[MAX_NAME+1] ;    /* text after "name=" from the latest <a name=...> */

/*
    Process one already-open input stream.

    file  name used in output lines and diagnostics
    fp    stream to read; the caller opens and closes it

    Lines containing neither '<' nor '>' are skipped.  Lines with unequal
    counts of the two are reported on stderr (unless they start with
    "<!--" or "-->") and skipped.  All other lines go to parse_line().
    After each parsed line header_flags[] is checked: every entry must be
    0 or 1 and at most one entry may be non-zero.

    Returns 0 for OK, 1 if any problem was seen.
*/
int do_file( char *file, FILE *fp )
{
    int i, status, reported ;
    char *base, *p ;

    status = 0 ;
    reported = 0 ;
    in_header = 0 ;
    label[0] = '\0' ;
    for (i = 0 ; i < 10 ; i++)
        header_flags[i] = 0 ;
    current_file = file ;

    while ((base = fgets(buffer, sizeof buffer, fp)) != NULL) {
        int opens = 0, closes = 0 ;
        int active = 0, bad = 0 ;

        // count < and > characters in line
        for (p = base ; *p ; p++) {
            if (*p == '<')
                opens++ ;
            else if (*p == '>')
                closes++ ;
        }
        // skip line if no < or >
        if (opens == 0 && closes == 0)
            continue ;
        // report error for unequal count, except on comment delimiters
        if (opens != closes) {
            if (strncmp(base, "<!--", 4) && strncmp(base, "-->", 3)) {
                fflush(stdout) ;
                fprintf(stderr, "%s in file %s: unequal < > counts %d %d\n",
                    prog_name, file, opens, closes) ;
                fprintf(stderr, "%s: %s\n", prog_name, base) ;
                fflush(stderr) ;
                status = 1 ;
            }
            continue ;
        }
        // parse lines containing tags
        if (parse_line(base))
            status = 1 ;
        // check that header labelling is sane
        for (i = 0 ; i < 10 ; i++) {
            int flag = header_flags[i] ;
            // count non-zero entries
            if (flag)
                active++ ;
            // each level should be open 0 or 1 times
            if (flag > 1 || flag < 0)
                bad = 1 ;
        }
        // should be in 0 or 1 headers at a time
        if (active > 1)
            bad = 1 ;
        if (bad) {
            status = 1 ;
            // say why the file is failing, but only once per file
            if (!reported) {
                print_header_problem(file) ;
                reported = 1 ;
            }
        }
    }
    return status ;
}

/*
    Return 1 if tag begins with prefix, comparing letters without regard
    to case.  prefix must be given in lower case.
*/
static int tag_starts_with( const char *tag, const char *prefix )
{
    for ( ; *prefix ; tag++, prefix++)
        if (tolower((unsigned char) *tag) != *prefix)
            return 0 ;
    return 1 ;
}

/*
    If s is exactly 'h' or 'H' followed by one digit, return the digit's
    value (0-9); otherwise return -1.
*/
static int header_level( const char *s )
{
    if ((s[0] == 'h' || s[0] == 'H') &&
        isdigit((unsigned char) s[1]) && s[2] == '\0')
        return s[1] - '0' ;
    return -1 ;
}

/*
    Scan one input line for tags.

    data  the line; it is modified in place (trailing CR/LF removed and
          the closing delimiter of each processed tag overwritten by '\0')

    Tags acted on:
      <a name=...>  text after "name=" is saved in label
      <br>          inside a header, prints a space
      <hN>          starts a header: prints "file\tlabel\tHN\t"
      </hN>         ends a header: prints a newline
    While in_header is set, characters outside tags are echoed to stdout,
    non-printing ones as a space.  Header tags carrying attributes are not
    recognised because the tag must be exactly two characters long.

    Always returns 0.
*/
int parse_line( char *data )
{
    char *p, *q, *end ;
    int x ;

    // trim off trailing returns or newlines
    end = data + strlen(data) ;
    while (end > data && (end[-1] == '\012' || end[-1] == '\015'))
        *--end = '\0' ;

    p = data ;
    while (p < end) {
        // text outside any tag
        if (*p != '<') {
            if (in_header) {
                if (isprint((unsigned char) *p) && *p != '\n')
                    putchar(*p) ;
                else
                    putchar(' ') ;
            }
            p++ ;
            continue ;
        }
        // find tag delimiters
        for (q = p + 1 ; *q ; q++)
            if (*q == '<' || *q == '>')
                break ;
        // if we find another '<'
        // restart tag search from it
        if (*q == '<') {
            p = q ;
            continue ;
        }
        // "<>" is not interesting
        if (q == p + 1) {
            fflush(stdout) ;
            fprintf(stderr, "%s: null tag\n", prog_name) ;
            // data may already be cut short at an earlier tag's terminator
            fprintf(stderr, "%s: line %s\n", prog_name, data) ;
            fflush(stderr) ;
            p = q + 1 ;
            continue ;
        }
        // ignore delimiters once found
        *q = '\0' ;
        p++ ;
        // p points to tag contents, null terminated
        if (tag_starts_with(p, "a name=")) {
            // save contents of <a name= > tags, truncated to MAX_NAME
            snprintf(label, sizeof label, "%s", p + 7) ;
        }
        else if (tag_starts_with(p, "br") && p[2] == '\0') {
            if (in_header)
                putchar(' ') ;
        }
        else if ((x = header_level(p)) >= 0) {
            // header tags
            if (in_header)
                fprintf(stderr, "%s: bad header nesting in %s\n",
                    prog_name, current_file) ;
            in_header = 1 ;
            header_flags[x]++ ;
            printf("%s\t%s\tH%d\t", current_file, label, x) ;
        }
        else if (p[0] == '/' && (x = header_level(p + 1)) >= 0) {
            // only care about end-of-header among closing tags
            if (!in_header)
                fprintf(stderr, "%s: bad header nesting in %s\n",
                    prog_name, current_file) ;
            in_header = 0 ;
            header_flags[x]-- ;
            printf("\n") ;
        }
        // tag done (interesting or not), point p beyond it
        p = q + 1 ;
    }
    return 0 ;
}

/*
    Print one complete output line:
        current_file <TAB> label <TAB> tag <TAB> text <TAB> newline
    Always returns 0.
*/
int print_line( char *tag, char *text)
{
    printf("%s\t%s\t%s\t%s\t\n", current_file, label, tag, text) ;
    return 0 ;
}

/*
    Report a header-tag inconsistency in file on stderr, followed by the
    ten header_flags[] counters (levels 0 through 9).
    Always returns 0.
*/
int print_header_problem( char *file )
{
    int i ;
    fflush(stdout) ;
    fprintf(stderr, "%s: HEADER TAG PROBLEM in file %s\n", prog_name, file) ;
    fprintf(stderr, "%s: counts", prog_name) ;
    for (i = 0 ; i < 10 ; i++)
        fprintf(stderr, "\t%d", header_flags[i]) ;
    fprintf(stderr, "\n") ;
    fflush(stderr) ;
    return 0 ;
}