diff options
Diffstat (limited to 'doc/utils/four2perm.c')
-rw-r--r-- | doc/utils/four2perm.c | 140 |
1 files changed, 140 insertions, 0 deletions
diff --git a/doc/utils/four2perm.c b/doc/utils/four2perm.c new file mode 100644 index 000000000..5b575c1b5 --- /dev/null +++ b/doc/utils/four2perm.c @@ -0,0 +1,140 @@ +#include <ctype.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#define MAX_LINE 512 + +void die( char * ) ; + +char buffer[MAX_LINE+1] ; +char *prog_name ; + +void die( char *message ) +{ + fflush(stdout) ; + fprintf(stderr, "%s: %s\n", prog_name, message) ; + exit(1) ; +} + +int main(int argc, char* argv[]) +{ + int errors ; + prog_name = *argv ; + if( argc != 1 ) + die("pure filter, takes no arguments") ; + errors = 0 ; + while( fgets(buffer, MAX_LINE, stdin)) + errors += do_line(buffer) ; + exit(errors ? 1 : 0 ) ; +} + +int do_line(char *data) +{ + char *p, *q, *r, *end, *before, *after ; + // expecting two tab-separated fields + // point r to 2nd, null terminate 1st + for( r = data ; *r && *r != '\t' ; r++ ) + ; + if( *r != '\t' ) + return(1) ; + end = r++ ; + *end = '\0' ; + for( q = r ; *q ; q++ ) + if( *q == '\n' ) + *q = '\0' ; + if( !strlen(r) ) + return(1) ; + // within 1st, parse as space-separated + // p will point to current word, q past its end + // before & after point to rest of text + // spaces converted to nulls & back as req'd + before = "" ; + for( p = data ; p < end ; p = q + 1 ) { + if( p > data ) { + before = data ; + p[-1] = '\0' ; + } + // find end of word + for( q = p ; *q && *q != ' ' ; q++ ) + ; + if( q == end ) + after = "" ; + else if( q < end ) { + after = q + 1 ; + *q = '\0' ; + } + else assert(0) ; + print_line(before, p, after, r) ; + if( q < end ) + *q = ' ' ; + if( p > data ) + p[-1] = ' ' ; + } + return(0) ; +} + +// print formatted line for permuted index +// two tab-separated fields +// 1st is sort key +// 2nd is printable line +// pipe it through something like +// sort -F | awk -F '\t' '{print $2}' +// to get final output + +print_line( char *before, char *word, char *after, char *tag) +{ + int i , x, y, z ; +/* + printf("%s\t%s\t%s\t%s\n", before, word, after, tag) ; +*/ + if( list_word(word) ) + return ; + x = strlen(before) ; + y = strlen(word) ; + z = strlen(after) ; + // put in sortable field + // strip out with awk after sorting + printf("%s %s\t", word, after) ; + // shorten before string to fit field + for( ; x > 30 ; x-- ) + before++ ; + printf("%30s", before) ; + // print keyword, html tagged + printf(" %s%s</a> ", tag, word) ; + // padding, outside tag + for( ; y < 18 ; y++ ) + putchar(' ') ; + if( z ) + printf("%s", after) ; + printf("\n") ; +} + +// avoid indexing on common English words + +char *list[] = { + "the", "of", "a", "an", "to", "and", "or", "if", "for", "at", + "am", "is", "are", "was", "were", "have", "has", "had", "be", "been", + "on", "some", "with", "any", "into", "as", "by", "in", "out", + "that", "then", "this", "that", "than", "these", "those", + "he", "his", "him", "she", "her", "hers", "it", "its", + "&", "", "+", "-", "=", "--", "<", ">", "<=", ">=", + "!", "?", "#", "$", "%", "/", "\\", "\"", "\'", + NULL + } ; +// interrogative words like "how" and "where" deliberately left out of +// above list because users might want to search for "how to..." etc. + +// return 1 if word in list, else 0 +// case-insensitive comparison + +list_word( char *p ) +{ + char **z ; + for( z = list ; *z != NULL ; z++ ) + if( ! strcasecmp( p, *z ) ) + return 1 ; + return 0 ; +} + |