/*----------------------------------------------------------------------*\
 |  Chew and Spit -- a program to ingest roughly-formatted metadata and  |
 |  attempt to populate a properly-structured parse tree, then output    |
 |  both parseable metadata and, separately, whatever could not be       |
 |  handled.                                                             |
 |                                                                       |
 |  Peter N. Schweitzer (U.S. Geological Survey, Reston, VA 20192)       |
\*----------------------------------------------------------------------*/

/* NOTE(review): the four system header names were missing from this    */
/* copy of the file; they are restored here from the library calls the  */
/* code makes (stdio, malloc/free, str*/mem*, is*) -- confirm against    */
/* the upstream source.                                                  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "config.h"
#include "local.h"
#include "item.h"
#include "keyword.h"
#include "actions.h"
#include "revision.h"

extern enum fgdc_keyword best_match (char *string, char **value);
extern void add_aliases (char *alias_file);
extern void write_text (char *name, struct item *p);

FILE *out = NULL;
static FILE *info,*leftover;
static int verbose = 0;
static char *prefix = NULL;
static int prefix_length = 0;

static void add_related_file (char *type, char *name);
static char *related_file (char *type);

static char *empty_string = "";

/*----------------------------------------------------------------------*\
 |  Snarf the whole file, and dole out one line at a time on request.
| \*----------------------------------------------------------------------*/ static char *buffer = NULL; static char *buffer_ptr = NULL; static char *buffer_end = NULL; static char *read_text_file (char *name) { long n; FILE *fp; char *dst,*end; long size; if (!name) return (NULL); dst = NULL; if (fp = fopen (name,"rb")) { if (fseek (fp,0L,SEEK_END) == 0) { size = 1L + ftell (fp); if (size > 1L) { if (dst = (char *) malloc (size)) { rewind (fp); n = fread (dst,1,size,fp); end = dst + n; *end = 0; } else fprintf (out,"Error: could not allocate space for file %s\n",name); } else fprintf (out,"Error: Input file is empty.\n"); } fclose (fp); } else fprintf (out,"Error: could not open input file %s\n",name); return (dst); } static char *next_line (void) { char *this_line; if (!buffer || !buffer_ptr || !buffer_end) return (NULL); if (buffer_ptr >= buffer_end) return (NULL); this_line = buffer_ptr; while (buffer_ptr < buffer_end) if (*buffer_ptr == '\r') { /* CR */ *buffer_ptr++ = 0; if (*buffer_ptr == '\n') *buffer_ptr++ = 0; /* CR LF */ break; } else if (*buffer_ptr == '\n') { /* LF */ *buffer_ptr++ = 0; break; } else buffer_ptr++; /* not EOL */ return (this_line); } /*----------------------------------------------------------------------*\ | Handle character encoding \*----------------------------------------------------------------------*/ static char character_encoding[256] = "ISO-8859-1"; void set_character_encoding (char *new_value) { if (strlen (new_value) < 256) strcpy (character_encoding,new_value); else { memcpy (character_encoding,new_value,255); character_encoding[255] = 0; } } char *get_character_encoding (void) { return (character_encoding); } /*----------------------------------------------------------------------*\ \*----------------------------------------------------------------------*/ struct item *chew_and_spit (char *input_file) { int line_number,n; char *line; char *s,*v; struct item *p,*q,*r; struct item *root = NULL,*current = NULL; enum 
fgdc_keyword key; enum fgdc_keyword *e; int found; /* Open the input file */ if (buffer = read_text_file (input_file)) { buffer_end = buffer + strlen (buffer); buffer_ptr = buffer; /* Create root node of tree */ p = allocate_item (1); p->key = WMetadata; p->d = text_of (p->key); p->original = 1; p->len = strlen (p->d); p->indent = 0; p->next = p->prev = p->child = NULL; root = p; current = root; /* Read lines from the input file */ line_number = 0; while (line = next_line()) { line_number++; s = line + strlen (line) - 1; while (s >= line && isspace(*s)) *s-- = 0; if (s >= line) { s = line; /* while (*s && !isalpha (*s)) s++; */ while (*s && (isspace (*s) || isdigit (*s) || *s == '.' || *s == '-')) s++; if (prefix) if (memcmp (prefix,s,prefix_length) == 0) { s += prefix_length; key = best_match (s,&v); } else { key = Wunknown; v = s; } else key = best_match (s,&v); } else key = Wblank; /*----------------------------------------------------------*\ | Uncomment the following line to show which element was | | recognized on each line. 
| fprintf (stderr,"%3d: %s\n",line_number,text_of(key)); \*----------------------------------------------------------*/ switch (key) { case Wnull: break; case Wblank: if (current->key == Wblank) break; /* collapse multiple blanks */ if (current->key == Wunknown) { p = insert_item_after (current); p->key = key; p->indent = current->indent; if (p->d = (char *) malloc (1)) { *p->d = 0; p->len = 0; p->original = 0; } if (verbose) fprintf (info,"%3d: blank line added to %s\n",line_number,text_of(current->parent->key)); current = p; } else if (element_list_of (current->key)) { fprintf (info,"%3d: blank line omitted\n",line_number); } else { p = add_child (current); p->key = key; p->indent = 1 + current->indent; if (p->d = (char *) malloc (1)) { *p->d = 0; p->len = 0; p->original = 0; } if (verbose) fprintf (info,"%3d: blank line added as child of %s\n",line_number,text_of(current->key)); } break; /* end of Wblank case */ case Wunknown: /*----------------------------------------------*\ | This line contains a scalar value. Put the | | entire line into a scalar element. | \*----------------------------------------------*/ if (current->key == Wunknown || current->key == Wblank) { /*------------------------------------------*\ | current is a scalar child; this line | | should be its next sibling. | \*------------------------------------------*/ p = insert_item_after (current); p->key = key; p->indent = current->indent; for (s=line; *s && isspace(*s); s++); n = 1 + strlen (s); if (p->d = (char *) malloc (n)) { strcpy (p->d,s); p->len = n; p->original = 0; } if (verbose) fprintf (info,"%3d: text added to %s\n",line_number,text_of(current->parent->key)); current = p; } else if (e = element_list_of (current->key)) { /*--------------------------------------*\ | current is a compound. It should not| | have scalar children. | | | | However, if current has only one | | child element and that child is a | | scalar, you can insert that element | | as a container for this text. 
| \*--------------------------------------*/ found = 0; for (n=0; e[n] != Wnull; n++); if (n == 1) { if (element_list_of (e[0]) == NULL) { /*----------------------------------*\ | Text is an orphan grandchild of | | current. Add the child that is | | parent of the text. | \*----------------------------------*/ p = add_child (current); p->key = e[0]; p->indent = 1 + current->indent; p->d = text_of (p->key); p->len = 1 + strlen (p->d); p->original = 0; if (verbose) fprintf (info,"%3d: %s added as a child of %s to hold text\n",line_number,text_of(p->key),text_of(current->key)); /*----------------------------------*\ | Add scalar element to hold the | | text. | \*----------------------------------*/ r = add_child (p); r->key = Wunknown; r->indent = 1 + p->indent; for (s=line; *s && isspace(*s); s++); n = 1 + strlen (s); if (r->d = (char *) malloc (n)) { strcpy (r->d,s); r->len = n; r->original = 0; } if (verbose) fprintf (info,"%3d: text added as child of %s\n",line_number,text_of(p->key)); current = r; found = 1; } } if (!found) { fprintf (leftover,"%3d: %s\n",line_number,line); if (verbose) fprintf (info,"%3d: text could not be placed\n",line_number); } } else { /*--------------------------------------*\ | current is a recognized element that | | contains scalar children. | \*--------------------------------------*/ p = add_child (current); p->key = key; p->indent = 1 + current->indent; for (s=line; *s && isspace(*s); s++); n = 1 + strlen (s); if (p->d = (char *) malloc (n)) { strcpy (p->d,s); p->len = n; p->original = 0; } if (verbose) fprintf (info,"%3d: text added as child of %s\n",line_number,text_of(current->key)); current = p; } break; /* end of Wunknown case */ default: /*----------------------------------------------*\ | This line contains a recognized element. | | If it contains text also, recognize that. 
| \*----------------------------------------------*/ found = 0; /*----------------------------------------------*\ | Go up the tree; at each node q, fit node as | | child, sibling, or orphan grandchild of q. | \*----------------------------------------------*/ for (q = current; q && !found; q=q->parent) { /*------------------------------------------*\ | If q is not compound, it cannot contain | | this element. | \*------------------------------------------*/ if (element_list_of(q->key)) { enum fgdc_keyword *e,*ee; /* Child of q? */ for (e = element_list_of(q->key); *e != Wnull; e++) if (*e == key) { p = add_child (q); p->key = key; p->indent = 1 + q->indent; p->d = text_of (p->key); p->len = 1 + strlen (p->d); p->original = 1; if (verbose) fprintf (info,"%3d: element %s made a child of %s\n",line_number,text_of(p->key),text_of(q->key)); current = p; found = 1; } /* Sibling of q? */ if (!found && q->parent && element_list_of (q->parent->key)) for (e = element_list_of(q->parent->key); *e != Wnull; e++) if (*e == key) { p = add_child (q->parent); p->key = key; p->indent = 1 + q->parent->indent; p->d = text_of (p->key); p->len = 1 + strlen (p->d); p->original = 1; if (verbose) fprintf (info,"%3d: element %s made a sibling of %s\n",line_number,text_of(p->key),text_of(q->key)); current = p; found = 1; } /* Orphan grandchild of q? 
*/ if (!found && q->parent) for (e = element_list_of(q->key); *e != Wnull && !found; e++) if (element_list_of(*e)) for (ee = element_list_of(*e); *ee != Wnull && !found; ee++) if (*ee == key) { /* add child for *e */ p = add_child (q); p->key = *e; p->indent = 1 + q->indent; p->d = text_of(*e); p->len = 1 + strlen (p->d); p->original = 0; /* add child of *e for *ee */ r = add_child (p); r->key = key; r->indent = 1 + p->indent; r->d = text_of (r->key); r->len = 1 + strlen (r->d); r->original = 1; if (verbose) fprintf (info,"%3d: element %s added as child of %s to contain %s\n", line_number,text_of(p->key),text_of(q->key),text_of(r->key)); current = r; found = 1; } } } /*----------------------------------------------*\ | If there is a nonempty text value on this | | line, and if the new element takes a scalar | | value, make it a child of the new element. | \*----------------------------------------------*/ if (found) { if (v) { while (*v && (isspace(*v) || *v == ':' || *v == '=')) v++; if (*v) { if (element_list_of (current->key)) { fprintf (leftover,"%3d: %s\n",line_number,v); if (verbose) fprintf (info,"%3d: text following %s could not be placed\n",line_number,text_of(current->key)); } else { p = add_child (current); p->key = Wunknown; p->indent = 1 + current->indent; n = 1 + strlen (v); if (p->d = (char *) malloc (n)) { strcpy (p->d,v); p->len = n; p->original = 0; if (verbose) fprintf (info,"%3d: text following element name made child of %s\n", line_number,text_of(current->key)); } } } } } else if (current->key == Wunknown || current->key == Wblank) { /*--------------------------------------*\ | If you couldn't find a place for it | | but you're in the middle of a scalar | | value, assume the line is part of | | the scalar value. 
| \*--------------------------------------*/ p = insert_item_after (current); p->key = Wunknown; p->indent = 1 + current->indent; for (s=line; *s && isspace(*s); s++); n = 1 + strlen (s); if (p->d = (char *) malloc (n)) { strcpy (p->d,s); p->len = n; p->original = 0; } if (verbose) fprintf (info,"%3d: element %s could not be placed, appending to %s\n", line_number,text_of(key),text_of(current->parent->key)); current = p; } else if (current->child && current->child->key == Wunknown) { /*--------------------------------------*\ | Do the same if current is not scalar | | but has a scalar child (this will | | occur if the scalar child begins on | | the same line as the element name). | \*--------------------------------------*/ p = add_child (current); p->key = Wunknown; p->indent = 1 + current->indent; for (s=line; *s && isspace(*s); s++); n = 1 + strlen (s); if (p->d = (char *) malloc (n)) { strcpy (p->d,s); p->len = n; p->original = 0; } if (verbose) fprintf (info,"%3d: element %s could not be placed, appending to %s\n", line_number,text_of(key),text_of(current->key)); current = p; } else { fprintf (leftover,"%3d: %s\n",line_number,line); if (verbose) fprintf (info,"%3d: element %s could not be placed\n", line_number,text_of(key)); } break; /* end of default case */ } /* end of switch (key) */ } free (buffer); buffer = NULL; buffer_ptr = NULL; buffer_end = NULL; } else { fprintf (stderr,"Error: could not open input file %s\n",input_file); return (NULL); } return (root); } static void fudge_attribute_domains (void) { enum fgdc_keyword *e; for (e = element_list_of (WEnumerated_Domain); *e != Wnull; e++) if (*e == WAttribute) *e = Wnull; for (e = element_list_of (WRange_Domain); *e != Wnull; e++) if (*e == WAttribute) *e = Wnull; } #ifdef NEED_STRDUP static char *strdup (char *string) { char *ptr = empty_string; if (string) if (*string) if (ptr = (char *) malloc (1 + strlen (string))) strcpy (ptr,string); return (ptr); } #endif int main (int argc, char *argv[]) { int 
i; char *config_file = NULL; char *input_file = NULL; char *ext_file = NULL; char *alias_file = NULL; char *output_file = NULL; char *info_file = NULL; char *leftover_file = NULL; char file_name [1024]; char input_file_name [1024]; struct item *root; void *opt; char *ext_list[8]; int ext_count = 0; char *s,*t; int spaces_are_bad = 1; char *language = NULL; char *profile_name = NULL; #ifdef _WIN32 spaces_are_bad = 0; #endif out = stderr; if (argc > 1) { for (i=1; i < argc; i++) if (strcmp (argv[i],"-v") == 0) verbose = 1; else if (strcmp (argv[i],"-i") == 0) { i++; if (i < argc) info_file = argv[i]; } else if (strcmp (argv[i],"-e") == 0) { i++; if (i < argc) leftover_file = argv[i]; } else if (strcmp (argv[i],"-o") == 0) { i++; if (i < argc) output_file = argv[i]; } else if (strcmp (argv[i],"-a") == 0) { i++; if (i < argc) alias_file = argv[i]; } else if (strcmp (argv[i],"-c") == 0) { i++; if (i < argc) config_file = argv[i]; } else if (strcmp (argv[i],"-l") == 0) { i++; if (i < argc) language = argv[i]; } else if (strcmp (argv[i],"-ext") == 0) { i++; if (i < argc && ext_count < 8) ext_list [ext_count++] = strdup (argv[i]); } else input_file = argv[i]; if (config_file) read_configuration (config_file); /*--------------------------------------------------------------*\ | Use requested language. Command-line argument overrides what | is specified in the config file. \*--------------------------------------------------------------*/ if (!language) if (opt = find_option (NULL,"input")) if (opt = find_option (opt,"language")) if (s = text_of_option (opt)) language = s; /*--------------------------------------------------------------*\ | Use requested profile. 
\*--------------------------------------------------------------*/ if (opt = find_option (NULL,"input")) if (opt = find_option (opt,"profile")) if (s = text_of_option (opt)) profile_name = s; use_element_names (language,profile_name); /* in keyword.c */ if (profile_name) use_profile (profile_name); /* in actions.c */ /*--------------------------------------------------------------*\ | Read local extensions | \*--------------------------------------------------------------*/ if (opt = find_option (NULL,"input")) if (opt = find_option (opt,"extensions")) { if (ext_file = text_of_option (opt)) read_local (ext_file); while (opt = find_next_option (opt,"extensions")) if (ext_file = text_of_option (opt)) read_local (ext_file); update_element_lists (); } if (opt = find_option (NULL,"input")) if (opt = find_option (opt,"prefix")) { prefix = text_of_option (opt); if (prefix) prefix_length = strlen (prefix); } fudge_attribute_domains(); if (input_file) { /*----------------------------------------------------------*\ | Clip standard file extension off the input file name | | Standard extensions are txt,sgml,sgm,xml,text,met,bin | \*----------------------------------------------------------*/ strcpy (input_file_name,input_file); s = input_file_name + strlen (input_file_name) - 1; while (s > input_file_name && *s != '.') s--; if (s > input_file_name && *s == '.') if (stricmp (s,".txt" ) == 0) *s = 0; else if (stricmp (s,".text") == 0) *s = 0; else if (stricmp (s,".met" ) == 0) *s = 0; else if (stricmp (s,".bin" ) == 0) *s = 0; else for (i=0; i < ext_count; i++) if (*ext_list[i] != '.') { if (stricmp (s+1,ext_list[i]) == 0) *s = 0; } else { if (stricmp (s,ext_list[i]) == 0) *s = 0; } /*----------------------------------------------------------*\ | If you didn't find a standard extension, look for a | | custom extension specified by the user in the config | | file under input:ext | \*----------------------------------------------------------*/ if (s > input_file_name && *s == 
'.') if (opt = find_option (NULL,"input")) if (opt = find_option (opt,"ext")) do { if (t = text_of_option (opt)) if (stricmp (s,t) == 0) { *s = 0; break; } else if (stricmp (s+1,t) == 0) { *s = 0; break; } } while (opt = find_next_option (opt,"ext")); /*----------------------------------------------------------*\ | Create output file names from input file stem or the | | name specified by the user on the command line. | \*----------------------------------------------------------*/ if (!info_file) if (opt = find_option (NULL,"output")) if (opt = find_option (opt,"info")) if (opt = find_option (opt,"file")) if (s = text_of_option (opt)) { sprintf (file_name,s,input_file_name); if (spaces_are_bad) for (s=file_name; *s; s++) if (isspace(*s)) *s = '_'; if (strcmp (file_name,input_file) != 0) info_file = file_name; else fprintf (stderr,"Warning: requested info file %s would overwrite the input file\n",file_name); } if (info_file) add_related_file ("info",info_file); if (!leftover_file) if (opt = find_option (NULL,"output")) if (opt = find_option (opt,"errors")) if (s = text_of_option (opt)) { sprintf (file_name,s,input_file_name); if (spaces_are_bad) for (s=file_name; *s; s++) if (isspace(*s)) *s = '_'; if (strcmp (file_name,input_file) != 0) leftover_file = file_name; else fprintf (stderr,"Warning: requested leftover file %s would overwrite the input file\n",file_name); } if (leftover_file) add_related_file ("leftover",leftover_file); if (!output_file) if (opt = find_option (NULL,"output")) if (opt = find_option (opt,"text")) if (opt = find_option (opt,"cns")) if (s = text_of_option (opt)) { sprintf (file_name,s,input_file_name); if (spaces_are_bad) for (s=file_name; *s; s++) if (isspace(*s)) *s = '_'; if (strcmp (file_name,input_file) != 0) output_file = file_name; else fprintf (stderr,"Warning: requested output file %s would overwrite the input file\n",file_name); } if (output_file) add_related_file ("output",output_file); } 
/*--------------------------------------------------------------*\ | The guts of the action \*--------------------------------------------------------------*/ if (info_file = related_file ("info")) if (info = fopen (info_file,"w")) verbose = 1; else info = stdout; else info = stdout; fprintf (info,"cns %s - Peter N. Schweitzer (U.S. Geological Survey)\n",revision.cns); if (leftover_file = related_file ("leftover")) if (leftover = fopen (leftover_file,"w")) ; else leftover = stderr; else leftover = stderr; add_aliases (alias_file); root = chew_and_spit (input_file); if (output_file = related_file ("output")) write_text (output_file,root); } else { fprintf (stderr,"Chew and Spit, %s - Peter N. Schweitzer (U.S. Geological Survey)\n",revision.cns); fprintf (stderr,"Usage: %s [-c config_file] [-l language] [-i info_file] [-a aliases] [-e leftovers] [-o output_file] input_file\n",argv[0]); } return (0); } /*----------------------------------------------------------------------*\ | Code to keep track of related files (input and output) in a simple | | database keyed to the function of the files. | | These are used to build hypertext links among the different formats | | that are created as output. 
| \*----------------------------------------------------------------------*/ struct file_index { char type[16]; char name[1024]; }; static struct file_index file_index[16]; static int file_index_length = 0; static void add_related_file (char *type, char *name) { if (file_index_length < 16) { char cwd[1024]; if (getcwd (cwd, 1024)) { char *s = cwd + strlen (cwd) - 1; if (*s != '/') { *s++ = '/'; *s = 0; } if (memcmp (name,cwd,strlen(cwd)) == 0) name += strlen (cwd); } if ((strlen (type) < 16) && (strlen (name) < 1024)) { strcpy (file_index[file_index_length].type,type); strcpy (file_index[file_index_length].name,name); file_index_length++; } } } static char *related_file (char *type) { int i; for (i=0; i < file_index_length; i++) if (stricmp (file_index[i].type,type) == 0) return (file_index[i].name); return (NULL); } static void dump_related_files (void) { int i; for (i=0; i < file_index_length; i++) { fprintf (out,"%d:\n",i); fprintf (out," Type: %s\n",file_index[i].type); fprintf (out," Name: %s\n",file_index[i].name); } } /*----------------------------------------------------------------------*\ \*----------------------------------------------------------------------*/