/*
  truequotes: copy an HTML file, replacing straight quotes in text with
  typographic quotes.  Tags are passed through untouched, comment tags are
  removed, and lines starting with # are passed through (see cpp_check).

  Usage: truequotes INPUT OUTPUT
*/

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

// Any line that begins with # is passed through unchanged
#define SKIPCPPLINES

// Any line that begins with ## is stripped of ##, then passed through unchanged
#define SKIPESCAPEDLINES

void open_files (int argc, char *argv[], FILE **in, FILE **out);
void process (FILE *in, FILE *out);
void handle_tag (FILE *in, FILE *out, int *line);
void handle_comment (FILE *in, FILE *out, int *line, bool *in_bad_comment_tag);
int cpp_check (FILE *in, FILE *out);
void clean_up (FILE *in, FILE *out);

/*
  Line bookkeeping for a character that has just been consumed.  When ch is a
  newline, count it, then let cpp_check pass through any # lines that follow
  and count those too.  Taking the counter by pointer (rather than as a macro
  argument) lets the compiler reject a call that passes the wrong thing.
*/
static void nl_check (int ch, int *line, FILE *in, FILE *out)
{
  if (ch == '\n') {
    *line += 1;
    *line += cpp_check (in, out);
  }
}

// Define TRUEQUOTES_NO_MAIN to link these functions into a test harness.
#ifndef TRUEQUOTES_NO_MAIN
int main (int argc, char *argv[])
{
  FILE *in, *out;

  // open_files calls exit() on failure
  open_files (argc, argv, &in, &out);
  process (in, out);
  // clean_up calls exit() if either stream reported an error
  clean_up (in, out);
  return 0;
}
#endif

/*
  Validate the command line and open both files.

  argv[1] is opened for reading into *in, argv[2] for writing into *out.
  On any problem a message is printed and the program exits with status 1,
  so on return both streams are valid.
*/
void open_files (int argc, char *argv[], FILE **in, FILE **out)
{
  switch (argc) {
  case 1:
    printf ("%s: missing input and output filenames\n", argv[0]);
    exit (1);
  case 2:
    printf ("truequotes: missing output filename\n");
    exit (1);
  case 3:
    break;
  default:
    printf ("truequotes: %d too many arguments\n", argc - 3);
    exit (1);
  }

  *in = fopen (argv[1], "r");
  if (! *in) {
    printf ("truequotes: error opening input file %s\n", argv[1]);
    exit (1);
  }

  *out = fopen (argv[2], "w");
  if (! *out) {
    printf ("truequotes: error opening output file %s\n", argv[2]);
    exit (1);
  }
}

/*
  Comment tags get removed, because it makes it easier to prove correctness
  and the output files are not intended to be read or written by humans.

  Anything inside a tag is passed through without change.

  Microsoft Word works interactively.  It has a simple rule that a quote
  after whitespace is a left quote, otherwise it is a right quote.  Obviously,
  this is wrong.  ("'Tricky' isn't the word I'd use.")  But it seems to be a
  reasonable facsimile, and has the advantage that "bugs" earlier in the
  document don't cause problems later in the document.

  All HTML tags are treated as whitespace for this purpose.  The start of the
  file is treated as whitespace too, so a leading quote is a left quote.
*/
void process (FILE *in, FILE *out)
{
  // U+2018, U+2019, U+201C, U+201D spelled out as UTF-8 bytes so the output
  // does not depend on the encoding this source file is saved in.
  // NOTE(review): the original may have used HTML entities here -- confirm.
  static const char LSQ[] = "\xE2\x80\x98", RSQ[] = "\xE2\x80\x99";
  static const char LDQ[] = "\xE2\x80\x9C", RDQ[] = "\xE2\x80\x9D";
  bool prev_ws = true;
  int ch, line = 1;

  // Leading # lines are passed through and must be counted as well
  line += cpp_check (in, out);

  // Compare against EOF rather than calling feof(): getc also returns EOF on
  // a read error, which feof() would not report.
  while ((ch = getc (in)) != EOF) {
    switch (ch) {
    case '<':
      ungetc (ch, in);
      handle_tag (in, out, &line);
      prev_ws = true;
      break;
    case '\'':
      fputs (prev_ws ? LSQ : RSQ, out);
      prev_ws = false;
      break;
    case '"':
      fputs (prev_ws ? LDQ : RDQ, out);
      prev_ws = false;
      break;
    default:
      putc (ch, out);
      prev_ws = isspace (ch) != 0;
      nl_check (ch, &line, in, out);
      break;
    } // switch (ch)
  } // while not EOF
} // process (in, out)

/*
  There is a left-bracket ready to be read.  Consume the next tag in the
  file, if any, or read to end of file.

  A comment tag (one starting with <!) is dropped from the output; any other
  tag is copied to out unchanged.  *line is advanced for every newline
  consumed.
*/
void handle_tag (FILE *in, FILE *out, int *line)
{
  int tag_start_line = *line;
  int ch;

  // Eat "<"
  getc (in);

  ch = getc (in);
  if (ch == EOF) {
    printf ("truequotes: unterminated tag starting on line %d\n", tag_start_line);
    return;
  }

  if (ch == '!') {
    /*
      A comment tag begins with <! and ends with >.
      A comment begins with --, contains zero or more characters but does
      not contain --, and ends with --.
    */

    // Only report garbage once per comment tag
    bool in_bad_comment_tag = false;

    while (true) {
      ch = getc (in);
      if (ch == EOF) {
        printf ("truequotes: unterminated comment tag starting on line %d\n", tag_start_line);
        return;
      }
      switch (ch) {
      case '>':
        // End of comment tag
        return;
      case '-':
        ungetc (ch, in);
        handle_comment (in, out, line, &in_bad_comment_tag);
        break;
      default:
        if (! in_bad_comment_tag && ! isspace (ch)) {
          printf ("truequotes: garbage in comment tag on line %d\n", *line);
          in_bad_comment_tag = true;
        }
        nl_check (ch, line, in, out);
        break;
      } // switch ch
    } // while true
  } // if comment tag

  // Not a comment tag, pass it through.  The character after "<" is pushed
  // back so that it goes through the same quote / ">" handling as the rest
  // of the tag (otherwise "<>" would not be recognised as closed).
  bool in_dq = false;

  putc ('<', out);
  ungetc (ch, in);
  while (true) {
    ch = getc (in);
    if (ch == EOF) {
      printf ("truequotes: premature EOF at line %d\n", *line);
      return;
    }
    putc (ch, out);
    switch (ch) {
    case '"':
      in_dq = ! in_dq;
      break;
    case '>':
      // A ">" inside a double-quoted attribute value does not end the tag
      if (! in_dq)
        return;
      break;
    default:
      nl_check (ch, line, in, out);
      break;
    }
  } // while true
}

/*
  There is a "-" ready to be read inside a comment tag.  If it starts a
  comment (--), consume everything up to and including the closing --.
  Otherwise report garbage, at most once per comment tag as tracked by
  *in_bad_comment_tag.  *line is advanced for every newline consumed.
*/
void handle_comment (FILE *in, FILE *out, int *line, bool *in_bad_comment_tag)
{
  int comment_start_line;
  int ch;

  // Eat "-"
  getc (in);

  ch = getc (in);
  if (ch == EOF) {
    printf ("truequotes: bad comment on line %d\n", *line);
    return;
  }
  if (ch != '-') {
    if (! *in_bad_comment_tag)
      printf ("truequotes: garbage in comment tag on line %d\n", *line);
    *in_bad_comment_tag = true;
    // Hand the character back to handle_tag: it may be the ">" that closes
    // the tag, or a newline that still has to be counted.
    ungetc (ch, in);
    return;
  }

  // Look for a closing --
  comment_start_line = *line;
  while (true) {
    ch = getc (in);
    if (ch == EOF) {
      printf ("truequotes: comment starting line %d runs to EOF\n", comment_start_line);
      return;
    }
    nl_check (ch, line, in, out);
    if (ch == '-') {
      ch = getc (in);
      if (ch == '-')
        return;
      // Not a closing --.  An EOF here is reported on the next iteration.
      nl_check (ch, line, in, out);
    } // if ch == -
  } // while true
} // handle comment

/*
  Check for consecutive lines beginning with #, and pass them through.
  Return count of lines passed through.

  Must be called with the stream positioned at the start of a line.
*/
#ifndef SKIPCPPLINES
int cpp_check (FILE *in, FILE *out)
{
  (void) in;
  (void) out;
  return 0;
}
#else
int cpp_check (FILE *in, FILE *out)
{
  int skipped = 0;

  while (true) {
    int ch = getc (in);
    if (ch == EOF)
      break;
    if (ch != '#') {
      ungetc (ch, in);
      break;
    }

#ifdef SKIPESCAPEDLINES
    // Just read a #.  If the next one is also a #, then eat both.
    // Otherwise, write the # and put the next character back, so that a
    // line consisting of a lone # ends at its own newline.
    ch = getc (in);
    if (ch != '#') {
      putc ('#', out);
      if (ch == EOF)
        break;
      ungetc (ch, in);
    }
#else
    // Just read a #.  Write it out
    putc ('#', out);
#endif

    // Read the rest of this line, and write it to out
    while ((ch = getc (in)) != EOF) {
      putc (ch, out);
      if (ch == '\n')
        break;
    }
    skipped++;
  }

  return skipped;
}
#endif

/*
  Close both streams.  If either stream recorded an error, or closing the
  output fails (fclose flushes buffered data), print a message and exit with
  status 1 so that truncated output is not mistaken for success.
*/
void clean_up (FILE *in, FILE *out)
{
  bool read_failed = ferror (in) != 0;
  bool write_failed = ferror (out) != 0;

  fclose (in);
  if (fclose (out) != 0)
    write_failed = true;

  if (read_failed)
    printf ("truequotes: error reading input file\n");
  if (write_failed)
    printf ("truequotes: error writing output file\n");
  if (read_failed || write_failed)
    exit (1);
}