
/*
 *  From garlick@ecst.csuchico.edu Tue Apr  5 09:46:10 1994
 *  Article: 12169 of comp.infosystems.www
 *  From: garlick@ecst.csuchico.edu (Jim Garlick)
 *  Subject: convert nroff output to html
 *  Date: Mon, 4 Apr 94 19:14:46 GMT-1:00
 *  Organization: California State University, Chico
 *
 *  Here's a C program I wrote which converts the output of nroff to html by 
 *  mapping backspace/overstrikes to <B>, backspace/underbar to <I>, and
 *  several  overstruck chars to chars that are sane in html.
 *  It escapes the html special chars '<', '>', and '&'.
 *
 *  It's a filter.  To convert a troff document to html, run through nroff
 *  and pipe the output through man2html.
 *
 *  Awfully simple-minded, but the results aren't half bad!
 *
 *  Jim Garlick
 *  CSU Chico
 *
 *  --  Almost entirely rewritten by Bruce Korb  23 Sep 2002
 *
 * man2html.c -- convert nroff output to html
 *
 * Convert backspace/overstrikes to bold.
 * Convert backspace/underbar (either order) to italic.
 * Convert bar/backspace/dash to `+' (also handle bold rendering)
 * Convert bar/backspace/equals to `*' (also handle bold rendering)
 * Convert plus/backspace/o to `o'
 * Convert `<' to `&lt;',  `>' to `&gt;',  and `&' to `&amp;'
 * If -u specified, compress duplicate blank lines.
 *
 * $Id: man2html.c,v 1.3 2002/09/28 16:39:38 bkorb Exp $
 */

#include <sys/types.h>
#include <sys/stat.h>

#include <stdio.h>
#include <ctype.h>
#include <errno.h>
#include <regex.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>

#include "opts.h"

#ifndef NUL
#  define NUL '\0'
#endif

#ifndef EXIT_SUCCESS
#  define EXIT_SUCCESS 0
#endif

#ifndef EXIT_FAILURE
#  define EXIT_FAILURE 1
#endif

/*
 *  BUFSIZE is the size of the shared line buffer and of the per-line
 *  attribute array that parallels it.
 *
 *  Each input character has one attribute byte.  The attribute values are
 *  based on the space character so that they are never NUL:  the attribute
 *  array is scanned with strspn() and is NUL terminated, just like the
 *  text it describes.  The low two bits carry the rendering mode:
 *     bit 0 (value 1) -- bold
 *     bit 1 (value 2) -- italic
 */
#define BUFSIZE                 0x2000 /* 8KB */
#define CHAR_NORMAL             ' '
#define CHAR_BOLD               (' '|1)
#define CHAR_ITALIC             (' '|2)
#define CHAR_BOLD_ITALIC        (CHAR_BOLD | CHAR_ITALIC)
#define IS_MODE_BOLD(m)         ((m) & 1)
#define IS_MODE_ITALIC(m)       ((m) & 2)

/*
 *  Document prologue for the "page" form.  This is a printf format:
 *  the single %s is replaced with the page title.
 */
tSCC zHtml[] =
"<!DOCTYPE html PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n\
<html><head><title>%s</title>\n\
 <meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"/>\
 <meta name=\"author\"    content=\"Bruce Korb\"></meta>\
 <meta name=\"generator\" content=\"man2html " MAN2HTML_VERSION "\"></meta>\
</head>";

/*
 *  Document prologue for the "xhtml" form.  Also a printf format with
 *  one %s for the page title.
 */
tSCC zXhtml[] =
"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n\
<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\
     \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n\
<html><head><title>%s</title>\n\
 <meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\
 <meta name=\"author\"    content=\"Bruce Korb\" />\
 <meta name=\"generator\" content=\"man2html " MAN2HTML_VERSION "\" />\
</head>\n";

/*
 *  Fixed tag fragments.  zBody, zEndBody and zEndHtml are written by main().
 *  NOTE(review): zBold, zNotBold, zItalic and zNotItalic are not referenced
 *  by the code in this file; html() writes the equivalent literals itself.
 */
tSCC zBody[]      = "<body bgcolor=\"#ffffff\">";
tSCC zBold[]      = "<b>";
tSCC zNotBold[]   = "</b>";
tSCC zItalic[]    = "<em>";
tSCC zNotItalic[] = "</em>";
tSCC zEndBody[]   = "</body>\n";
tSCC zEndHtml[]   = "</body></html>\n";

/*
 *  Diagnostic messages written to stderr.  zNoPage, zFsErr, zNotFound and
 *  zHdrPat are printf formats:
 *     zFsErr    -- errno value, strerror() text, the name or command that
 *                  failed, and the source line (__LINE__) reporting it
 *     zNotFound -- the argument that is neither a file nor a man page
 *     zHdrPat   -- regex error code, regerror() text, and the pattern
 *  NOTE(review): zNoPage is not referenced by the code in this file.
 */
tSCC zArgCt[]    = "man2html ERROR:  too many arguments\n";
tSCC zNoTty[]    = "man2html ERROR:  input cannot be from a tty\n";
tSCC zNoPage[]   = "man2html ERROR:  there is no man page for %s\n";
tSCC zFsErr[]    = "man2html ERROR:  fserr %d (%s) opening %s on line %d\n";
tSCC zNotFound[] = "man2html ERROR:  ``%s'' is not a file and not a man page\n";
tSCC zHdrPat[]   = "man2html ERROR:  RE error %d (%s):\n"
                   "\t``%s'' is not a valid regex\n";

/*
 *  Title inserted into the zHtml/zXhtml prologue.  findInput() may replace
 *  this default with an allocated string that names the page.
 */
char* pzTitle    = "man2html generated man page";

/*
 *  Text that closes the <pre> block and adds a "converted by" credit line.
 *  main() writes it when the PEA option is not present.
 */
tSCC zPea[] = "\n\
</pre><center>This man page was converted to HTML by \
<a href=\"http://autogen.sourceforge.net/man2html.html\">\
man2html</a></center>";

/*
 *  Shared scratch buffer.  It holds, at different times, the current input
 *  line, regerror() message text, and the shell commands handed to popen().
 */
char  buf[ BUFSIZE ];

/*
 *  Compiled form of the HEADER option pattern.  main() compiles it only
 *  when that option is enabled; html() skips every line it matches.
 */
regex_t headerRe;


/*
 *  Forward declarations.
 *  NOTE(review): no definition of usage() appears in this file.
 */
FILE* findInput( tCC* pzFile );
void  usage( void );
int   html( char* s, int lineno );
void  trim( char* pz );
FILE* openInput( tCC* pzFile );
void  setAttributes( char* pzSrc, unsigned char* pAttr );

/*
 *  main -- filter nroff output into HTML on stdout.
 *
 *  Input comes from stdin when no operand is given (stdin must not be a
 *  tty), otherwise from whatever findInput() opens for the single operand.
 *  The output envelope depends on the FORM option value; the text itself is
 *  always wrapped in <pre> ... </pre>.
 *
 *  Returns EXIT_SUCCESS.  Failures are reported on stderr and terminate the
 *  program through exit() or USAGE().
 *
 *  NOTE(review): the input stream is never closed (fclose/pclose) before
 *  returning.
 */
int
main( int argc, char** argv )
{
    FILE* inFp = stdin;
    /*
     * Options.  The value returned by optionProcess() is used as the index
     * of the first operand, so "argc - ct" is the number of operands left.
     */
    {
        int ct = optionProcess( &man2htmlOptions, argc, argv );
        switch (argc - ct) {
        default:
            fputs( zArgCt, stderr );
            USAGE( EXIT_FAILURE );
            /*
             * NOTE(review): presumably USAGE() does not return; if it did,
             * control would fall into "case 0" below -- confirm.
             */

        case 0:
            /* no operand:  act as a filter, but refuse interactive input */
            if (isatty( STDIN_FILENO )) {
                fputs( zNoTty, stderr );
                USAGE( EXIT_FAILURE );
            }
            break;

        case 1:
            /* one operand:  a file name or the name of a man page */
            inFp = findInput( argv[ ct ]);
        }
    }

    /*
     *  Compile the header-line pattern once.  REG_NOSUB is used because
     *  only match/no-match is ever asked of it.
     */
    if (ENABLED_OPT( HEADER )) {
        int err = regcomp( &headerRe, OPT_ARG( HEADER ),
                           REG_EXTENDED | REG_NOSUB );
        if (err != 0) {
            regerror( err, &headerRe, buf, sizeof( buf ));
            fprintf( stderr, zHdrPat, err, buf, OPT_ARG( HEADER ));
            exit( EXIT_FAILURE );
        }
    }

    /*
     *  Open the tags for the form type we are creating.
     *  Always open a <pre>, the rest are conditional.
     *
     *  The cases cascade on purpose:
     *     XHTML -> XHTML prologue, then jumps to the BODY output
     *     PAGE  -> HTML prologue, then falls into BODY
     *     BODY  -> <body> tag, then falls into PRE
     *     PRE   -> the NAME heading and the opening <pre>
     */
    switch (OPT_VALUE_FORM) {
    case FORM_XHTML:
        printf( zXhtml, pzTitle );
        goto form_body;

    case FORM_PAGE:
        printf( zHtml, pzTitle );
        /* FALLTHROUGH */

    case FORM_BODY:
    form_body:
        fputs( zBody, stdout );
        /* FALLTHROUGH */

    case FORM_PRE:
        fputs( "<h2>NAME</h2><pre>", stdout );
    }

    /*
     * Process each line.  Count unresolved backspaces and issue
     * a warning message at the end.
     */
    {
        int bksp   = 0;
        int lineno = 0;

        /*
         *  An input line longer than the buffer is delivered by fgets()
         *  in several pieces; each piece is counted as a line.
         */
        while (fgets( buf, BUFSIZE, inFp )) {
            trim( buf );
            bksp += html(buf, ++lineno);
        }

        /*
         * Warn about any unresolved backspaces.
         */
        if (bksp > 0)
            fprintf(stderr, 
                    "man2html: warning: %d unresolved backspaces\n", bksp);

        /*
         *  Fewer than two input lines is treated as "no man page":  warn
         *  on stderr and put a note into the generated output as well.
         */
        if (lineno < 2) {
            fputs( "man2html: warning: no man page\n", stderr );
            fputs( "        No man page data were found.", stdout );
        }
    }

    /*
     * Close html envelopes, always starting with "</pre>".
     * The credit text in zPea is written when the PEA option is absent;
     * when it is present only the bare "</pre>" is written.
     */
    if (! HAVE_OPT( PEA ))
        fputs( zPea, stdout );
    else
        fputs( "</pre>", stdout );

    /*
     *  FORM_PRE opened nothing beyond the <pre>, so it has no case here.
     */
    switch (OPT_VALUE_FORM) {
    case FORM_BODY: fputs( zEndBody, stdout ); break;
    case FORM_XHTML:
    case FORM_PAGE: fputs( zEndHtml, stdout ); break;
    }

    return EXIT_SUCCESS;
}


/*
 *  setAttributes -- resolve nroff overstrike sequences in one line.
 *
 *  pzSrc   NUL terminated line; it is rewritten in place with the
 *          resolved backspace sequences removed.
 *  pAttr   array of BUFSIZE bytes that receives one attribute per
 *          character left in pzSrc (CHAR_NORMAL, CHAR_BOLD, CHAR_ITALIC
 *          or CHAR_BOLD_ITALIC), terminated with NUL at the same index
 *          as the text terminator.
 *
 *  A backspace that cannot be resolved is left in the text with a normal
 *  attribute so the caller can count and report it.  That includes a
 *  backspace in the first column (nothing precedes it) and a backspace at
 *  the very end of the line (nothing follows it).
 */
void
setAttributes( char* pzSrc, unsigned char* pAttr )
{
    char* const pzStart = pzSrc;
    char*       pzDest  = pzSrc;

    /* 
     * pass 1: set character attributes (and delete overstrikes)
     */
    memset( pAttr, CHAR_NORMAL, BUFSIZE-1 );
    pAttr[ BUFSIZE-1 ] = NUL;

    /*
     *  Some man programs put a space at the start of every line.
     *  Drop a single leading space; two or more are real indentation.
     */
    if ((pzSrc[0] == ' ') && (pzSrc[1] != ' '))
        pzSrc++;

    for ( ; ; pzDest++, pAttr++) {
        switch (*pzDest = *(pzSrc++)) {
        default:    continue;
        case NUL:   *pAttr = NUL; return;
        case '\b':  break;
        }

        /*
         *  A backspace needs a character on both sides.  Without one
         *  before it, pzDest[-1] and pAttr[-1] would lie outside the
         *  buffers.  Without one after it, the code below would step
         *  over the terminating NUL and keep scanning.  Leave such a
         *  backspace unresolved.
         */
        if ((pzDest == pzStart) || (*pzSrc == NUL))
            continue;

        /* detect a backspace/overstrike (bold <b>) */
        if (pzDest[-1] == *pzSrc) {
            pAttr[-1] |= CHAR_BOLD;

        /* detect an underbar/backspace (italic <em>) */
        } else if (pzDest[-1] == '_') {
            pAttr[-1] |= CHAR_ITALIC;
            pzDest[-1] = *pzSrc; /* move italicized char */

        /* detect a backspace/underbar (reverse of above) */
        } else if (*pzSrc == '_') {
            pAttr[-1] |= CHAR_ITALIC;

        /*
         *  Line drawing and bullet overstrikes.  Any pair not listed
         *  here is left alone, backspace included ("continue").
         */
        } else switch (pzDest[-1]) {
        case '+':
            switch (*pzSrc) {
            case '-':
            case '|':
                /* convert + & - and + & | become  `+' */
                pzDest[-1] = '+';
                break;

            case 'o':
                /* convert + & o becomes bold 'o' */
                pzDest[-1] = 'o';
                pAttr[-1] |= CHAR_BOLD;
                break;

            default: continue;
            }
            break;

        case '|':
            switch (*pzSrc) {
                /* | & = becomes '*' */
            case '=': pzDest[-1] = '*'; break;

                /* | & - becomes '+' */
            case '-': pzDest[-1] = '+'; break;
            default:  continue;
            }
            break;

        case '*':
            /* * & = and * & | becomes '*' */
            switch (*pzSrc) {
            case '=':
            case '|': pzDest[-1] = '*'; break;
            default:  continue;
            }
            break;

        case 'o':
            switch (*pzSrc) {
            case '+':
                /* o & + becomes bold 'o' */
                pzDest[-1] = 'o';
                pAttr[-1] |= CHAR_BOLD;
                break;

            default:  continue;
            }
            break;

        default:  continue;
        }

        /*
         *  The sequence was resolved into the character at pzDest[-1].
         *  Step back so the loop increment lands on the slot that holds
         *  the backspace, and skip the overstriking source character.
         */
        pzDest  -= 1;      /* omit the back space */
        pAttr   -= 1;
        pzSrc++;           /* advance past copied char */
    }
}


/*
 *  Write one character to stdout, replacing the three HTML special
 *  characters with their entity references.
 */
static void
putHtmlChar( int ch )
{
    switch (ch) {
    case '<': fputs( "&lt;",  stdout ); break;
    case '>': fputs( "&gt;",  stdout ); break;
    case '&': fputs( "&amp;", stdout ); break;
    default:  fputc( ch, stdout );
    }
}


/*
 * Given a line, print it out as html.
 * Return the number of unprocessed backspaces in this line.
 *
 *  p        the input line, already stripped of trailing white space.
 *           It is rewritten in place by setAttributes().
 *  lineno   input line number, used only in diagnostics.
 *
 *  Every line up to and including the first one that starts with "NAME"
 *  is dropped (main() has already written the NAME heading).  Lines that
 *  match the HEADER pattern are dropped too.  A line with text in the
 *  first column becomes an <h2> section heading.  Everything else is
 *  written with <b>/<em> markup derived from the overstrike attributes.
 *  Dropped lines, headings and blank lines return 0.
 */
int
html( char* p, int lineno )
{
    tSCC zBSWarn[] = "man2html: warning, \\b on line %d\n";

    static int name_seen = 0;
    static int last_line_was_blank = 1;

    int bksp = 0;
    int boldFirst = 0;  /* only meaningful while both modes are open */
    unsigned char charMode[2] = { CHAR_NORMAL, NUL };
    unsigned char buf2[BUFSIZE], *pMode = buf2;

    /* 
     * two bits in each element of buf2 indicate attributes of 
     * corresponding character in `p'.
     *    bit 1, when set, indicates bold
     *    bit 2, when set, indicates underscore
     */
    setAttributes( p, buf2 );

    if (! name_seen) {
        if (strncmp( p, "NAME", 4 ) == 0)
            name_seen = 1;
        return 0;
    }

    if (ENABLED_OPT( HEADER )) {
        int res = regexec( &headerRe, p, 0, NULL, 0 );
        switch (res) {
        case 0: return 0;
        case REG_NOMATCH: break;
        default:
            regerror( res, &headerRe, buf, sizeof( buf ));
            fprintf( stderr, zHdrPat, res, buf, OPT_ARG( HEADER ));
            exit( EXIT_FAILURE );
        }
    }

    /*
     *  IF the first character on the line is not white space,
     *  THEN it is either a blank line, a header line or a section name.
     *  The cast keeps isspace() defined for 8-bit characters when plain
     *  char is signed.
     */
    if (! isspace( (unsigned char)p[0] )) {
        if (*p != NUL) {
            /*
             *  The only other text starting in column 1 is a section name.
             *  Escape it like any other text so that a `<' or `&' in a
             *  heading cannot corrupt the markup.
             */
            char* pz;

            fputs( "\n</pre><h2>", stdout );
            for (pz = p; *pz != NUL; pz++)
                putHtmlChar( *pz );
            fputs( "</h2><pre>", stdout );
            last_line_was_blank = 0;
        }
        else {
            /*
             *  With the BLANK_LINES option, a run of blank lines is
             *  reduced to a single one.
             */
            if (HAVE_OPT( BLANK_LINES )) {
                if (last_line_was_blank)
                    return 0;
                last_line_was_blank = 1;
            }
            fputc( '\n', stdout );
        }
        return 0;
    }

    /*
     *  The input line starts with white space.  Start a new output line
     *  and copy the leading blanks through untouched; they never carry
     *  markup.
     */
    {
        size_t spct = strspn( p, " " );

        fputc( '\n', stdout );
        last_line_was_blank = 0;
        if (spct > 0) {
            fwrite( p, 1, spct, stdout );
            p += spct;
            pMode += spct;
        }
    }

    for (;;) {
        /*
         *  Write all the chars that are in the current mode.
         *  We always start in CHAR_NORMAL mode on every line.
         */
        size_t spanCt = strspn( (const char*)pMode, (const char*)charMode );

        pMode += spanCt;
        while (spanCt-- > 0) {
            switch (*p) {
            case NUL:
                /* the attribute array is longer than the text */
                goto Woops;

            case '\b':
                if (HAVE_OPT( BS_WARNING ))
                    fprintf( stderr, zBSWarn, lineno );
                bksp++;
                /* FALLTHROUGH */

            default:
                /* print the char, escaping the three html special chars */
                putHtmlChar( *p );
            }
            p++;
        }

        if (*p == NUL)
            break;

        /*
         *  Never change state for space characters.
         */
        if (*p == ' ') {
            fputc( ' ', stdout );
            p++;
            pMode++;
            continue;
        }

        /*
         *  We are going to change state.  The HTML we emit depends upon:
         *
         *  * The current mode
         *  * The next mode
         *  * Which was first:  bold or italic?
         */
        switch (charMode[0]) {
        case CHAR_NORMAL:
            /*
             *  Going into bold from normal:  bold is first, else not.
             */
            if (IS_MODE_BOLD( *pMode )) {
                fputs( "<b>", stdout );
                boldFirst = 1;
            }
            else {
                boldFirst = 0;
            }
            if (IS_MODE_ITALIC( *pMode ))
                fputs( "<em>", stdout );
            break;

        case CHAR_BOLD:
            /*
             *  IF we are switching from bold, bold isn't first anymore.
             */
            if (! IS_MODE_BOLD( *pMode )) {
                fputs( "</b>", stdout );
                boldFirst = 0;
            }

            if (IS_MODE_ITALIC( *pMode ))
                fputs( "<em>", stdout );
            break;

        case CHAR_ITALIC:
            if (! IS_MODE_ITALIC( *pMode )) {
                if (IS_MODE_BOLD( *pMode )) {
                    /*
                     *  an overstrike/underbar is ambiguous.  Remain in italic
                     *  when this happens.
                     */
                    if (*p == '_')
                        *pMode = CHAR_ITALIC;
                    else {
                        fputs( "</em><b>", stdout );
                        boldFirst = 1;
                    }
                }
                else
                    fputs( "</em>", stdout );
            }
            else {
                /*
                 *  We're staying in italic mode, but adding bold
                 */
                fputs( "<b>", stdout );
                boldFirst = 0;
            }
            break;

        case CHAR_BOLD_ITALIC:
            /*
             *  We're dropping one or both modes.  This is the only state where
             *  we must pay attention to which was first, obviously.
             */
            if (IS_MODE_BOLD( *pMode )) {
                /*
                 *  Drop italic, keep bold
                 */
                if (boldFirst)
                    fputs( "</em>", stdout);
                else {
                    fputs( "</b></em><b>", stdout );
                    boldFirst = 1;
                }
            }

            else if (IS_MODE_ITALIC( *pMode )) {
                /*
                 *  Drop bold, keep italic
                 */
                if (! boldFirst)
                    fputs( "</b>", stdout);
                else {
                    fputs( "</em></b><em>", stdout );
                    boldFirst = 0;
                }
            }

            /*
             *  Dropping both.  Close them in the right order
             */
            else if (boldFirst)
                fputs( "</em></b>", stdout );
            else
                fputs( "</b></em>", stdout );
        }

        /*
         *  Set the new mode & write out all chars in that mode.
         */
        charMode[0] = *pMode;
    }

    /*
     *  End of line:  close whatever is still open, innermost tag first.
     */
    switch (charMode[0]) {
    case CHAR_NORMAL:
        break;

    case CHAR_BOLD:
        fputs( "</b>", stdout );
        break;

    case CHAR_ITALIC:
        fputs( "</em>", stdout );
        break;

    case CHAR_BOLD_ITALIC:
        if (boldFirst)
            fputs( "</em></b>", stdout );
        else
            fputs( "</b></em>", stdout );
    }

    return bksp;

 Woops:
    fprintf( stderr, "Consistency error on line %d\n", lineno );
    exit( EXIT_FAILURE );
}

/*
 *  trim -- remove trailing white space (including the newline left by
 *  fgets) from a NUL terminated string, in place.
 *
 *  The character is converted to unsigned char before it is handed to
 *  isspace():  passing a negative value other than EOF is undefined, and
 *  8-bit text yields negative values wherever plain char is signed.
 */
void
trim( char *s )
{
    char* p = s + strlen( s );
    while ((p > s) && isspace( (unsigned char)p[-1] ))  p--;
    p[0] = '\0';
}


/*
 *  openInput -- open a regular file as the input stream.
 *
 *  pzFile   path of an existing file.  It must not point into `buf',
 *           which is used here as scratch space.
 *
 *  The file type is obtained by running file(1).  When the description
 *  mentions "compressed", the returned stream is a pipe from
 *  "gunzip -c";  otherwise stdin is reopened on the file.  Any failure
 *  is reported on stderr and terminates the program.
 *
 *  NOTE(review): the name is placed into the shell commands unquoted, so
 *  a name containing blanks or shell metacharacters will not work.
 */
FILE*
openInput( tCC* pzFile )
{
    /*
     *  Sometimes, the catman page has been compressed.
     */
    snprintf( buf, sizeof( buf ), "file %s", pzFile );
    {
        FILE* fp = popen( buf, "r" );
        char* pz;
        int   readErr;

        if (fp == NULL) {
            fprintf( stderr, zFsErr, errno, strerror( errno ), buf, __LINE__ );
            exit( EXIT_FAILURE );
        }
        pz      = fgets( buf, sizeof( buf ), fp );
        readErr = errno;    /* pclose() may change errno */
        pclose( fp );
        if (pz == NULL) {
            fprintf( stderr, zFsErr, readErr, strerror( readErr ), buf,
                     __LINE__ );
            exit( EXIT_FAILURE );
        }
    }

    /*
     *  If it has been compressed, uncompress it.
     */
    {
        FILE*  fp;
        tCC*   pzDesc  = buf;
        size_t nameLen = strlen( pzFile );

        /*
         *  file(1) echoes the name in front of its description.  Step over
         *  it so that a file merely *named* "compressed..." is not taken
         *  for compressed data.
         */
        if (strncmp( buf, pzFile, nameLen ) == 0)
            pzDesc += nameLen;

        if (strstr( pzDesc, "compressed" ) != NULL) {
            snprintf( buf, sizeof( buf ), "gunzip -c %s", pzFile );
            fp = popen( buf, "r" );
            pzFile = buf;   /* name the command in any error message */
        }
        else
            fp = freopen( pzFile, "r", stdin );

        if (fp == NULL) {
            fprintf( stderr, zFsErr, errno, strerror(errno), pzFile, __LINE__);
            exit( EXIT_FAILURE );
        }
        return fp;
    }
}


/*
 *  findInput -- turn the command line operand into an input stream.
 *
 *  pzFile   either the path of a regular file, or the name of a man page,
 *           optionally followed by ".section" (for example "printf.3").
 *
 *  A regular file is opened with openInput().  Anything else must be free
 *  of '/' characters and is handed to man(1) through popen().  Failures
 *  are reported on stderr and terminate the program.
 *
 *  Side effect:  for the forms that print a <title> (FORM_PAGE and
 *  FORM_XHTML), pzTitle is replaced with a title that names the page.
 */
FILE*
findInput( tCC* pzFile )
{
    /*
     *  By constraining names to a paltry 128 bytes we will know that our
     *  formatted strings will fit into 8KB buffers.
     */
    if (strlen( pzFile ) > 128) {
        fputs( "Man page names are constrained to 128 bytes\n", stderr );
        exit( EXIT_FAILURE );
    }

    /*
     *  Create a page title, if we need to.  Both prologues that main()
     *  prints contain the title.  If the copy cannot be allocated, the
     *  default title stays in place.
     */
    if ((OPT_VALUE_FORM == FORM_PAGE) || (OPT_VALUE_FORM == FORM_XHTML)) {
        char* pzNew;

        sprintf( buf, "man2html generated man page for %s", pzFile );
        pzNew = strdup( buf );
        if (pzNew != NULL)
            pzTitle = pzNew;
    }

    /*
     *  Either we have a file we can access directly, or we have to hunt for
     *  a catman page.  Figure out which.
     */
    {
        struct stat sb;
        if (  (stat( pzFile, &sb ) == 0)
           && S_ISREG( sb.st_mode ))

            return openInput( pzFile );
    }

    /*
     *  Not a file.  A man page name never contains a directory separator.
     */
    if (strchr( pzFile, '/' ) != NULL) {
        fprintf( stderr, zNotFound, pzFile );
        USAGE( EXIT_FAILURE );
    }

    /*
     *  Build the man command in `buf'.  The section is whatever follows
     *  the LAST dot, so that a page name that itself contains dots
     *  ("xorg.conf.5") keeps its full name.
     */
    {
        tCC* pz = strrchr( pzFile, '.' );
        if (pz == NULL)
            sprintf( buf, "man %s", pzFile );
        else {
            tSCC zMan[] = "set -x ; man -";
            char* pzCmd = buf + sizeof( zMan ) - 1;
            strcpy( buf, zMan );

            /*
             *  Some mans use -S for section, others -s others just the number.
             *  zMan already ends with the '-':  append the flag letter, or
             *  back up over the '-' when no flag is wanted.
             */
            if (OPT_ARG( SECTION_FLAG )[0] == '-')
                *(pzCmd++) = OPT_ARG( SECTION_FLAG )[1];
            else if (OPT_ARG( SECTION_FLAG )[0] == NUL)
                pzCmd--;
            else
                *(pzCmd++) = OPT_ARG( SECTION_FLAG )[0];

            /* the section, a blank, then the page name without ".section" */
            strcpy( pzCmd, pz+1 );
            pzCmd += strlen( pzCmd );
            *(pzCmd++) = ' ';
            memcpy( pzCmd, pzFile, pz - pzFile );
            pzCmd[ pz - pzFile ] = NUL;
        }
    }

    {
        FILE* inFp = popen( buf, "r" );
        if (inFp == NULL) {
            fprintf( stderr, zFsErr, errno, strerror( errno ), buf, __LINE__ );
            exit( EXIT_FAILURE );
        }
        return inFp;
    }
}
/*
 * Local Variables:
 * c-file-style: "stroustrup"
 * indent-tabs-mode: nil
 * tab-width: 4
 * End:
 * end of man2html.c */
