/* vim: sta et sw=4
 */

/*
 * $Id: utf8trans.c,v 1.5 2003/05/21 17:00:26 stevecheng Exp $
 *
 * (C) 2001 Steve Cheng <stevecheng@users.sourceforge.net>
 *
 * See ../COPYING for the copyright status of this software.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>

#include "hash.h"

/* UCS-4 character */
typedef unsigned int CHAR;

#ifdef HAVE_GETOPT_H
#include <getopt.h>

/* GNU long-option table; each entry maps onto its short-option letter */
struct option long_options[] =
{
    { "version", no_argument, NULL, 'v' },
    { "help",    no_argument, NULL, 'h' },

    /* terminator required by getopt_long() */
    { NULL, 0, NULL, 0 }
};
#endif

/* Program name as invoked (argv[0]); used as the prefix of diagnostics */
const char *prog_name;
/* Path of the character map file, taken from the command line in main() */
const char *charmap = NULL;
/* Codepoint -> replacement string table; filled by add_translation(),
 * queried by get_translation() */
struct hashtable *charmap_hash;

/*
 * Forward declarations for every function defined in this file, so that
 * none of them is ever called through an implicit declaration
 * (do_options() is used by main() before its definition).
 */
void print_version(void);
void print_usage(void);
int do_options(int argc, char *argv[]);
void add_translation(CHAR codepoint, char *translation);
char *get_translation(CHAR codepoint);
void parse_charmap(FILE *stream);
char *encode_utf8(CHAR codepoint);
unsigned int translate_utf8(unsigned char leadbyte, FILE *stream);
void translate(FILE *in, FILE *out);

#if !HAVE_GETLINE
ssize_t getline(char **lineptr, size_t *n, FILE *stream);
#endif

int
main(int argc, char *argv[])
{
    FILE *charmap_f;
    int optind;
    
    prog_name = argv[0];
    optind = do_options(argc, argv);
    
    charmap_hash = hash_new(HASH_SIZE);
    
    /* Read translation spec */
    charmap = argv[optind];
    charmap_f = fopen(charmap, "r");
    if(!charmap_f) {
        fprintf(stderr, "%s: %s: %s\n", 
                prog_name,
                charmap,
                strerror(errno));
        exit(1);
    }

    parse_charmap(charmap_f);

    fclose(charmap_f);

    optind++;
    
    if(!argv[optind]) {
        translate(stdin, stdout);
    }
    else {
        int i;
        FILE *f;
        for(i = optind; argv[i]; i++) {
            f = fopen(argv[i], "r");
            if(!f) {
                fprintf(stderr, "%s: %s: %s\n",
                        prog_name,
                        argv[i],
                        strerror(errno));
                exit(1);
            }
            translate(f, stdout);
            fclose(f);
        }
    }

    hash_delete(charmap_hash);

    return 0;
}

/*
 * Print the version banner and copyright notice to standard output.
 * VERSION is only spliced into the banner when config.h was included.
 */
void
print_version(void)
{
    /* Each entry is written with puts(), i.e. followed by a newline */
    static const char *const banner[] = {
        "utf8trans (part of docbook2X"
#ifdef HAVE_CONFIG_H
        VERSION
#endif
        ")",

        "$Revision: 1.5 $ $Date: 2003/05/21 17:00:26 $",
        "<URL:http://docbook2x.sourceforge.net/>\n",

        "Copyright (C) 2000-2001 Steve Cheng\n"
        "This is free software; see the source for copying conditions.\n"
        "There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR\n"
        "A PARTICULAR PURPOSE."
    };
    size_t i;

    for(i = 0; i < sizeof banner / sizeof banner[0]; i++)
        puts(banner[i]);
}

/*
 * Print the command-line synopsis and option summary to standard output.
 * The synopsis line uses the global prog_name.
 */
void
print_usage(void)
{
    printf("Usage: %s [options] CHARMAP [FILES...]\n", prog_name);
    puts("Transliterate UTF-8 characters according to a table.\n");
    
    puts("  -v, --version           display version information and exit\n"
         "  -h, --help              display this usage information\n");

    puts("See utf8trans(1) for details on this program.\n");
}
    
/*
 * Process command-line options.
 *
 * -v/--version and -h/--help print their text and exit with status 0;
 * any other option exits with status 1.  Also exits with status 1 when
 * no non-option argument (the charmap) is left.
 *
 * Returns the index in argv of the first non-option argument.
 */
int
do_options(int argc, char *argv[])
{
    int opt;

    for(;;) {
#ifdef HAVE_GETOPT_H
        opt = getopt_long(argc, argv, "vh", long_options, NULL);
#else
        opt = getopt(argc, argv, "vh");
#endif
        if(opt == -1)
            break;

        /* --version */
        if(opt == 'v') {
            print_version();
            exit(0);
        }

        /* --help */
        if(opt == 'h') {
            print_usage();
            exit(0);
        }

        /* '?' or anything else unexpected */
        exit(1);
    }

    if(optind >= argc) {
        fprintf(stderr, "%s: must specify charmap\n", prog_name);
        exit(1);
    }

    return optind;
}

/*
 * Register `translation' as the replacement text for `codepoint'.
 *
 * The string is copied to the heap before being stored in charmap_hash,
 * so the caller's buffer may be reused afterwards.  Exits with status 2
 * if the copy cannot be allocated.
 *
 * NOTE(review): whether hash_put() releases a value previously stored
 * under the same key is not visible from here -- confirm in hash.c.
 */
void add_translation(CHAR codepoint, char *translation)
{
    size_t size = strlen(translation) + 1;   /* include the NUL */
    char *value = malloc(size);

    if(!value) {
        fprintf(stderr, "%s: %s\n", prog_name, strerror(errno));
        exit(2);
    }
    memcpy(value, translation, size);

    hash_put(charmap_hash, codepoint, value);
}

/*
 * Return the output text for `codepoint'.
 *
 * If charmap_hash has an entry for the codepoint, that entry is
 * returned; otherwise the codepoint is passed through unchanged as its
 * UTF-8 encoding, in which case the result points at encode_utf8()'s
 * static buffer and is overwritten by the next such call.
 */
char *
get_translation(CHAR codepoint)
{
    char *mapped;

    if(!hash_get(charmap_hash, codepoint, &mapped))
        return encode_utf8(codepoint);

    return mapped;
}

/*
 * Encode codepoint `c' as a NUL-terminated UTF-8 sequence of 1 to 6
 * bytes.
 *
 * The result lives in a static buffer that is overwritten by the next
 * call.  Values of 0x80000000 and above cannot be encoded; the program
 * aborts on them.
 */
char *
encode_utf8(CHAR c)
{
    static char buf[7];
    int len, i;

    /* ASCII is passed through as a single byte */
    if(c < 0x80) {
        buf[0] = c;
        buf[1] = '\0';
        return buf;
    }

    /* Sequence length is determined by the magnitude of the codepoint */
    if(c < 0x800)
        len = 2;
    else if(c < 0x10000)
        len = 3;
    else if(c < 0x200000)
        len = 4;
    else if(c < 0x4000000)
        len = 5;
    else if(c < 0x80000000)
        len = 6;
    else
        abort();    /* Oops */

    /* Continuation bytes, last one first: 10xxxxxx, six bits each */
    for(i = len - 1; i > 0; i--) {
        buf[i] = 0x80 | (c & 0x3F);
        c >>= 6;
    }

    /* Lead byte: `len' one-bits, a zero bit, then the remaining bits */
    buf[0] = ((0xFF00 >> len) & 0xFF) | c;
    buf[len] = '\0';

    return buf;
}


        


/* True if c is one of 0-9, A-F or a-f (argument is evaluated repeatedly) */
#define IS_HEXDIGIT(c) (((c) >= '0' && (c) <= '9') \
        || ((c) >= 'A' && (c) <= 'F') \
        || ((c) >= 'a' && (c) <= 'f'))

/* True if c is a blank (space or horizontal tab) */
#define IS_SPACE(c) ((c) == '\t' || (c) == ' ')

void parse_charmap(FILE *stream)
{
    char *buf = NULL;
    size_t bufsize = 0;
    char *p, *c, *t;
    int linecount = 0;

    CHAR codepoint;
    
    while(!feof(stream)) {
        linecount++;
        if(getline(&buf, &bufsize, stream) == -1) {
            if(!feof(stream)) {
                fprintf(stderr, "%s: parse spec: %s\n", 
                        prog_name, 
                        strerror(errno));
                exit(2);
            }
            goto nextline;
        }

        /* Chomp newline */
        p = buf + (strlen(buf)-1);
        if(*p == '\n') *p = '\0';

        /* Skip to codepoint */
        for(c = buf; *c && IS_SPACE(*c); c++);

        /* Skip empty lines and comment lines */
        if(*c == '\0' || *c == '#')
            goto nextline;

        t = NULL;

        /* Parse the codepoint (a number in hex) */
        for(p = c; *p; p++) {
            if(!IS_HEXDIGIT(*p)) {
                if(!IS_SPACE(*p)) {
                    fprintf(stderr, "%s: "
                            "parse codepoint: not a hex digit: %s\n",
                            prog_name,p);
                    goto nextline;
                }

                *p = '\0';
                
                if(sscanf(c, "%x", &codepoint) != 1) {
                    fprintf(stderr, "%s: "
                            "parse codepoint: not a valid hex number\n",
                            prog_name);
                    goto nextline;
                }

                t = ++p;
                break;
            }
        }

        if(t) {
            add_translation(codepoint, t);
        } else {
            /* No translation text */
            if(sscanf(c, "%x", &codepoint) != 1) {
                fprintf(stderr, "%s: "
                        "parse codepoint: not a valid hex number\n",
                        prog_name);
                goto nextline;
            }
            add_translation(codepoint, "");
        }
nextline: ;
    }

    if(buf)
        free(buf);
}

/*
 * Decode one character whose first byte, `leadbyte', has already been
 * read; any continuation bytes are read from `stream'.
 *
 * Sequences of up to 6 bytes (codepoints up to 0x7FFFFFFF) are accepted.
 * Returns the decoded codepoint, or 0xFFFD (the Unicode replacement
 * character) when
 *   - leadbyte is a stray continuation byte (10xxxxxx),
 *   - leadbyte is 0xFE or 0xFF,
 *   - the stream ends in the middle of the sequence,
 *   - a byte that should be a continuation byte is not one, or
 *   - the sequence is overlong (a shorter encoding exists).
 *
 * A byte that turns out not to be a continuation byte is pushed back
 * onto the stream, so that it is decoded as the start of the next
 * character instead of being lost.
 */
unsigned int translate_utf8(unsigned char leadbyte, FILE *stream)
{
    /* Smallest codepoint that legitimately needs an n-byte sequence */
    static const unsigned int min_value[7] = {
        0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    unsigned int character;
    int n, i, c;

    /* ASCII character */
    if((leadbyte & 0x80) == 0)
        return leadbyte;

    /* UTF-8 sequence continuation byte without a lead byte */
    if((leadbyte & 0xC0) == 0x80)
        return 0xFFFD;

    /* UTF-8 sequence leading byte: the number of leading one-bits is
     * the total length of the sequence (at least 2 at this point) */
    for(n = 0; n < 8 && (leadbyte & (0x80 >> n)); n++);

    if(n > 6) return 0xFFFD;

    /* Payload bits of the lead byte are those below the length marker */
    character = leadbyte & (0x7F >> n);

    for(i = 1; i < n; i++) {
        c = fgetc(stream);
        if(c == EOF) return 0xFFFD;
        if((c & 0xC0) != 0x80) {
            /* Not part of this sequence: leave it for the next call */
            ungetc(c, stream);
            return 0xFFFD;
        }

        character = (character << 6) | (unsigned int)(c & 0x3F);
    }

    /* Check for overlong sequences */
    if(character < min_value[n]) return 0xFFFD;

    return character;
}
    

/*
 * Copy `in' to `out', decoding the input one UTF-8 character at a time
 * and writing the text that get_translation() supplies for each
 * character.  Stops at end of input.
 */
void translate(FILE *in, FILE *out)
{
    for(;;) {
        int byte;

        if(feof(in))
            break;

        byte = fgetc(in);
        if(byte == EOF)
            break;

        /* translate_utf8() reads any continuation bytes itself */
        fputs(get_translation(translate_utf8(byte, in)), out);
    }
}

#if !HAVE_GETLINE
/*
 * Fallback for systems without getline(): read one line, including the
 * trailing newline if there is one, from `stream'.
 *
 * *lineptr / *n describe a malloc()ed buffer and its capacity; the
 * buffer is allocated when *lineptr is NULL (or *n is 0) and grown as
 * needed, with both updated accordingly.  The caller frees *lineptr.
 *
 * Returns the number of bytes stored, not counting the terminating NUL,
 * or -1 if nothing could be read (end of file or read error), on
 * allocation failure, or on a NULL argument (errno = EINVAL).
 */
ssize_t getline(char **lineptr, size_t *n, FILE *stream)
{
    size_t len = 0;
    int c;

    if(!lineptr || !n || !stream) {
        errno = EINVAL;
        return -1;
    }

    if(!*lineptr || *n == 0) {
        char *p = realloc(*lineptr, 256);
        if(!p)
            return -1;
        *lineptr = p;
        *n = 256;
    }

    while((c = fgetc(stream)) != EOF) {
        /* Keep room for this byte plus the terminating NUL */
        if(len + 1 >= *n) {
            size_t newsize = *n * 2;
            char *p = newsize > *n ? realloc(*lineptr, newsize) : NULL;
            if(!p) {
                (*lineptr)[len] = '\0';
                return -1;
            }
            *lineptr = p;
            /* Record the new capacity, or the next growth check is
             * made against the old size and the buffer overflows */
            *n = newsize;
        }

        (*lineptr)[len++] = (char)c;
        if(c == '\n')
            break;
    }

    (*lineptr)[len] = '\0';

    /* EOF (or error) before any byte was read */
    if(len == 0)
        return -1;

    return (ssize_t)len;
}
#endif
