/***************************************************************************
*   Copyright (C) 2004-2009 by Thomas Fischer                             *
*   fischer@unix-ag.uni-kl.de                                             *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
***************************************************************************/
#include <qiodevice.h>
#include <qregexp.h>
#include <qapplication.h>

#include <file.h>
#include <comment.h>
#include <macro.h>
#include <preamble.h>
#include <entry.h>
#include <element.h>
#include <encoderlatex.h>
#include <value.h>

#include "fileimporterbibtex.h"

/** Yields the larger of its two arguments. As a function-like macro it may evaluate each argument twice, so do not pass expressions with side effects. */
#define max(a,b)  ((a)<(b)?(b):(a))

namespace BibTeX
{
    /** Characters that readSimpleString() accepts in addition to letters and digits */
    const QString extraAlphaNumChars = QString( "?'`-_:.+/$\\\"&" );
    /** Matches opening and closing "a" and "pre" HTML tags, which load() removes from the raw text; the second constructor argument (false) is QRegExp's case-sensitivity flag */
    const QRegExp htmlRegExp = QRegExp( "</?(a|pre)[^>]*>", false );

    /**
     * Sets up an importer with a 4096 byte line buffer; no text stream
     * exists until load() creates one.
     *
     * @param personFirstNameFirst passed on to every PersonContainer and Person created while parsing author/editor fields
     * @param encoding name of the source encoding handed to iconv by load(); "latex" is read as UTF-8
     */
    FileImporterBibTeX::FileImporterBibTeX( bool personFirstNameFirst, QString encoding )
            : FileImporter(),
            m_personFirstNameFirst( personFirstNameFirst ),
            m_currentChar( ' ' ),
            m_ignoreComments( FALSE ),
            m_lineBufferSize( 4096 ),
            m_encoding( encoding )
    {
        m_textStream = NULL;
        m_lineBuffer = new char[m_lineBufferSize];
        cancelFlag = FALSE;
    }


    /** Frees the line buffer that the constructor allocated. */
    FileImporterBibTeX::~FileImporterBibTeX()
    {
        delete[] m_lineBuffer;
    }

    /**
     * Reads BibTeX source from a device and parses it into a File.
     *
     * The device is read line by line. Each line is converted to UTF-8
     * with iconv; the source encoding comes from m_encoding ("latex" is
     * read as UTF-8) and may be switched by an x-kbibtex-encoding comment
     * line (see evaluateParameterComments()). The collected text is then
     * stripped of HTML tags, LaTeX-decoded and parsed element by element.
     * Comment elements are dropped if setIgnoreComments() requested so.
     *
     * The whole call is guarded by m_mutex and emits progress() while
     * parsing. GUI events are processed in between, which allows cancel()
     * to be triggered.
     *
     * @param iodevice device to read the BibTeX source from
     * @return a newly allocated File, or NULL if the text could not be
     *         decoded or loading was cancelled
     */
    File* FileImporterBibTeX::load( QIODevice *iodevice )
    {
        m_mutex.lock();
        cancelFlag = FALSE;

        QString rawText;
        const char *encodingFrom = m_encoding == "latex" ? "utf-8\0" : m_encoding.append( "\0" ).ascii();
        /** iconv_open reports failure by returning (iconv_t)-1 */
        const iconv_t invalidHandle = ( iconv_t )( -1 );
        iconv_t iconvHandle = iconv_open( "utf-8", encodingFrom );

        /** a converted line may need more bytes than the raw line, hence the larger buffer */
        const size_t convertedSize = static_cast<size_t>( m_lineBufferSize ) * 4;
        char *convertedLine = new char[convertedSize];
        int len;
        bool encodingOk = true;
        while ( encodingOk && iodevice->isReadable() && ( len = iodevice->readLine( m_lineBuffer, m_lineBufferSize ) ) > 0 )
        {
            /** may replace iconvHandle if the line requests another encoding */
            evaluateParameterComments( iconvHandle, m_lineBuffer );
            if ( iconvHandle == invalidHandle )
            {
                qDebug( "iconv cannot convert from the requested source encoding to utf-8" );
                encodingOk = false;
                break;
            }

            char *raw = m_lineBuffer;
            char *enc = convertedLine;
            /** keep one byte in reserve for the terminating null character */
            size_t encLen = convertedSize - 1;
            size_t rawLen = static_cast<size_t>( len );
            const size_t result = iconv( iconvHandle, &raw, &rawLen, &enc, &encLen );

            qApp->processEvents();

            /** only (size_t)-1 signals an error; other values count non-reversible conversions */
            if ( result == static_cast<size_t>( -1 ) )
            {
                /** raw points to where iconv stopped; show some context around it */
                int problemStart = static_cast<int>( raw - m_lineBuffer ) - 15;
                if ( problemStart < 0 ) problemStart = 0;
                QString problematic = QString( m_lineBuffer ).mid( problemStart, 30 );
                if ( problematic.isNull() || problematic.isEmpty() ) problematic = QString( m_lineBuffer );
                qDebug( "iconv resulted in error code %i for source encoding %s, maybe file is in different encoding? Problem is somewhere here: \"%s\"", static_cast<int>( result ), encodingFrom, problematic.latin1() );
                encodingOk = false;
                break;
            }
            if ( rawLen > 0 )
            {
                qDebug( "iconv could not convert complete string, only %i out of %i chars", len - static_cast<int>( rawLen ), len );
                encodingOk = false;
                break;
            }
            enc[0] = '\0';
            QString line = QString::fromUtf8( convertedLine );
            rawText.append( line );
        }
        if ( iconvHandle != invalidHandle )
            iconv_close( iconvHandle );
        delete[] convertedLine;

        if ( !encodingOk )
        {
            qDebug( "Decoding failed, cannot load file. Please fix encoding manually." );
            m_mutex.unlock();
            return NULL;
        }

        /** Cleaning up code coming from DBLP */
        rawText = rawText.replace( htmlRegExp, "" );
        rawText = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText );
        unescapeLaTeXChars( rawText );
        m_textStream = new QTextStream( rawText, IO_ReadOnly );
        m_textStream->setEncoding( QTextStream::UnicodeUTF8 );
        /** start from the constructor's initial state, not from the last character of a previous load */
        m_currentChar = ' ';

        File *result = new File();
        QIODevice *streamDevice = m_textStream->device();
        while ( !cancelFlag && !m_textStream->atEnd() )
        {
            emit progress( streamDevice->at(), streamDevice->size() );
            qApp->processEvents();
            Element * element = nextElement();
            if ( element != NULL )
            {
                Comment *comment = dynamic_cast<Comment*>( element );
                if ( !m_ignoreComments || comment == NULL )
                    result->appendElement( element );
                else
                    delete element;
            }
            qApp->processEvents();
        }
        emit progress( streamDevice->size(), streamDevice->size() );

        if ( cancelFlag )
        {
            qDebug( "Loading file has been canceled" );
            delete result;
            result = NULL;
        }

        /** do not leave a dangling pointer behind */
        delete m_textStream;
        m_textStream = NULL;

        m_mutex.unlock();
        return result;
    }

    bool FileImporterBibTeX::guessCanDecode( const QString & rawText )
    {
        QString text = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText );
        return text.find( QRegExp( "@\\w+\\{.+\\}" ) ) >= 0;
    }

    /**
     * Controls comment handling in load().
     *
     * @param ignoreComments if true, load() deletes Comment elements instead of appending them to the resulting File
     */
    void FileImporterBibTeX::setIgnoreComments( bool ignoreComments )
    {
        m_ignoreComments = ignoreComments;
    }

    /** Requests cancellation: sets cancelFlag, which load() checks before parsing each element; a cancelled load() returns NULL. */
    void FileImporterBibTeX::cancel()
    {
        cancelFlag = TRUE;
    }

    /**
     * Parses the next element from the text stream.
     *
     * An "@" starts a comment, string (macro), preamble or entry element,
     * depending on the case-insensitive type name following it. Text that
     * does not start with a known token is read as a plain comment.
     *
     * @return the parsed element, or NULL at the end of the stream, on an
     *         unexpected token, or if the element type name is empty
     */
    Element *FileImporterBibTeX::nextElement()
    {
        const Token token = nextToken();

        if ( token == tUnknown )
            return readPlainCommentElement();

        if ( token != tAt )
        {
            if ( token != tEOF )
                qDebug( "Don't know how to parse next token: %i", ( int )token );
            return NULL;
        }

        const QString elementType = readSimpleString();
        const QString loweredType = elementType.lower();

        if ( loweredType == "comment" )
            return readCommentElement();
        if ( loweredType == "string" )
            return readMacroElement();
        if ( loweredType == "preamble" )
            return readPreambleElement();

        if ( elementType.isEmpty() )
        {
            qDebug( "ElementType is empty" );
            return NULL;
        }

        return readEntryElement( elementType );
    }

    /**
     * Reads the body of an "@comment" element: skips ahead to the opening
     * bracket and takes the bracketed text as the comment.
     *
     * @return new Comment created with TRUE as its second constructor argument
     */
    Comment *FileImporterBibTeX::readCommentElement()
    {
        /** advance until an opening bracket is the current character or the stream ends */
        while ( !( m_currentChar == '{' || m_currentChar == '(' || m_textStream->atEnd() ) )
            *m_textStream >> m_currentChar;

        const QString content = readBracketString( m_currentChar );
        return new Comment( content, TRUE );
    }

    /**
     * Reads free text outside of any "@" element as a comment. Consecutive
     * lines are joined with line breaks until a line starts with "@" or
     * with whitespace, or the stream ends.
     *
     * @return new Comment created with FALSE as its second constructor argument
     */
    Comment *FileImporterBibTeX::readPlainCommentElement()
    {
        QString commentText = readLine();
        /** fetch the first character of the following line */
        *m_textStream >> m_currentChar;

        for ( ;; )
        {
            if ( m_textStream->atEnd() || m_currentChar == '@' || m_currentChar.isSpace() )
                break;

            commentText.append( '\n' );
            commentText.append( m_currentChar );
            *m_textStream >> m_currentChar;
            commentText.append( readLine() );
            *m_textStream >> m_currentChar;
        }

        return new Comment( commentText, FALSE );
    }

    /**
     * Reads an "@string" element: the macro's key, the assign symbol and
     * one or more value parts concatenated with "#".
     *
     * @return the new Macro, or NULL if no opening bracket or no assign
     *         symbol was found
     */
    Macro *FileImporterBibTeX::readMacroElement()
    {
        /** skip everything up to the opening bracket */
        Token token;
        while (( token = nextToken() ) != tBracketOpen )
        {
            if ( token == tEOF )
            {
                qDebug( "Error in parsing unknown macro: Opening curly brace ({) expected" );
                return NULL;
            }
        }

        const QString key = readSimpleString();
        if ( nextToken() != tAssign )
        {
            qDebug( "Error in parsing macro '%s': Assign symbol (=) expected", key.latin1() );
            return NULL;
        }

        Macro *macro = new Macro( key );
        const QRegExp whiteSpaceRun( "\\s+" );
        for ( ;; )
        {
            bool isStringKey = FALSE;
            QString text = readString( isStringKey );
            /** collapse each run of whitespace into a single space */
            text.replace( whiteSpaceRun, " " );

            if ( isStringKey )
                macro->value()->items.append( new MacroKey( text ) );
            else
                macro->value()->items.append( new BibTeX::PlainText( text ) );

            token = nextToken();
            if ( token != tDoublecross )
                break;
        }

        return macro;
    }

    /**
     * Reads an "@preamble" element, whose value consists of one or more
     * parts concatenated with "#".
     *
     * @return the new Preamble, or NULL if the stream ended before an
     *         opening bracket was found
     */
    Preamble *FileImporterBibTeX::readPreambleElement()
    {
        /** skip everything up to the opening bracket */
        for ( Token opening = nextToken(); opening != tBracketOpen; opening = nextToken() )
        {
            if ( opening == tEOF )
            {
                qDebug( "Error in parsing unknown preamble: Opening curly brace ({) expected" );
                return NULL;
            }
        }

        Preamble *preamble = new Preamble( );
        const QRegExp whiteSpaceRun( "\\s+" );
        bool moreParts = TRUE;
        while ( moreParts )
        {
            bool isStringKey = FALSE;
            QString text = readString( isStringKey );
            /** collapse each run of whitespace into a single space */
            text.replace( whiteSpaceRun, " " );

            if ( isStringKey )
                preamble->value()->items.append( new MacroKey( text ) );
            else
                preamble->value()->items.append( new BibTeX::PlainText( text ) );

            /** another part follows only after a "#" */
            moreParts = nextToken() == tDoublecross;
        }

        return preamble;
    }

    /**
     * Reads a regular entry (e.g. "@article"): its key followed by a
     * comma-separated list of "name = value" fields. A field name that
     * occurs repeatedly gets the first free number appended (name1,
     * name2, ...).
     *
     * @param typeString type name of the entry as found after the "@"
     * @return the new Entry, or NULL if no opening bracket was found or a
     *         comma or assign symbol is missing
     */
    Entry *FileImporterBibTeX::readEntryElement( const QString& typeString )
    {
        /** skip everything up to the opening bracket */
        Token token = nextToken();
        for ( ; token != tBracketOpen; token = nextToken() )
        {
            if ( token == tEOF )
            {
                qDebug( "Error in parsing unknown entry: Opening curly brace ({) expected" );
                return NULL;
            }
        }

        const QString key = readSimpleString();
        Entry *entry = new Entry( typeString, key );

        token = nextToken();
        while ( token != tBracketClose && token != tEOF )
        {
            if ( token != tComma )
            {
                qDebug( "Error in parsing entry '%s': Comma symbol (,) expected", key.latin1() );
                delete entry;
                return NULL;
            }

            QString fieldTypeName = readSimpleString();
            token = nextToken();

            /** entry is buggy (e.g. comma without a field after it), but we still accept it */
            if ( fieldTypeName == QString::null || token == tBracketClose )
                break;

            if ( token != tAssign )
            {
                qDebug( "Error in parsing entry '%s': Assign symbol (=) expected after field name '%s'", key.latin1(), fieldTypeName.latin1() );
                delete entry;
                return NULL;
            }

            /** make the name of a duplicate field unique */
            if ( entry->getField( fieldTypeName ) != NULL )
            {
                int suffix = 1;
                while ( entry->getField( fieldTypeName + QString::number( suffix ) ) != NULL )
                    ++suffix;
                fieldTypeName += QString::number( suffix );
            }

            EntryField *entryField = new EntryField( fieldTypeName );

            /** readValue returns the token following the value, which drives the loop */
            token = readValue( entryField->value(), entryField->fieldType() );

            entry->addField( entryField );
        }

        return entry;
    }

    /**
     * Skips whitespace and classifies the current character as a token.
     * If the character is a known token, the following character is read
     * from the stream; otherwise the current character is left untouched.
     *
     * @return the recognized token, tEOF at the end of the stream, or
     *         tUnknown for any other character
     */
    FileImporterBibTeX::Token FileImporterBibTeX::nextToken()
    {
        if ( m_textStream->atEnd() )
            return tEOF;

        while ( !m_textStream->atEnd() && ( m_currentChar.isSpace() || m_currentChar == '\t' ) )
            *m_textStream >> m_currentChar;

        const char symbol = m_currentChar.latin1();
        Token curToken;
        if ( symbol == '@' )
            curToken = tAt;
        else if ( symbol == '{' || symbol == '(' )
            curToken = tBracketOpen;
        else if ( symbol == '}' || symbol == ')' )
            curToken = tBracketClose;
        else if ( symbol == ',' )
            curToken = tComma;
        else if ( symbol == ';' )
            curToken = tSemicolon;
        else if ( symbol == '=' )
            curToken = tAssign;
        else if ( symbol == '#' )
            curToken = tDoublecross;
        else
            curToken = m_textStream->atEnd() ? tEOF : tUnknown;

        /** consume the character only if it formed a token */
        const bool isRealToken = curToken != tUnknown && curToken != tEOF;
        if ( isRealToken )
            *m_textStream >> m_currentChar;

        return curToken;
    }

    /**
     * Reads one value part, which is either bracketed, quoted, or a
     * simple string.
     *
     * @param isStringKey set to TRUE if a simple (unbracketed, unquoted) string was read, FALSE otherwise
     * @return the text read, without the enclosing brackets or quotes
     */
    QString FileImporterBibTeX::readString( bool &isStringKey )
    {
        if ( m_currentChar.isSpace() )
        {
            m_textStream->skipWhiteSpace();
            *m_textStream >> m_currentChar;
        }

        const char firstChar = m_currentChar.latin1();

        if ( firstChar == '{' || firstChar == '(' )
        {
            isStringKey = FALSE;
            return readBracketString( m_currentChar );
        }

        if ( firstChar == '"' )
        {
            isStringKey = FALSE;
            return readQuotedString();
        }

        isStringKey = TRUE;
        return readSimpleString();
    }

    /**
     * Reads an unquoted string after skipping leading whitespace.
     *
     * If @p until is the null character, reading stops at the first
     * character that is neither a letter, a digit, nor contained in
     * extraAlphaNumChars. Otherwise, after an optional first character of
     * that kind, everything up to (excluding) @p until is read.
     *
     * @param until delimiter at which to stop, or '\0' to stop at the first character not allowed in a simple string
     * @return the characters read; a null string if there were none
     */
    QString FileImporterBibTeX::readSimpleString( QChar until )
    {
        QString collected;

        while ( m_currentChar.isSpace() )
        {
            m_textStream->skipWhiteSpace();
            *m_textStream >> m_currentChar;
        }

        /** the first character is always judged by the simple-string rule */
        const bool firstIsSimple = m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar );
        if ( firstIsSimple )
        {
            collected.append( m_currentChar );
            *m_textStream >> m_currentChar;
        }

        const bool stopAtDelimiter = until != '\0';
        for ( ; !m_textStream->atEnd(); *m_textStream >> m_currentChar )
        {
            bool accepted;
            if ( stopAtDelimiter )
                accepted = m_currentChar != until;
            else
                accepted = m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar );

            if ( !accepted )
                break;
            collected.append( m_currentChar );
        }

        return collected;
    }

    /**
     * Reads a string enclosed in double quotes; the current character is
     * expected to be the opening quote. A quote directly preceded by a
     * backslash does not end the string.
     *
     * @return the text between the quotes
     */
    QString FileImporterBibTeX::readQuotedString()
    {
        QString content;
        QChar previousChar = m_currentChar;

        *m_textStream >> m_currentChar;
        while ( !m_textStream->atEnd() )
        {
            const bool isClosingQuote = m_currentChar == '"' && previousChar != '\\';
            if ( isClosingQuote )
                break;

            content.append( m_currentChar );
            previousChar = m_currentChar;
            *m_textStream >> m_currentChar;
        }

        /** read character after closing " */
        *m_textStream >> m_currentChar;

        return content;
    }

    /**
     * Reads the rest of the current line, starting with the current
     * character. The line break itself is neither returned nor consumed.
     *
     * @return the characters up to the next line break or the end of the stream
     */
    QString FileImporterBibTeX::readLine()
    {
        QString line;
        for ( ; !m_textStream->atEnd() && m_currentChar != '\n'; *m_textStream >> m_currentChar )
            line.append( m_currentChar );
        return line;
    }

    /**
     * Reads text enclosed in a pair of brackets, keeping nested brackets
     * of the same kind as part of the text. The current character is
     * expected to be the opening bracket.
     *
     * @param openingBracket '(' to read up to the matching ')'; for any other character the closing bracket is '}'
     * @return the text between the outermost brackets
     */
    QString FileImporterBibTeX::readBracketString( const QChar openingBracket )
    {
        const QChar closingBracket = openingBracket == '(' ? QChar( ')' ) : QChar( '}' );
        QString content;
        int depth = 1;

        for ( *m_textStream >> m_currentChar; !m_textStream->atEnd(); *m_textStream >> m_currentChar )
        {
            if ( m_currentChar == openingBracket )
                ++depth;
            else if ( m_currentChar == closingBracket )
                --depth;

            /** the bracket matching the outermost opening one ends the string */
            if ( depth == 0 )
                break;

            content.append( m_currentChar );
        }

        /** read character after the closing bracket */
        *m_textStream >> m_currentChar;
        return content;
    }

    /**
     * Reads a field's value, i.e. one or more parts concatenated with
     * "#", and appends a value item per part to @p value.
     *
     * Keyword fields become a KeywordContainer, author and editor fields
     * a PersonContainer; all other fields become plain text or, for
     * unquoted parts, a MacroKey. In page fields, a single or double dash
     * with surrounding whitespace is replaced by U+2013.
     *
     * @param value value object receiving the items
     * @param fieldType type of the field the value belongs to
     * @return the first token following the value
     */
    FileImporterBibTeX::Token FileImporterBibTeX::readValue( Value *value, EntryField::FieldType fieldType )
    {
        Token token = tUnknown;
        const QRegExp whiteSpaceRun( "\\s+" );

        do
        {
            bool isStringKey = FALSE;
            QString text = readString( isStringKey );
            /** collapse each run of whitespace into a single space */
            text.replace( whiteSpaceRun, " " );

            if ( fieldType == EntryField::ftKeywords )
            {
                if ( isStringKey )
                    qDebug( "WARNING: Cannot handle keywords that are macros" );
                else
                    value->items.append( new KeywordContainer( text ) );
            }
            else if ( fieldType == EntryField::ftAuthor || fieldType == EntryField::ftEditor )
            {
                if ( isStringKey )
                    qDebug( "WARNING: Cannot handle authors/editors that are macros" );
                else
                {
                    QStringList persons;
                    splitPersons( text, persons );

                    PersonContainer *container = new PersonContainer( m_personFirstNameFirst );
                    QStringList::ConstIterator personIt = persons.constBegin();
                    while ( personIt != persons.constEnd() )
                    {
                        container->persons.append( new Person( *personIt, m_personFirstNameFirst ) );
                        ++personIt;
                    }
                    value->items.append( container );
                }
            }
            else
            {
                /** page ranges get a proper en dash, then are stored like any other field */
                if ( fieldType == EntryField::ftPages )
                    text.replace( QRegExp( "\\s*--?\\s*" ), QChar( 0x2013 ) );

                if ( isStringKey )
                    value->items.append( new MacroKey( text ) );
                else
                    value->items.append( new BibTeX::PlainText( text ) );
            }

            token = nextToken();
        }
        while ( token == tDoublecross );

        return token;
    }

    /**
     * Replaces every backslash-escaped ampersand by a plain ampersand.
     *
     * @param text text to modify in place
     */
    void FileImporterBibTeX::unescapeLaTeXChars( QString &text )
    {
        const QString escapedAmpersand( "\\&" );
        const QString plainAmpersand( "&" );
        text.replace( escapedAmpersand, plainAmpersand );
    }

    void FileImporterBibTeX::splitPersons( const QString& text, QStringList &persons )
    {
        QStringList wordList;
        QString word;
        int bracketCounter = 0;

        for ( unsigned int pos = 0;pos < text.length();++pos )
        {
            if ( text[pos] == '{' )
                ++bracketCounter;
            else if ( text[pos] == '}' )
                --bracketCounter;

            if ( text[pos] == ' ' || text[pos] == '\n' || text[pos] == '\r' )
            {
                if ( word == "and" && bracketCounter == 0 )
                {
                    persons.append( wordList.join( " " ) );
                    wordList.clear();
                }
                else if ( !word.isEmpty() )
                    wordList.append( word );

                word = "";
            }
            else
                word.append( text[pos] );
        }

        wordList.append( word );
        persons.append( wordList.join( " " ) );
    }

    /**
     * Checks whether a raw input line is a parameter comment of the form
     * "@comment{x-kbibtex-encoding=NAME}" and, if so, replaces the iconv
     * conversion descriptor by one converting from NAME to UTF-8. The
     * name "latex" is read as UTF-8. The line is matched case-insensitively
     * and regardless of its line ending; an empty NAME is ignored.
     *
     * @param iconvHandle conversion descriptor in use; closed and replaced if the line requests an encoding. It is (iconv_t)-1 afterwards if iconv does not support the requested encoding.
     * @param cline null-terminated raw line as read from the input
     */
    void FileImporterBibTeX::evaluateParameterComments( iconv_t &iconvHandle, const char *cline )
    {
        /** simple preliminary checks before expensive conversion to QString */
        if ( cline[0] != '@' || ( cline[1] != 'c' && cline[1] != 'C' ) )
            return;

        /** strip the line break, which may be LF, CR LF, or missing on the last line */
        const QString line = QString( cline ).lower().stripWhiteSpace();
        const QString prefix( "@comment{x-kbibtex-encoding=" );

        /** check if this file requests a special encoding */
        if ( !line.startsWith( prefix ) || !line.endsWith( "}" ) )
            return;

        /** the encoding name sits between the prefix and the closing bracket */
        QString newEncoding = line.mid( prefix.length(), line.length() - prefix.length() - 1 );
        qDebug( "x-kbibtex-encoding=<%s>", newEncoding.latin1() );
        if ( newEncoding.isEmpty() )
            return;
        if ( newEncoding == "latex" ) newEncoding = "utf-8";

        /** iconv_open reports failure by returning (iconv_t)-1; such a handle must not be closed */
        const iconv_t invalidHandle = ( iconv_t )( -1 );
        if ( iconvHandle != invalidHandle )
            iconv_close( iconvHandle );
        iconvHandle = iconv_open( "utf-8", newEncoding.ascii() );
        if ( iconvHandle == invalidHandle )
            qDebug( "iconv does not support conversion from encoding <%s>", newEncoding.latin1() );
    }

}
