/*****************************************************************
* Unipro UGENE - Integrated Bioinformatics Suite
* Copyright (C) 2008 Unipro, Russia (http://ugene.unipro.ru)
* All Rights Reserved
* 
*     This source code is distributed under the terms of the
*     GNU General Public License. See the files COPYING and LICENSE
*     for details.
*****************************************************************/

#include "MSAUtils.h"

#include <datatype/MAlignment.h>
#include <core_api/DNAAlphabet.h>

namespace GB2 {

void MSAUtils::updateConsensus(const MAlignment& msa, QByteArray& cons, MSAConsensusType ctype) {
    LRegion r(0, msa.getLength());
    updateConsensus(msa, r, cons, ctype);
}

void MSAUtils::updateConsensus(const MAlignment& msa, const LRegion& region, QByteArray& cons, MSAConsensusType ctype) {
    QList<LRegion> l;
    l.append(region);
    updateConsensus(msa, l, cons, ctype);
}

void MSAUtils::updateConsensus(const MAlignment& msa, const QList<LRegion>& regions, QByteArray& cons, MSAConsensusType ctype) {
    if (msa.isEmpty()) {
        return;
    }
    int aliLen = msa.getLength();
    if (cons.length()!=aliLen) {
        cons.resize(aliLen);
    }
    if (ctype == MSAConsensusType_Default || !msa.alphabet->isAmino()) {
        char  defChar = (ctype == MSAConsensusType_Default) ? MAlignment_GapChar : ' ';
        foreach(const LRegion& r, regions) {
            for (int i = r.startPos, n = r.endPos(); i < n ; i++) {
                char pc = msa.alignedSeqs.first().sequence[i];
                if (pc == MAlignment_GapChar) {
                    pc = defChar;
                }
                for (int s = 1, nSeq = msa.getNumSequences(); s < nSeq; s++) {
                    char c = msa.alignedSeqs[s].sequence[i];
                    if (c != pc) {
                        pc = defChar;
                        break;
                    }
                }
                cons[i] = (pc == defChar) ? defChar : (ctype == MSAConsensusType_Default ? pc : '*');
            }
        }
    } else {
        assert(ctype == MSAConsensusType_ClustalW);
        /* From ClustalW doc:
            '*' indicates positions which have a single, fully conserved residue
            ':' indicates that one of the following 'strong' groups is fully conserved:
                STA, NEQK, NHQK, NDEQ, QHRK, MILV, MILF, HY, FYW, 
            '.' indicates that one of the following 'weaker' groups is fully conserved:
                CSA, ATV, SAG, STNK, STPA, SGND, SNDEQK, NDEQHK, NEQHRK, FVLIM, HFY
        */
        static QByteArray strongGroups[] = {"STA", "NEQK", "NHQK", "NDEQ", "QHRK", "MILV", "MILF", "HY", "FYW"};
        static QByteArray weakGroups[]   = {"CSA", "ATV", "SAG", "STNK", "STPA", "SGND", "SNDEQK", "NDEQHK", "NEQHRK", "FVLIM", "HFY"};
        static int maxStrongGroupLen = 4;
        static int maxWeakGroupLen = 6;

        QByteArray currentGroup;
        foreach(const LRegion& r, regions) {
            for (int ri = r.startPos, rn = r.endPos(); ri < rn ; ri++) {
                currentGroup.clear();
                for (int s = 0, nSeq = msa.getNumSequences(); s < nSeq; s++) {
                    char c = msa.alignedSeqs[s].sequence[ri];
                    if (!currentGroup.contains(c)) {
                        currentGroup.append(c);
                    }
                }
                char consChar = MAlignment_GapChar;
                if (currentGroup.size() == 1) {
                    consChar = (currentGroup[0] == MAlignment_GapChar) ? ' ' : '*';
                } else  {
                    bool ok = false;
                    int currentLen = currentGroup.length();
                    const char* currentGroupData = currentGroup.data();
                    //check strong groups
                    if (currentLen <= maxStrongGroupLen) {
                        for (int sgi=0, sgn = sizeof(strongGroups) / sizeof(QByteArray); sgi < sgn && !ok; sgi++) {
                            bool matches = true;
                            const QByteArray& sgroup = strongGroups[sgi];
                            for (int j=0; j < currentLen && matches; j++) {
                                char c = currentGroupData[j];
                                matches = sgroup.contains(c);
                            }
                            ok = matches;
                        }
                        if (ok) {
                            consChar = ':';
                        }
                    } 

                    //check weak groups
                    if (!ok && currentLen <= maxWeakGroupLen) {
                        for (int wgi=0, wgn = sizeof(weakGroups) / sizeof(QByteArray); wgi < wgn && !ok; wgi++) {
                            bool matches = true;
                            const QByteArray& wgroup = weakGroups[wgi];
                            for (int j=0; j < currentLen && matches; j++) {
                                char c = currentGroupData[j];
                                matches = wgroup.contains(c);
                            }
                            ok = matches;
                        }
                        if (ok) {
                            consChar = '.';
                        }
                    } 
                    //use default
                    if (!ok) {
                        consChar = ' ';
                    }
                } //amino
                cons[ri] = consChar;
            }
        }//all regions
    } //cons type
}


}//namespace
