//*****************************************************************************
//
// File:    DnaSequenceList.java
// Package: edu.rit.phyl.pars
// Unit:    Class edu.rit.phyl.pars.DnaSequenceList
//
// This Java source file is copyright (C) 2007 by Alan Kaminsky. All rights
// reserved. For further information, contact the author, Alan Kaminsky, at
// ark@cs.rit.edu.
//
// This Java source file is part of the Parallel Java Library ("PJ"). PJ is free
// software; you can redistribute it and/or modify it under the terms of the GNU
// General Public License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// PJ is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE. See the GNU General Public License for more details.
//
// A copy of the GNU General Public License is provided in the file gpl.txt. You
// may also obtain a copy of the GNU General Public License on the World Wide
// Web at http://www.gnu.org/licenses/gpl.html.
//
//******************************************************************************

package edu.rit.phyl.pars;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;

import java.util.Arrays;
import java.util.Scanner;

/**
 * Class DnaSequenceList provides a list of {@linkplain DnaSequence}s. A name
 * may be associated with each DNA sequence. Methods for reading and writing
 * textual files of DNA sequences are also provided.
 * <P>
 * Each DNA sequence consists of a sequence of <B>sites</B>. Each site has a
 * <B>state,</B> which is a set of <B>bases</B>. The four bases are adenine,
 * cytosine, guanine, and thymine. For textual I/O, each state is represented by
 * a single character as follows:
 * <P>
 * <TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>
 * <TR><TD><I>Char.</I></TD><TD WIDTH=20> </TD>
 * <TD><I>Meaning</I></TD><TD WIDTH=20> </TD>
 * <TD><I>Set</I></TD></TR>
 * <TR><TD>A</TD><TD WIDTH=20> </TD>
 * <TD>Adenine</TD><TD WIDTH=20> </TD>
 * <TD>(A)</TD></TR>
 * <TR><TD>C</TD><TD WIDTH=20> </TD>
 * <TD>Cytosine</TD><TD WIDTH=20> </TD>
 * <TD>(C)</TD></TR>
 * <TR><TD>G</TD><TD WIDTH=20> </TD>
 * <TD>Guanine</TD><TD WIDTH=20> </TD>
 * <TD>(G)</TD></TR>
 * <TR><TD>T</TD><TD WIDTH=20> </TD>
 * <TD>Thymine</TD><TD WIDTH=20> </TD>
 * <TD>(T)</TD></TR>
 * <TR><TD>Y</TD><TD WIDTH=20> </TD>
 * <TD>pYrimidine</TD><TD WIDTH=20> </TD>
 * <TD>(C or T)</TD></TR>
 * <TR><TD>R</TD><TD WIDTH=20> </TD>
 * <TD>puRine</TD><TD WIDTH=20> </TD>
 * <TD>(A or G)</TD></TR>
 * <TR><TD>W</TD><TD WIDTH=20> </TD>
 * <TD>"Weak"</TD><TD WIDTH=20> </TD>
 * <TD>(A or T)</TD></TR>
 * <TR><TD>S</TD><TD WIDTH=20> </TD>
 * <TD>"Strong"</TD><TD WIDTH=20> </TD>
 * <TD>(C or G)</TD></TR>
 * <TR><TD>K</TD><TD WIDTH=20> </TD>
 * <TD>"Keto"</TD><TD WIDTH=20> </TD>
 * <TD>(G or T)</TD></TR>
 * <TR><TD>M</TD><TD WIDTH=20> </TD>
 * <TD>"aMino"</TD><TD WIDTH=20> </TD>
 * <TD>(A or C)</TD></TR>
 * <TR><TD>B</TD><TD WIDTH=20> </TD>
 * <TD>not A</TD><TD WIDTH=20> </TD>
 * <TD>(C or G or T)</TD></TR>
 * <TR><TD>D</TD><TD WIDTH=20> </TD>
 * <TD>not C</TD><TD WIDTH=20> </TD>
 * <TD>(A or G or T)</TD></TR>
 * <TR><TD>H</TD><TD WIDTH=20> </TD>
 * <TD>not G</TD><TD WIDTH=20> </TD>
 * <TD>(A or C or T)</TD></TR>
 * <TR><TD>V</TD><TD WIDTH=20> </TD>
 * <TD>not T</TD><TD WIDTH=20> </TD>
 * <TD>(A or C or G)</TD></TR>
 * <TR><TD>X</TD><TD WIDTH=20> </TD>
 * <TD>unknown</TD><TD WIDTH=20> </TD>
 * <TD>(A or C or G or T)</TD></TR>
 * <TR><TD>-</TD><TD WIDTH=20> </TD>
 * <TD>deletion</TD><TD WIDTH=20> </TD>
 * <TD>()</TD></TR>
 * </TABLE>
 * <P>
 * The DNA sequence file format is that used by Joseph Felsenstein's Phylogeny
 * Inference Package (PHYLIP). While the file is a plain text file, it often has
 * the extension <TT>".phy"</TT> to indicate that it is in PHYLIP format. For
 * further information, see:
 * <UL>
 * <LI>
 * PHYLIP -- <A HREF="http://evolution.genetics.washington.edu/phylip/phylip.html">http://evolution.genetics.washington.edu/phylip/phylip.html</A>
 * <LI>
 * Input file format -- <A HREF="http://evolution.genetics.washington.edu/phylip/doc/sequence.html">http://evolution.genetics.washington.edu/phylip/doc/sequence.html</A>
 * </UL>
 * <P>
 * Here is an example of an input file:
 * <P>
 * <TABLE BORDER=1 CELLPADDING=4 CELLSPACING=0>
 * <TR>
 * <TD>
 * <PRE>  5    42
 * Turkey     AAGCTNGGGC ATTTCAGGGT 
 * Salmo gair AAGCCTTGGC AGTGCAGGGT 
 * H. Sapiens ACCGGTTGGC CGTTCAGGGT 
 * Chimp      AAACCCTTGC CGTTACGCTT 
 * Gorilla    AAACCCTTGC CGGTACGCTT 
 * 
 * GAGCCCGGGC AATACAGGGT AT
 * GAGCCGTGGC CGGGCACGGT AT
 * ACAGGTTGGC CGTTCAGGGT AA
 * AAACCGAGGC CGGGACACTC AT
 * AAACCATTGC CGGTACGCTT AA</PRE>
 * </TD>
 * </TR>
 * </TABLE>
 * <P>
 * The first line contains the number of species <I>S</I> and the number of
 * sites <I>N</I> in each sequence. <I>S</I> must be &gt;= 2. <I>N</I> must be
 * &gt;= 1.
 * <P>
 * The next <I>S</I> lines contain the initial data for each species. The first
 * ten characters contain the sequence name. This must be exactly ten
 * characters, padded with blanks if necessary. Then comes one character for
 * each site in the sequence. Uppercase and lowercase are considered the same.
 * Characters other than those for the states listed above are ignored. Often, a
 * blank is inserted every ten characters for readability, but this is not
 * necessary. After these <I>S</I> lines come zero or more blank lines for
 * readability, which are ignored. If there is more sequence data, the next
 * <I>S</I> lines give the states for the next sites in the sequences. This
 * continues for the rest of the file.
 * <P>
 * This is known as the "interleaved" file format. There is also a "sequential"
 * file format, but the sequential file format is not supported.
 * <P>
 * Thus, the complete sequence for each species in the example is:
 * <P>
 * <TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>
 * <TR><TD><I>Species</I></TD><TD WIDTH=20> </TD>
 * <TD><I>Sequence</I></TD></TR>
 * <TR><TD>Turkey</TD><TD WIDTH=20> </TD>
 * <TD><TT>AAGCTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT</TT></TD></TR>
 * <TR><TD>Salmo gair</TD><TD WIDTH=20> </TD>
 * <TD><TT>AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT</TT></TD></TR>
 * <TR><TD>H. Sapiens</TD><TD WIDTH=20> </TD>
 * <TD><TT>ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA</TT></TD></TR>
 * <TR><TD>Chimp</TD><TD WIDTH=20> </TD>
 * <TD><TT>AAACCCTTGCCGTTACGCTTAAACCGAGGCCGGGACACTCAT</TT></TD></TR>
 * <TR><TD>Gorilla</TD><TD WIDTH=20> </TD>
 * <TD><TT>AAACCCTTGCCGGTACGCTTAAACCATTGCCGGTACGCTTAA</TT></TD></TR>
 * </TABLE>
 * <P>
 * In the input file, the following alternate characters can be used: X, N, and
 * ? all mean "unknown." O (capital letter O) and - (hyphen) both mean
 * "deletion." The character . (period) means "the same as the corresponding
 * site in the first species." Here is another input file with the same
 * sequences as the one above:
 * <P>
 * <TABLE BORDER=1 CELLPADDING=4 CELLSPACING=0>
 * <TR>
 * <TD>
 * <PRE>  5    42
 * Turkey     AAGCTNGGGC ATTTCAGGGT 
 * Salmo gair ..G.CTT... AG.G...... 
 * H. Sapiens .CCGGTT... .G........ 
 * Chimp      ..A.CCTT.. .G..AC.CT. 
 * Gorilla    ..A.CCTT.. .GG.AC.CT. 
 * 
 * GAGCCCGGGC AATACAGGGT AT
 * .....GT... CGGG..C... ..
 * ACAGGTT... CG.T...... .A
 * A.A..GA... CGGGACACTC ..
 * A.A..ATT.. CGGTAC.CT. .A</PRE>
 * </TD>
 * </TR>
 * </TABLE>
 * <P>
 * Here are some more example DNA sequence files:
 * <UL>
 * <LI><A HREF="doc-files/example.phy">example.phy</A>
 * <LI><A HREF="doc-files/iguana16.phy">iguana16.phy</A>
 * <LI><A HREF="doc-files/iguana18.phy">iguana18.phy</A>
 * </UL>
 *
 * @author  Alan Kaminsky
 * @version 11-May-2007
 */
public class DnaSequenceList
	{

// Hidden data members.

	// The DNA sequences and their names. Element i of myName is the name of
	// element i of mySequence; the two arrays are kept the same length.
	DnaSequence[] mySequence;
	String[] myName;

// Exported constructors.

	/**
	 * Construct a new zero-length DNA sequence list.
	 */
	public DnaSequenceList()
		{
		this (0);
		}

	/**
	 * Construct a new DNA sequence list with the given length. Initially, all
	 * DNA sequences and names in the list are null.
	 *
	 * @param  N  Length (number of DNA sequences).
	 *
	 * @exception  NegativeArraySizeException
	 *     (unchecked exception) Thrown if <TT>N</TT> &lt; 0.
	 */
	public DnaSequenceList
		(int N)
		{
		this.mySequence = new DnaSequence [N];
		this.myName = new String [N];
		}

// Exported operations.

	/**
	 * Obtain this DNA sequence list's length.
	 *
	 * @return  Length (number of DNA sequences).
	 */
	public int length()
		{
		return mySequence.length;
		}

	/**
	 * Get the DNA sequence at the given index in this DNA sequence list.
	 *
	 * @param  index  Index.
	 *
	 * @return  DNA sequence.
	 *
	 * @exception  ArrayIndexOutOfBoundsException
	 *     (unchecked exception) Thrown if <TT>index</TT> is not in the range 0
	 *     .. <TT>length()</TT>-1.
	 */
	public DnaSequence getDnaSequence
		(int index)
		{
		return mySequence[index];
		}

	/**
	 * Set the DNA sequence at the given index in this DNA sequence list.
	 *
	 * @param  index  Index.
	 * @param  seq    DNA sequence.
	 *
	 * @exception  ArrayIndexOutOfBoundsException
	 *     (unchecked exception) Thrown if <TT>index</TT> is not in the range 0
	 *     .. <TT>length()</TT>-1.
	 */
	public void setDnaSequence
		(int index,
		 DnaSequence seq)
		{
		mySequence[index] = seq;
		}

	/**
	 * Get the DNA sequence name at the given index in this DNA sequence list.
	 *
	 * @param  index  Index.
	 *
	 * @return  DNA sequence name.
	 *
	 * @exception  ArrayIndexOutOfBoundsException
	 *     (unchecked exception) Thrown if <TT>index</TT> is not in the range 0
	 *     .. <TT>length()</TT>-1.
	 */
	public String getName
		(int index)
		{
		return myName[index];
		}

	/**
	 * Set the DNA sequence name at the given index in this DNA sequence list.
	 *
	 * @param  index  Index.
	 * @param  name   Name.
	 *
	 * @exception  ArrayIndexOutOfBoundsException
	 *     (unchecked exception) Thrown if <TT>index</TT> is not in the range 0
	 *     .. <TT>length()</TT>-1.
	 */
	public void setName
		(int index,
		 String name)
		{
		myName[index] = name;
		}

	/**
	 * Read a DNA sequence list from the given input file. The input file must
	 * be in interleaved PHYLIP format.
	 * <P>
	 * The DNA sequences' states are read from the input file. The DNA
	 * sequences' scores are set to 0, 1, 2, and so on in the order the DNA
	 * sequences appear in the input file; in other words, each DNA sequence's
	 * score is its index in the DNA sequence list.
	 *
	 * @param  file  File.
	 *
	 * @return  DNA sequence list.
	 *
	 * @exception  NullPointerException
	 *     (unchecked exception) Thrown if <TT>file</TT> is null.
	 * @exception  IOException
	 *     Thrown if an I/O error occurred. Thrown if the input file's contents
	 *     were invalid.
	 */
	public static DnaSequenceList read
		(File file)
		throws IOException
		{
		Scanner filescanner = new Scanner (file);

		try
			{
			// Read number of species and number of sites from first line.
			if (! filescanner.hasNextLine())
				{
				rethrowReadFailure (filescanner);
				throw new IOException
					("DnaSequenceList.read(\"" + file + "\"): " +
					 "Empty file");
				}
			Scanner linescanner = new Scanner (filescanner.nextLine());
			if (! linescanner.hasNextInt())
				{
				throw new IOException
					("DnaSequenceList.read(\"" + file + "\"): " +
					 "Number of species invalid or missing");
				}
			int S = linescanner.nextInt();
			if (S < 2)
				{
				throw new IOException
					("DnaSequenceList.read(\"" + file + "\"): " +
					 "Number of species must be >= 2");
				}
			if (! linescanner.hasNextInt())
				{
				throw new IOException
					("DnaSequenceList.read(\"" + file + "\"): " +
					 "Number of sites invalid or missing");
				}
			int N = linescanner.nextInt();
			if (N < 1)
				{
				throw new IOException
					("DnaSequenceList.read(\"" + file + "\"): " +
					 "Number of sites must be >= 1");
				}

			// Set up DNA sequence list and site count array. sitecount[s] is
			// the number of sites read so far for species s.
			DnaSequenceList sequence = new DnaSequenceList (S);
			int[] sitecount = new int [S];

			// Read sequence data from groups of S lines until EOF.
			fileloop: for (;;)
				{
				for (int s = 0; s < S; ++ s)
					{
					// Get a line of sequence data for species s.
					if (! filescanner.hasNextLine())
						{
						// Scanner reports a failed read as end-of-input;
						// surface the real cause instead of a format error.
						rethrowReadFailure (filescanner);

						// End of file is legal only at the start of a group,
						// and only after some data has been read.
						if (s != 0 || sitecount[s] == 0)
							{
							throw new IOException
								("DnaSequenceList.read(\"" + file + "\"): " +
								 "Missing a line of sequence data for species " +
								 (s+1));
							}
						break fileloop;
						}
					String line = filescanner.nextLine();

					// Ignore blank lines: undo the loop increment so the next
					// line is still attributed to species s.
					if (line.trim().equals (""))
						{
						-- s;
						continue;
						}

					// The first time, extract sequence name and create
					// DnaSequence object. "First time" is detected by the
					// missing DnaSequence rather than by a zero site count, so
					// a first line holding only a name does not cause the next
					// group's data to be mistaken for a name.
					if (sequence.mySequence[s] == null)
						{
						if (line.length() < 10)
							{
							throw new IOException
								("DnaSequenceList.read(\"" + file + "\"): " +
								 "Name must be 10 characters for species " +
								 (s+1));
							}
						sequence.mySequence[s] = new DnaSequence (N, s);
						sequence.myName[s] = line.substring (0, 10) .trim();
						line = line.substring (10);
						}

					// Parse characters in sequence data.
					int len = line.length();
					byte[] seq = sequence.mySequence[s].mySites;
					byte[] seq0 = sequence.mySequence[0].mySites;
					int count = sitecount[s];
					for (int i = 0; i < len; ++ i)
						{
						char c = line.charAt (i);
						if (c == '.')
							{
							// Same state as the corresponding site in the
							// first species.
							verifyCount (count, N, file, s);
							if (s == 0)
								{
								throw new IOException
									("DnaSequenceList.read(\"" + file +
									 "\"): " +
									 "'.' not allowed in species 1");
								}
							if (count >= sitecount[0])
								{
								throw new IOException
									("DnaSequenceList.read(\"" + file +
									 "\"): " +
									 "'.' in species " + (s+1) +
									 " has no corresponding site in species 1");
								}
							seq[count] = seq0[count];
							++ count;
							}
						else
							{
							// Characters that do not denote a state (blanks,
							// digits, and so on) are skipped.
							int state = stateForChar (c);
							if (state >= 0)
								{
								verifyCount (count, N, file, s);
								seq[count] = (byte) state;
								++ count;
								}
							}
						}
					sitecount[s] = count;
					}
				}

			// Verify correct site count for all species.
			for (int s = 0; s < S; ++ s)
				{
				if (sitecount[s] < N)
					{
					throw new IOException
						("DnaSequenceList.read(\"" + file + "\"): " +
						 "Too few sites for species " + (s+1));
					}
				else if (sitecount[s] > N)
					{
					throw new IOException
						("DnaSequenceList.read(\"" + file + "\"): " +
						 "Too many sites for species " + (s+1));
					}
				}

			// Return DNA sequence list.
			return sequence;
			}

		finally
			{
			filescanner.close();
			}
		}

	/**
	 * Rethrow the I/O exception, if any, that the given scanner suppressed
	 * while reading its underlying source.
	 *
	 * @param  scanner  Scanner.
	 *
	 * @exception  IOException
	 *     Thrown if the scanner recorded an I/O error.
	 */
	private static void rethrowReadFailure
		(Scanner scanner)
		throws IOException
		{
		IOException exc = scanner.ioException();
		if (exc != null) throw exc;
		}

	/**
	 * Map an input character to the state it denotes. In the state value, bit
	 * 0 stands for A, bit 1 for C, bit 2 for G, and bit 3 for T.
	 *
	 * @param  c  Input character.
	 *
	 * @return  State in the range 0 .. 15, or -1 if <TT>c</TT> does not denote
	 *          a state.
	 */
	private static int stateForChar
		(char c)
		{
		switch (c)
			{
			case 'O': case 'o': case '-':
				return 0;  // ----
			case 'A': case 'a':
				return 1;  // ---A
			case 'C': case 'c':
				return 2;  // --C-
			case 'M': case 'm':
				return 3;  // --CA
			case 'G': case 'g':
				return 4;  // -G--
			case 'R': case 'r':
				return 5;  // -G-A
			case 'S': case 's':
				return 6;  // -GC-
			case 'V': case 'v':
				return 7;  // -GCA
			case 'T': case 't':
				return 8;  // T---
			case 'W': case 'w':
				return 9;  // T--A
			case 'Y': case 'y':
				return 10; // T-C-
			case 'H': case 'h':
				return 11; // T-CA
			case 'K': case 'k':
				return 12; // TG--
			case 'D': case 'd':
				return 13; // TG-A
			case 'B': case 'b':
				return 14; // TGC-
			case 'X': case 'x': case 'N': case 'n': case '?':
				return 15; // TGCA
			default:
				return -1;
			}
		}

	/**
	 * Verify that there is room for one more site in a sequence being read.
	 *
	 * @param  count  Number of sites read so far for the species.
	 * @param  N      Number of sites each sequence must have.
	 * @param  file   Input file (for the error message).
	 * @param  s      Species index (0-based).
	 *
	 * @exception  IOException
	 *     Thrown if <TT>count</TT> &gt;= <TT>N</TT>.
	 */
	private static void verifyCount
		(int count,
		 int N,
		 File file,
		 int s)
		throws IOException
		{
		if (count >= N)
			{
			throw new IOException
				("DnaSequenceList.read(\"" + file + "\"): " +
				 "Too many sites for species " + (s+1));
			}
		}

	/**
	 * Write this DNA sequence list to the given output file. The output file is
	 * in interleaved PHYLIP format. This is the same as
	 * <TT>write(file,70,false)</TT>: the lines in the first group have 60
	 * sites each (following the 10-character name), the lines in later groups
	 * have 70 sites each, and periods are not used.
	 *
	 * @param  file      File.
	 *
	 * @exception  NullPointerException
	 *     (unchecked exception) Thrown if <TT>file</TT> is null.
	 * @exception  IOException
	 *     Thrown if an I/O error occurred.
	 */
	public void write
		(File file)
		throws IOException
		{
		write (file, 70, false);
		}

	/**
	 * Write this DNA sequence list to the given output file. The output file is
	 * in interleaved PHYLIP format.
	 * <P>
	 * Each line in the first group starts with the sequence name, padded or
	 * truncated to 10 characters, followed by up to <TT>sites</TT>-10 sites.
	 * Each line in the later groups has up to <TT>sites</TT> sites. A blank is
	 * printed before every site whose index is a multiple of 10, except at the
	 * beginning of a line in the later groups. A blank line is printed after
	 * each group.
	 *
	 * @param  file      File.
	 * @param  sites     Number of sites per output line.
	 * @param  periods   True to use periods, false not to use periods.
	 *
	 * @exception  NullPointerException
	 *     (unchecked exception) Thrown if <TT>file</TT> is null.
	 * @exception  IllegalArgumentException
	 *     (unchecked exception) Thrown if <TT>sites</TT> &lt;= 10.
	 * @exception  IOException
	 *     Thrown if an I/O error occurred.
	 */
	public void write
		(File file,
		 int sites,
		 boolean periods)
		throws IOException
		{
		if (sites <= 10)
			{
			throw new IllegalArgumentException
				("DnaSequenceList.write(): sites = " + sites + " illegal");
			}

		PrintStream ps =
			new PrintStream
				(new BufferedOutputStream
					(new FileOutputStream (file)));

		try
			{
			// Print number of species and number of sites.
			int S = mySequence.length;
			int N = mySequence[0].length();
			ps.print (S);
			ps.print (' ');
			ps.print (N);
			ps.println();

			// Print groups of sites for each species. On the first line, print
			// sequence name, padded or truncated to 10 characters. Sites
			// lb .. ub-1 make up the current group.
			int lb = 0;
			int ub = Math.min (sites-10, N);
			byte[] seq0 = mySequence[0].mySites;
			while (lb < N)
				{
				for (int s = 0; s < S; ++ s)
					{
					byte[] seq = mySequence[s].mySites;
					if (lb == 0) ps.print (padName (myName[s]));
					for (int i = lb; i < ub; ++ i)
						{
						// Separate blocks of 10 sites with a blank, but do not
						// begin a nameless line with a blank.
						if ((lb == 0 || i > lb) && i % 10 == 0)
							{
							ps.print (' ');
							}
						if (periods && s > 0 && seq[i] == seq0[i])
							{
							ps.print ('.');
							}
						else
							{
							ps.print (DnaSequence.state2char [seq[i]]);
							}
						}
					ps.println();
					}
				ps.println();
				lb = ub;
				ub = Math.min (ub+sites, N);
				}

			// Check for I/O errors. PrintStream never throws IOException; it
			// only records that an error happened.
			if (ps.checkError())
				{
				throw new IOException
					("DnaSequenceList.write(\"" + file + "\"): I/O error");
				}
			}

		finally
			{
			ps.close();
			}
		}

	/**
	 * Pad or truncate the given name to exactly 10 characters.
	 *
	 * @param  name  Name, or null.
	 *
	 * @return  The first 10 characters of <TT>name</TT>, padded on the right
	 *          with blanks if <TT>name</TT> is shorter than 10 characters; or
	 *          10 blanks if <TT>name</TT> is null.
	 */
	private static String padName
		(String name)
		{
		if (name == null) return padding[0];
		int len = name.length();
		if (len == 10)
			{
			return name;
			}
		else if (len > 10)
			{
			return name.substring (0, 10);
			}
		else
			{
			return name + padding[len];
			}
		}

	// Blank strings for padding names; padding[i] fills out a name of length i
	// to 10 characters.
	private static final String[] padding = new String[]
		{/*0*/ "          ",
		 /*1*/ "         ",
		 /*2*/ "        ",
		 /*3*/ "       ",
		 /*4*/ "      ",
		 /*5*/ "     ",
		 /*6*/ "    ",
		 /*7*/ "   ",
		 /*8*/ "  ",
		 /*9*/ " "};

	/**
	 * Truncate this DNA sequence list to the given length. Both the DNA
	 * sequences and their names beyond the new length are discarded. If this
	 * list is already no longer than <TT>len</TT>, the <TT>truncate()</TT>
	 * method does nothing.
	 *
	 * @param  len  Length.
	 *
	 * @exception  NegativeArraySizeException
	 *     (unchecked exception) Thrown if <TT>len</TT> &lt; 0.
	 */
	public void truncate
		(int len)
		{
		if (len < mySequence.length)
			{
			// Allocate both arrays before storing either, so that a negative
			// length leaves this list unchanged.
			DnaSequence[] newSequence = new DnaSequence [len];
			String[] newName = new String [len];
			System.arraycopy (mySequence, 0, newSequence, 0, len);
			System.arraycopy
				(myName, 0, newName, 0, Math.min (len, myName.length));
			mySequence = newSequence;
			myName = newName;
			}
		}

	/**
	 * Warn of duplicates in this DNA sequence list. For each pair of DNA
	 * sequences that are equal to each other, a warning message is printed on
	 * the standard output.
	 */
	public void warnOfDuplicates()
		{
		int S = mySequence.length;
		for (int i = 0; i < S-1; ++ i)
			{
			for (int j = i+1; j < S; ++ j)
				{
				if (mySequence[i].equals (mySequence[j]))
					{
					System.out.println
						("WARNING: Sequence \"" + myName[i] +
						 "\" and sequence \"" + myName[j] +
						 "\" are identical");
					}
				}
			}
		}

	/**
	 * Excise uninformative sites from the DNA sequences in this DNA sequence
	 * list.
	 * <P>
	 * Each site in the DNA sequences is either "uninformative" or
	 * "informative," defined as follows:
	 * <UL>
	 * <LI>
	 * If the site has the same state (A, C, G, or T) in all sequences, the
	 * site is uninformative. This site will contribute no state changes to the
	 * parsimony score in every possible phylogenetic tree.
	 * <P><LI>
	 * If the site has the same state in all sequences, except for one or more
	 * sequences that have a unique state at that site (i.e., a state that
	 * appears in no other sequences at that site), the site is uninformative.
	 * The site will contribute the same number of state changes to the
	 * parsimony score in every possible phylogenetic tree, namely the number of
	 * different states that appear at that site, minus 1.
	 * <P><LI>
	 * Otherwise, the site is informative. There are at least two different
	 * states at that site that each appear in at least two different
	 * sequences. The site will contribute a different number of state changes
	 * to the parsimony score, depending on where the sequences appear in the
	 * phylogenetic tree.
	 * </UL>
	 * <P>
	 * Since the uninformative sites do not affect the outcome of a maximum
	 * parsimony phylogenetic tree search, the uninformative sites can be
	 * omitted from the tree scoring process to save time. The informative sites
	 * do affect the outcome and must be included in the tree scoring process.
	 * <P>
	 * The <TT>exciseUninformativeSites()</TT> method stores in the output
	 * <TT>excised</TT> argument a list of DNA sequences that are the same as
	 * those in this DNA sequence list, except all the uninformative sites have
	 * been removed. The DNA sequence names in the <TT>excised</TT> list are set
	 * to the same names as this DNA sequence list. The <TT>excised</TT>
	 * argument may be this DNA sequence list itself.
	 * <P>
	 * The <TT>exciseUninformativeSites()</TT> method returns the number of
	 * state changes that the (excised) uninformative sites contribute to the
	 * parsimony score.
	 *
	 * @param  excised  Output DNA sequence list with uninformative sites
	 *                  excised.
	 *
	 * @return  Number of state changes the uninformative sites contribute to
	 *          the parsimony score.
	 *
	 * @exception  NullPointerException
	 *     (unchecked exception) Thrown if <TT>excised</TT> is null.
	 */
	public int exciseUninformativeSites
		(DnaSequenceList excised)
		{
		if (excised == null)
			{
			throw new NullPointerException
				("DnaSequenceList.exciseUninformativeSites(): excised is null");
			}

		int S = this.mySequence.length;
		int N = this.mySequence[0].length();

		// Allocate storage to remember each site's category: true =
		// informative, false = uninformative. Also count number of informative
		// sites and number of state changes in uninformative sites.
		boolean[] isInformative = new boolean [N];
		int nInformative = 0;
		int nChanges = 0;

		// Allocate storage to count states at each site.
		int[] stateCount = new int [16];

		// Examine all sites.
		for (int i = 0; i < N; ++ i)
			{
			Arrays.fill (stateCount, 0);

			// Examine current site in all sequences.
			for (int s = 0; s < S; ++ s)
				{
				++ stateCount[this.mySequence[s].mySites[i]];
				}

			// Count how many different states appear at this site, and how
			// many of those appear in two or more sequences.
			int nStates = 0;
			int nShared = 0;
			for (int j = 0; j < 16; ++ j)
				{
				if (stateCount[j] > 0) ++ nStates;
				if (stateCount[j] >= 2) ++ nShared;
				}

			// Categorize current site.
			if (nShared >= 2)
				{
				// Informative site.
				isInformative[i] = true;
				++ nInformative;
				}
			else
				{
				// Uninformative site. Increase number of state changes by
				// (number of different states - 1).
				isInformative[i] = false;
				nChanges += nStates - 1;
				}
			}

		// Build the excised DNA sequences and names in local arrays, and only
		// then store them in the output list. This way the source data is not
		// clobbered part way through if the output list is this list itself.
		DnaSequence[] newSequence = new DnaSequence [S];
		for (int s = 0; s < S; ++ s)
			{
			DnaSequence original = this.mySequence[s];
			DnaSequence shortened =
				new DnaSequence (nInformative, original.myScore);

			// Copy informative sites to excised DNA sequence.
			byte[] excSites = shortened.mySites;
			byte[] mySites = original.mySites;
			int j = 0;
			for (int i = 0; i < N; ++ i)
				{
				if (isInformative[i])
					{
					excSites[j++] = mySites[i];
					}
				}
			newSequence[s] = shortened;
			}

		// Copy DNA sequence names.
		String[] newName = new String [S];
		System.arraycopy (this.myName, 0, newName, 0, S);

		excised.mySequence = newSequence;
		excised.myName = newName;

		// Return number of state changes.
		return nChanges;
		}

	/**
	 * Shuffle the DNA sequences in this list into descending order of distance
	 * from each other. This is a heuristic for a branch-and-bound search; if
	 * the DNA sequences are added to the phylogenetic tree in descending order
	 * of distance, the search tends to take less time, because the best
	 * (smallest) parsimony score tends to be found sooner.
	 * <P>
	 * Specifically, the <TT>shuffleDescendingDistance()</TT> method:
	 * <OL TYPE=1>
	 * <LI>
	 * Finds the DNA sequence that is most distant from all the others. This
	 * becomes the first DNA sequence in the list. The Hamming distance is used;
	 * i.e., the number of sites that disagree between two DNA sequences.
	 * <P><LI>
	 * Finds the DNA sequence that is most distant from the first DNA sequence.
	 * This becomes the second DNA sequence in the list.
	 * <P><LI>
	 * Finds the DNA sequence that is next most distant from the first DNA
	 * sequence. This becomes the third DNA sequence in the list.
	 * <P><LI>
	 * And so on.
	 * </OL>
	 * <P>
	 * If this list has two or fewer DNA sequences, the list is left unchanged.
	 */
	public void shuffleDescendingDistance()
		{
		shuffleByDistance (true);
		}

	/**
	 * Shuffle the DNA sequences in this list into ascending order of distance
	 * from each other.
	 * <P>
	 * Specifically, the <TT>shuffleAscendingDistance()</TT> method:
	 * <OL TYPE=1>
	 * <LI>
	 * Finds the DNA sequence that is least distant from all the others. This
	 * becomes the first DNA sequence in the list. The Hamming distance is used;
	 * i.e., the number of sites that disagree between two DNA sequences.
	 * <P><LI>
	 * Finds the DNA sequence that is least distant from the first DNA sequence.
	 * This becomes the second DNA sequence in the list.
	 * <P><LI>
	 * Finds the DNA sequence that is next least distant from the first DNA
	 * sequence. This becomes the third DNA sequence in the list.
	 * <P><LI>
	 * And so on.
	 * </OL>
	 * <P>
	 * If this list has two or fewer DNA sequences, the list is left unchanged.
	 */
	public void shuffleAscendingDistance()
		{
		shuffleByDistance (false);
		}

	/**
	 * Shuffle the DNA sequences (and their names) in this list by distance.
	 * The sequence with the largest (descending) or smallest (ascending) total
	 * distance to all the sequences is moved to the front; the remaining
	 * sequences are then selection-sorted by their distance from that first
	 * sequence. Among equally distant candidates, the one currently at the
	 * lowest index is chosen.
	 *
	 * @param  descending  True for descending order of distance, false for
	 *                     ascending order of distance.
	 */
	private void shuffleByDistance
		(boolean descending)
		{
		// Early return if two sequences or fewer.
		int S = mySequence.length;
		if (S <= 2) return;

		// Find the DNA sequence that is most (least) distant from all the
		// others.
		int extreme = descending ? -1 : Integer.MAX_VALUE;
		int a = -1;
		for (int i = 0; i < S; ++ i)
			{
			int dist = 0;
			for (int j = 0; j < S; ++ j) dist += distance (i, j);
			if (descending ? dist > extreme : dist < extreme)
				{
				extreme = dist;
				a = i;
				}
			}

		// Make this the first DNA sequence in the list.
		swap (0, a);

		// The first DNA sequence is now fixed, so each remaining sequence's
		// distance from it need only be computed once. fromFirst[j] always
		// belongs to the sequence currently at index j.
		int[] fromFirst = new int [S];
		for (int j = 1; j < S; ++ j) fromFirst[j] = distance (0, j);

		// For all remaining DNA sequences...
		for (int i = 1; i < S; ++ i)
			{
			// Find the DNA sequence that is most (least) distant from the
			// first.
			extreme = descending ? -1 : Integer.MAX_VALUE;
			a = -1;
			for (int j = i; j < S; ++ j)
				{
				int dist = fromFirst[j];
				if (descending ? dist > extreme : dist < extreme)
					{
					extreme = dist;
					a = j;
					}
				}

			// Make this the next DNA sequence in the list.
			swap (i, a);
			int tmp = fromFirst[i];
			fromFirst[i] = fromFirst[a];
			fromFirst[a] = tmp;
			}
		}

	/**
	 * Exchange the DNA sequences and the names at the two given indexes.
	 *
	 * @param  i  First index.
	 * @param  j  Second index.
	 */
	private void swap
		(int i,
		 int j)
		{
		DnaSequence seq = mySequence[i];
		mySequence[i] = mySequence[j];
		mySequence[j] = seq;
		String name = myName[i];
		myName[i] = myName[j];
		myName[j] = name;
		}

	/**
	 * Compute the Hamming distance between the DNA sequences at the two given
	 * indexes; i.e., the number of sites whose states differ. The number of
	 * sites compared is the length of the DNA sequence at index <TT>i</TT>.
	 *
	 * @param  i  First index.
	 * @param  j  Second index.
	 *
	 * @return  Number of differing sites.
	 */
	private int distance
		(int i,
		 int j)
		{
		int N = mySequence[i].length();
		byte[] site_i = mySequence[i].mySites;
		byte[] site_j = mySequence[j].mySites;
		int dist = 0;
		for (int k = 0; k < N; ++ k)
			{
			if (site_i[k] != site_j[k]) ++ dist;
			}
		return dist;
		}

	}
