/*************************************************************************
 *
 *  $RCSfile: LiteMorph.java,v $
 *
 *  $Revision: 1.1 $
 *
 *  last change: $Author: abi $ $Date: 2000/11/30 18:03:09 $
 *
 *  The Contents of this file are made available subject to the terms of
 *  either of the following licenses
 *
 *         - GNU Lesser General Public License Version 2.1
 *         - Sun Industry Standards Source License Version 1.1
 *
 *  Sun Microsystems Inc., October, 2000
 *
 *  GNU Lesser General Public License Version 2.1
 *  =============================================
 *  Copyright 2000 by Sun Microsystems, Inc.
 *  901 San Antonio Road, Palo Alto, CA 94303, USA
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License version 2.1, as published by the Free Software Foundation.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *  MA  02111-1307  USA
 *
 *
 *  Sun Industry Standards Source License Version 1.1
 *  =================================================
 *  The contents of this file are subject to the Sun Industry Standards
 *  Source License Version 1.1 (the "License"); You may not use this file
 *  except in compliance with the License. You may obtain a copy of the
 *  License at http://www.openoffice.org/license.html.
 *
 *  Software provided under this License is provided on an "AS IS" basis,
 *  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING,
 *  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
 *  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
 *  See the License for the specific provisions governing your rights and
 *  obligations concerning the Software.
 *
 *  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
 *
 *  Copyright: 2000 by Sun Microsystems, Inc.
 *
 *  All Rights Reserved.
 *
 *  Contributor(s): _______________________________________
 *
 *
 ************************************************************************/

/*
 * @(#) LiteMorph.java 1.5 - last change made 04/09/99
 */

package com.sun.xmlsearch.qe;

import java.io.*;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.Hashtable;
import java.util.Enumeration;

/**
 * This class will generate an array of morphological variants of a word
 * to use in search-and-retrieval applications where a user wants to find 
 * other words morphologically related to words in a query.  For example, 
 * a request such as "electrical fixtures" should also retrieve "electric
 * fixtures," "electrical fixture," etc.  Given a word of a query, these 
 * rules generate alternative variations of that word that should also be 
 * considered.  This generation of variant forms of a word fills a role 
 * similar to that often filled by the use of wild card characters or by 
 * stemming rules that produce truncated stems in traditional information
 * retrieval systems.  The approach of generating alternative variations
 * has advantages over a truncated stemming approach for many applications,
 * because it does not require stemming operations during the indexing
 * process, does not require extra indexing space for stems, nor does it
 * lose information by storing only stems.  Rather, the variation rules
 * are applied to the query to produce additional forms to check against
 * the index.
 * <p>
 * Compared to the use of wild card characters, this approach has two 
 * advantages: first, it does not require the user to think about where 
 * the wild cards should be placed, and secondly, it deals with irregular
 * variations such as irregular verbs (e.g., "break," "broke," "broken"),
 * and with stem ending effects such as silent e's and doubling of final 
 * consonants (e.g., "dig," "digs," "digging").  The rules presented here, 
 * together with a table of exceptions, provided at the end, deal with 
 * a wide range of such effects, without requiring any more attention on 
 * the part of the user than to turn on the facility.
 * <p>
 * These rules generate regular morphological variants of words using the 
 * suffixes s, er, ers, est, ed, ing, ly, ful, less, ness, and ment.  Rules 
 * are included for dealing with some words ending in -ize, -ise, -ic and 
 * -ical, and for some words requiring irregular forms, such as -leaf and
 * -man compounds (flyleaf, footman), and Latin words ending in -um and -a, 
 * such as datum.  The rules are supplemented by a list of exceptions for
 * words that do not inflect regularly.  They are not intended to apply to
 * function words or to proper names.  When expanding queries, you may not
 * want to apply them to capitalized words or to hyphenated words like
 * day-to-day and low-level.
 * <p>
 * The rules treat almost all words as if they were multiply meaningful
 * as nouns, verbs, and adjectives.  Hence, the rules will often generate
 * spurious forms that should never occur in a text -- e.g., fix ->
 * fixest, happy -> happied.  The rules are suitable for applications
 * such as searching text using inverted files, where a quick test
 * suffices to determine that a given candidate does not occur in the
 * corpus.  In such applications, it is preferable to overgenerate
 * candidates than to miss possible retrievals.
 * <p>
 * The program uses rules developed by W. A. Woods and Ellen Hays in 1992.
 * An original C program for using them was developed by Jacek Ambroziak
 * and was included in Sun's SearchIt product.
 *
 * @author Roger D. Brinkley
 * @author W. A. Woods
 * @author Jacek Ambroziak
 * @version	1.5	04/09/99
 * 
 * @see Rule
 */

public abstract class LiteMorph {

    /**
     * Variants collected for the word currently being expanded.
     * Only non-null while {@link #variantsOf(String)} is running.
     */
    protected static Vector variants;

    /**
     * Maps a word ending (a String key) to the <code>Rule[]</code> tried for
     * words with that ending.  The key <code>"default"</code> holds the rules
     * used when no ending matches.
     */
    protected static Hashtable rulesTable;

    /**
     * Words that must not be added to {@link #variants} (the input word and
     * everything already added).  Only non-null while
     * {@link #variantsOf(String)} is running.
     */
    protected static Hashtable blockedVariants;

    /**
     * Maps an irregular word to the space-delimited group(s) of forms it
     * belongs to.  Filled by {@link #initialize(String[])}.
     */
    protected static Hashtable exceptions;

    /**
     * Thread that currently holds the lock taken by {@link #threadLock()},
     * or null when the lock is free.  Guarded by this object's monitor.
     */
    private transient Thread currThread;

    /**
     * Creates the morphology engine and runs the subclass-specific
     * {@link #initialize()} hook.
     * <p>
     * Note that <code>initialize()</code> therefore runs before any
     * subclass constructor body or instance field initializer.
     */
    public LiteMorph() {
        initialize();
    }

    /**
     * Returns the morphology engine.  This base implementation has none to
     * offer and always returns null.
     * NOTE(review): locale-specific subclasses presumably declare their own
     * static <code>getMorph()</code> -- confirm against the subclasses.
     *
     * @return always null
     */
    public static LiteMorph getMorph() {
        return null;
    }

    /**
     * Subclasses of this class (generally locale specific)
     * need to set up exceptions and rules. At a minimum
     * implementations need to create the exceptions Hashtable
     * and establish the rules Hashtable.
     * Implementations have the option of filling exceptions
     * directly in this method or calling initialize(String []).
     * After initialization the exception table array should be
     * released so that it can be garbage collected.
     */
    protected abstract void initialize();

    /**
     * One time initialization of the exceptions Hashtable using an
     * array of Strings. Each String is a variation group whose words
     * are space delimited. Every word of a group is mapped to the whole
     * group, so that looking up any one of them yields all the others.
     * A word occurring in several groups is mapped to the groups joined
     * by a space.
     * <p>
     * Does nothing when the exceptions Hashtable has not been created yet
     * or when <code>exceptionTable</code> is null; null elements of the
     * array are skipped.
     *
     * @param exceptionTable the variation groups, one per element
     */
    protected void initialize(String[] exceptionTable) {

        // Firewall
        if (exceptions == null || exceptionTable == null) {
            return;
        }
        for (int i = 0; i < exceptionTable.length; i++) {
            String group = exceptionTable[i];
            if (group == null) {
                continue;
            }
            StringTokenizer tokens = new StringTokenizer(group, " ");
            while (tokens.hasMoreTokens()) {
                String form = tokens.nextToken();
                String known = (String) exceptions.get(form);
                if (known == null) {
                    exceptions.put(form, group);
                } else {
                    // the same form can occur in several groups that must be appended
                    exceptions.put(form, known + " " + group);
                }
            }
        }
    }

    /**
     * Get the variants of the given word. These are the locale
     * specific variants of a word as supplied by the locale
     * implementation of this class.
     * <p>
     * The input word itself is never part of the result.  Calls on the
     * same instance are serialized through {@link #threadLock()}.  The
     * scratch tables are static, so calls made through different
     * instances are not protected against each other.
     *
     * @param word the word to expand; null yields an empty array
     * @return String[] an array of words that are variations of word,
     *         possibly empty, never null
     */
    public String[] variantsOf(String word) {
        if (word == null) {
            // Hashtable rejects null keys; there is nothing to expand anyway
            return new String[0];
        }

        threadLock();
        try {
            // initialize variants and blockedVariants
            variants = new Vector();
            blockedVariants = new Hashtable();
            // this blocks adding the input word itself
            blockedVariants.put(word, word);

            // Go get the morphological variants of the word.
            morphWord(word, 0);

            String[] result = new String[variants.size()];
            variants.copyInto(result);
            return result;
        } finally {
            // Always release the scratch tables for garbage collection and
            // give the lock back, even when a rule or a lookup threw;
            // otherwise every later call would wait forever.
            blockedVariants = null;
            variants = null;
            threadUnlock();
        }
    }

    /**
     * Morph the word into other words if possible.  Found variants are
     * added to the variant list being built by
     * {@link #variantsOf(String)}.
     * <p>
     * A word found among the exceptions contributes its exception group(s)
     * and is not run through the rules.  Words containing a hyphen are
     * only checked against the exceptions.  Otherwise the rules registered
     * for an ending of the word (or the "default" rules) are tried in
     * order and the first rule that produces results wins.
     *
     * @param word  the word to analyze
     * @param depth the current recursion depth; nothing is done beyond depth 2
     */
    protected void morphWord(String word, int depth) {

        if (debugFlag) {
            debug(" analyzing: " + word + " at depth " + depth);
        }

        if (depth > 2) {
            return;
        }

        // if a word is found among exceptions, don't try rules
        String exceptionList =
            (exceptions == null) ? null : (String) exceptions.get(word);
        if (exceptionList != null && exceptionList.length() > 0) {
            StringTokenizer tokens = new StringTokenizer(exceptionList, " ");
            while (tokens.hasMoreTokens()) {
                addVariant(tokens.nextToken());
            }
            if (debugFlag) {
                debug("   " + word + ": found match in exceptions -- "
                      + exceptionList + ", at depth " + depth);
            }
            return;
        }

        // don't apply rules to words with internal hyphens (but check exceptions)
        if (word.indexOf("-") >= 0) {
            return;
        }

        if (rulesTable == null) {
            // the subclass did not establish any rules
            return;
        }

        Rule[] rules = null;
        int skipnum = 0;

        // See if the word ends with one of the keys in the rulesTable.
        // The first matching key in the table's enumeration order is used;
        // Hashtable does not define that order.
        Enumeration keys = rulesTable.keys();
        while (keys.hasMoreElements()) {
            String key = (String) keys.nextElement();
            if (word.endsWith(key) && !key.equals("default")) {
                rules = (Rule[]) rulesTable.get(key);
                skipnum = key.length();
                break;
            }
        }
        if (rules == null) {
            // no match try to get the "default" rules.
            rules = (Rule[]) rulesTable.get("default");
            skipnum = 0;
        }
        if (rules == null) {
            // neither specific nor default rules are available
            return;
        }

        for (int i = 0; i < rules.length; i++) {
            if (debugFlag) {
                debug("  " + word + ": trying rule: " + rules[i]
                      + ", at depth " + depth);
            }
            String[] results = rules[i].match(word, depth, skipnum);
            if (results != null && results.length > 0) {
                if (debugFlag) {
                    debug("  " + word + ": found match for: " + rules[i]
                          + ", at depth " + depth);
                }
                addVariant(word); //do this here -- i.e., only when a rule matches
                for (int j = 0; j < results.length; j++) {
                    addVariant(results[j]);
                }
                break;
            }
        }
    }

    /**
     * Add the variant of the word to the list of words unless it is
     * blocked, i.e. it is the input word or has been added before.
     */
    private void addVariant(String word) {
        if (blockedVariants.get(word) == null) { // word is not blocked
            variants.addElement(word);
            blockedVariants.put(word, word); // block it from being added again
        }
    }

    /**
     * Convenience method for creating Rules
     *
     * @param expression passed through to the Rule constructor
     * @param expansions passed through to the Rule constructor
     * @param morph      passed through to the Rule constructor
     * @return the newly created Rule
     */
    protected static Rule r(String expression, String expansions, LiteMorph morph) {
        return new Rule(expression, expansions, morph);
    }

    /**
     * Acquires the lock of this instance, waiting until no other thread
     * holds it.  {@link #variantsOf(String)} uses it to keep concurrent
     * calls from sharing the scratch tables.
     * <p>
     * The lock is always held when this method returns.  If the thread is
     * interrupted while waiting, waiting continues and the interrupt
     * status is restored before returning.  The lock is not reentrant: a
     * thread calling this method twice without an intervening
     * {@link #threadUnlock()} blocks.
     */
    public final synchronized void threadLock() {
        boolean interrupted = false;
        while (currThread != null) {
            try {
                wait();
            } catch (InterruptedException e) {
                // Must not return without the lock: the caller would run
                // unprotected and later release a lock it never held.
                interrupted = true;
            }
        }
        currThread = Thread.currentThread();
        if (interrupted) {
            // hand the interrupt on to the caller
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Releases the lock taken by {@link #threadLock()} and wakes up one
     * thread waiting to acquire it.
     */
    public final synchronized void threadUnlock() {
        currThread = null;
        notify();
    }

    /**
     * For printf debugging.
     */
    private static final boolean debugFlag = false;

    /**
     * Prints the message to standard output when debugging is enabled.
     */
    private static void debug(String str) {
        if (debugFlag) {
            System.out.println("LiteMorph: " + str);
        }
    }
}
