package aconcorde;

import java.util.*;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.File;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.regex.Pattern;

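/** A class to build a concordance (keyword-in-context listing) for a text corpus.
 *
 * <p>
 * The Concordance class indexes the words of a text in a style similar to an
 * inverted file, and uses that index to show each occurrence of a word
 * together with its surrounding context.
 * </p>
 *
 * @author Andrew Roberts
 * @version 1.0 (2003-10-21)
 */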

public class Concordance {

    // Constants //
    //

    /** Orientation value for left-to-right text output. */
    public static final int LEFT_TO_RIGHT = 1;

    /** Orientation value for right-to-left text output. */
    public static final int RIGHT_TO_LEFT = 2;

    /** Matches a line that is empty or consists only of whitespace. */
    private static final Pattern BLANK_LINE = Pattern.compile("^\\s*$");

    // Instance variables //
    //

    /**
     * The unique words of the text as WordFreq objects, kept in the order
     * expected by Collections.binarySearch. Empty until a text is loaded.
     */
    protected List wordList = new Vector();

    /**
     * One WordIndexWithPunc per word occurrence, in the order the words occur
     * in the text. Each entry refers to a position in wordList. Empty until
     * an index is created.
     */
    protected List wordIndex = new Vector();

    /** Number of words counted by the most recent call to createWordList. */
    protected int numberOfWords = 0;

    /** Number of context words shown on each side of a concordance hit. */
    private int windowSize = 5;

    /** Locale used to decide where word boundaries fall. */
    protected Locale currentLocale;

    /** Either LEFT_TO_RIGHT or RIGHT_TO_LEFT. */
    private int textOrientation;

    /** Name of the character encoding used to read input files. */
    protected String encoding = "UTF8";

    /**
     * Creates an empty Concordance object that uses the default Locale and
     * left-to-right output.
     */

    public Concordance() {

        this(Locale.getDefault());
    }

    /**
     * Creates an empty Concordance object that uses the given Locale for
     * word splitting, and left-to-right output.
     *
     * @param cLocale The Locale that determines word boundaries.
     */

    public Concordance(Locale cLocale) {

        currentLocale = cLocale;
        textOrientation = LEFT_TO_RIGHT;
    }

    /**
     * Opens <em>inFile</em> for reading using the current encoding.
     * The underlying stream is closed again if the reader cannot be
     * created (for example because the encoding name is not supported).
     *
     * @param inFile The file to open.
     * @return A buffered reader over the file; the caller must close it.
     * @throws IOException if the file cannot be opened or the encoding is
     * not supported.
     */

    private BufferedReader openReader(File inFile) throws IOException {

        FileInputStream stream = new FileInputStream(inFile);
        boolean wrapped = false;

        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream, encoding));
            wrapped = true;
            return reader;
        }
        finally {
            if (!wrapped) {
                stream.close();
            }
        }
    }

    /**
     * A method to extract a String into its composite words. Decisions
     * on how to split words are dependent on the current Locale. Only
     * tokens that begin with a letter or digit are kept; whitespace and
     * punctuation tokens are dropped.
     *
     * @param target The target String from which the words will be
     * extracted
     * @return A List of the words (as Strings), in order of occurrence.
     */

    private List extractWords(String target) {

        List words = new Vector();

        BreakIterator wordIterator = BreakIterator.getWordInstance(currentLocale);
        wordIterator.setText(target);

        int start = wordIterator.first();

        for (int end = wordIterator.next(); end != BreakIterator.DONE; end = wordIterator.next()) {
            String token = target.substring(start, end);
            if (Character.isLetterOrDigit(token.charAt(0))) {
                words.add(token);
            }
            start = end;
        }

        return words;
    }

    /**
     * Splits a String into words, keeping the non-word text (spaces and
     * punctuation) that surrounds them. Text found before the first word
     * becomes that word's left punctuation; text found after a word becomes
     * that word's right punctuation. Word boundaries are decided by the
     * current Locale, exactly as in the word list, so every word returned
     * here can be found in the word list built from the same text.
     *
     * @param target The line of text to split.
     * @return A List of WordWithPunctuation objects, in order of occurrence.
     * The List is empty if <em>target</em> contains no words.
     */

    public List extractWordsWithPunctuation(String target) {

        // The regex "$" matches at the end of the input, so this appends a
        // space to the line. The last word of the line thereby gets a
        // separator in its right punctuation, which keeps words apart when
        // consecutive lines are joined by getSequenceOfWords.
        target = target.replaceAll("$", " ");

        BreakIterator wordIterator = BreakIterator.getWordInstance(currentLocale);
        wordIterator.setText(target);

        List words = new Vector();

        boolean seenWord = false;
        String leftPunc = "";
        String rightPunc = "";
        String currentWord = "";

        int start = wordIterator.first();

        for (int end = wordIterator.next(); end != BreakIterator.DONE; end = wordIterator.next()) {
            String token = target.substring(start, end);

            if (Character.isLetterOrDigit(token.charAt(0))) {
                if (seenWord) {
                    // A new word starts, so the previous one is complete.
                    words.add(new WordWithPunctuation(leftPunc, currentWord, rightPunc));
                    leftPunc = "";
                    rightPunc = "";
                }
                seenWord = true;
                currentWord = token;
            }
            else if (seenWord) {
                rightPunc += token;
            }
            else {
                leftPunc += token;
            }

            start = end;
        }

        // Flush the last word of the line, if there was any word at all.
        if (seenWord) {
            words.add(new WordWithPunctuation(leftPunc, currentWord, rightPunc));
        }

        return words;
    }

    /**
     * Reads in text from <em>inFilename</em> and creates a list of all
     * the unique words in the text.
     *
     * @param inFilename String specifying the filename to read
     * @throws IOException if an I/O error occurred
     */

    public void createWordList(String inFilename) throws IOException {

        createWordList(new File(inFilename));
    }

    /**
     * Reads in text from <em>inFile</em> and creates a list of all
     * the unique words in the text, counting how often each one occurs.
     * Any previously built word list and word count are discarded.
     *
     * @param inFile File object specifying the file to read
     * @throws IOException if an I/O error occurred
     */

    public void createWordList(File inFile) throws IOException {

        BufferedReader in = openReader(inFile);

        try {
            wordList = new Vector();
            // Start counting afresh, so that repeated calls do not
            // accumulate the totals of earlier files.
            numberOfWords = 0;

            String line;

            while ((line = in.readLine()) != null) {
                List words = extractWords(line);

                for (Iterator i = words.iterator(); i.hasNext();) {
                    WordFreq wf = new WordFreq((String) i.next());
                    int index = Collections.binarySearch(wordList, wf);

                    if (index < 0) {
                        // Not seen before: a negative result encodes the
                        // insertion point as -(insertionPoint) - 1, so
                        // inserting there keeps the list sorted.
                        wordList.add(-(index + 1), wf);
                    }
                    else {
                        // Already known: bump its frequency count.
                        ((WordFreq) wordList.get(index)).incrementCount();
                    }
                }

                numberOfWords += words.size();
            }
        }
        finally {
            in.close();
        }
    }

    /**
     * Prints each unique word in the list to System.out, one per line.
     *
     * @return Always the empty String; the words are only printed.
     */

    public String displayWordList() {

        for (Iterator words = wordList.iterator(); words.hasNext();) {
            System.out.println(((WordFreq) words.next()).getWord());
        }

        return "";
    }

    /**
     * Returns an iterator over the list of unique words.
     */

    public ListIterator getWordListIterator() {

        return wordList.listIterator();
    }

    /**
     * Returns the number of unique words in the word list.
     */

    public int getWordListSize() {

        return wordList.size();
    }

    /**
     * Prints the index entry of each word to System.out, in the order they
     * occur.
     */

    public void displayWordIndex() {

        for (Iterator entries = wordIndex.iterator(); entries.hasNext();) {
            System.out.println(entries.next());
        }
    }

    /**
     * Returns an iterator over the word index entries.
     */

    public ListIterator getWordIndexIterator() {

        return wordIndex.listIterator();
    }

    /**
     * Returns the number of entries (word occurrences) in the word index.
     */

    public int getWordIndexSize() {

        return wordIndex.size();
    }

    /**
     * Creates an index, which records the position within the text
     * file, where each word occurred. The word list is (re)built from the
     * same file as part of this call.
     *
     * @param inFilename String specifying the filename to read
     * @throws IOException if an I/O error occurred
     */

    public void createWordIndex(String inFilename) throws IOException {

        // The File variant builds the word list itself, so there is no
        // need to build it here as well.
        createWordIndex(new File(inFilename));
    }

    /**
     * Creates an index, which records the position within the text
     * file, where each word occurred. The word list is (re)built from the
     * same file first. Blank lines are skipped.
     *
     * @param inFile File object specifying the file to read
     * @throws IOException if an I/O error occurred
     */

    public void createWordIndex(File inFile) throws IOException {

        // Create the wordList first...
        createWordList(inFile);

        // Now read the file again and create the word index.
        BufferedReader in = openReader(inFile);

        try {
            wordIndex = new Vector(numberOfWords);

            String line;

            while ((line = in.readLine()) != null) {

                if (BLANK_LINE.matcher(line).matches()) {
                    continue;
                }

                List words = extractWordsWithPunctuation(line);

                for (Iterator i = words.iterator(); i.hasNext();) {
                    WordWithPunctuation wwp = (WordWithPunctuation) i.next();

                    // Each occurrence stores the position of its word in
                    // the sorted word list, not the word itself.
                    int listPosition = Collections.binarySearch(wordList, new WordFreq(wwp.getWord()));

                    wordIndex.add(new WordIndexWithPunc(
                                listPosition,
                                wwp.getLeftPunctuation(),
                                wwp.getRightPunctuation())
                            );
                }
            }
        }
        finally {
            in.close();
        }
    }

    /**
     * Return the actual word that occurred at position
     * <em>wordPosition</em>, together with the punctuation recorded on
     * either side of it. Bear in mind that 0 is the first position.
     *
     * @param wordPosition The position index of the word to be
     * retrieved.
     * @return word A String containing the word at
     * <em>wordPosition</em>.
     * @throws ArrayIndexOutOfBoundsException if position specified is
     * out of range.
     */

    public String getWord(int wordPosition) {

        WordIndexWithPunc entry = (WordIndexWithPunc) wordIndex.get(wordPosition);
        int index = entry.getIndex();
        WordFreq wf = (WordFreq) wordList.get(index);

        return entry.getLeftPunctuation() + wf.getWord() + entry.getRightPunctuation();
    }

    /**
     * Finds every position in the text at which <em>word</em> occurs.
     *
     * @param word The word to look up.
     * @return A Vector of Integer positions into the word index, in
     * ascending order; empty if the word does not occur in the text.
     */

    public Vector getWordIndex(String word) {

        Vector indexes = new Vector();

        int thisWordIndex = Collections.binarySearch(wordList, new WordFreq(word));

        if (thisWordIndex < 0) {
            // The word is not in the word list, so it cannot be in the index.
            return indexes;
        }

        ListIterator li = wordIndex.listIterator();

        while (li.hasNext()) {
            int indexPos = li.nextIndex();
            int currentIndex = ((WordIndexWithPunc) li.next()).getIndex();

            if (currentIndex == thisWordIndex) {
                indexes.add(new Integer(indexPos));
            }
        }

        return indexes;
    }

    /**
     * Return the sequence of words from a begin position up to, but not
     * including, an end position. The words are joined using the
     * punctuation recorded with each word.
     *
     * @param beginPosition The position index of the first word of the
     * sequence.
     * @param endPosition The position index just after the last word of
     * the sequence. If equal to <em>beginPosition</em> the result is the
     * empty String.
     * @return sequence The sequence of words as a String.
     * @throws IllegalArgumentException if <em>beginPosition</em> is greater than
     * <em>endPosition</em>.
     * @throws ArrayIndexOutOfBoundsException if position specified is
     * out of range.
     *
     */

    public String getSequenceOfWords(int beginPosition, int endPosition)
        throws IllegalArgumentException {

        if (endPosition < beginPosition) {
            throw new IllegalArgumentException("beginPosition must be less than endPosition.");
        }

        StringBuffer sequence = new StringBuffer();

        for (int i = beginPosition; i < endPosition; ++i) {
            sequence.append(getWord(i));
        }

        return sequence.toString();
    }

    /**
     * Set the context window size for the concordance output.
     *
     * @param size The window size.
     * @throws IllegalArgumentException if size is less than 1.
     */

    public void setWindowSize(int size) {

        if (size < 1) {
            throw new IllegalArgumentException("Error: concordance window size must be greater than 0!");
        }

        windowSize = size;
    }

    /**
     * Retrieves the current window size.
     */

    public int getWindowSize() {

        return windowSize;
    }

    /**
     * Sets the name of the character encoding used to read input files.
     *
     * @param enc The encoding name.
     */

    public void setEncoding(String enc) {

        encoding = enc;
    }

    /**
     * Retrieves the name of the character encoding used to read input files.
     */

    public String getEncoding() {

        return encoding;
    }

    /**
     * A method for reversing the word order within a given String.
     * Intended for right-to-left orientated text-output. Punctuation is
     * dropped; the reversed words are separated by single spaces.
     *
     * @param input The input String of which the words are to be
     * reversed.
     * @return The words of <em>input</em> in reverse order.
     */

    private String reverseWords(String input) {

        List words = extractWords(input);
        StringBuffer reversed = new StringBuffer();

        ListIterator reIt = words.listIterator(words.size());

        while (reIt.hasPrevious()) {
            reversed.append(reIt.previous());
            if (reIt.hasPrevious()) {
                reversed.append(" ");
            }
        }

        return reversed.toString();
    }

    /**
     * Specify the text direction of the output. Default is
     * Concordance.LEFT_TO_RIGHT (1).
     *
     * @param oValue Accepts either Concordance.LEFT_TO_RIGHT (1) or
     * Concordance.RIGHT_TO_LEFT (2).
     * @throws IllegalArgumentException if <em>oValue</em> is not a
     * valid value (either 1 or 2).
     */

    public void setOrientation(int oValue) throws IllegalArgumentException {

        boolean valid = (oValue == Concordance.LEFT_TO_RIGHT
                      || oValue == Concordance.RIGHT_TO_LEFT);

        if (!valid) {
            throw new IllegalArgumentException("Orientation: " + oValue + ". Orientation value must be either 1 (LTR) or 2 (RTL).");
        }

        textOrientation = oValue;
    }

    /**
     * Retrieves the orientation setting. Can be either Concordance.LEFT_TO_RIGHT (1) or
     * Concordance.RIGHT_TO_LEFT (2).
     */

    public int getOrientation() {

        return textOrientation;
    }

    /**
     * Builds the text shown on either side of the word occurrence at
     * <em>position</em>, honouring the window size and orientation.
     * Where the window runs past the beginning or end of the text, the
     * context is cut short and marked with [START] or [END]. For
     * right-to-left output the following words are shown (reversed) on the
     * left, and the preceding words (reversed) on the right.
     *
     * @param position Position of the occurrence in the word index.
     * @return A two element array: the left context, then the right context.
     */

    private String[] contextAround(int position) {

        int window = getWindowSize();
        int total = wordIndex.size();

        // Written as comparisons against the room available, rather than
        // as position - window and position + window, so that a very
        // large window size cannot overflow.
        boolean reachesStart = window > position;
        boolean reachesEnd = window > total - 1 - position;

        String left;
        String right;

        if (getOrientation() == Concordance.LEFT_TO_RIGHT) {
            String before = getSequenceOfWords(reachesStart ? 0 : position - window, position);
            String after = getSequenceOfWords(position + 1, reachesEnd ? total : position + 1 + window);

            left = reachesStart ? "[START] " + before : before;
            right = reachesEnd ? after + " [END]" : after;
        }
        else {
            String after = reverseWords(getSequenceOfWords(position + 1, reachesEnd ? total : position + 1 + window));
            String before = reverseWords(getSequenceOfWords(reachesStart ? 0 : position - window, position));

            left = reachesEnd ? "[END] " + after : after;
            right = reachesStart ? before + " [START]" : before;
        }

        return new String[] { left, right };
    }

    /**
     * Performs a concordance on the given <em>word</em> for the text
     * currently loaded in the Concordance object.
     *
     * @param word The word to perform a concordance on.
     * @return One line per occurrence, each consisting of the left
     * context, the word between asterisks, and the right context.
     * @throws IllegalArgumentException if the <em>word</em> inputted is
     * not actually contained within the text.
     */

    public String concordance(String word) throws IllegalArgumentException {

        // First, get all the positions of the word in question...
        Vector indexes = getWordIndex(word);

        if (indexes.isEmpty()) {
            throw new IllegalArgumentException("The word \"" + word + "\" does not appear in this text.");
        }

        String newline = System.getProperty("line.separator");
        StringBuffer output = new StringBuffer();

        // ...then write out one line of context for each of them.
        for (Iterator it = indexes.iterator(); it.hasNext();) {
            int position = ((Integer) it.next()).intValue();
            String[] context = contextAround(position);

            output.append(context[0]);
            output.append(" *").append(word).append("*  ");
            output.append(context[1]);
            output.append(newline);
        }

        return output.toString();
    }

    /**
     * Performs a concordance on the given <em>word</em>, like
     * concordance(String), but returns the result as structured items
     * rather than as formatted text.
     *
     * @param word The word to perform a concordance on.
     * @return A List with one ConcordanceItem per occurrence, holding the
     * left context, the word, and the right context.
     * @throws IllegalArgumentException if the <em>word</em> inputted is
     * not actually contained within the text.
     */

    public List concordanceToList(String word) throws IllegalArgumentException {

        // First, get all the positions of the word in question...
        Vector indexes = getWordIndex(word);

        if (indexes.isEmpty()) {
            throw new IllegalArgumentException("The word \"" + word + "\" does not appear in this text.");
        }

        // ...then create one ConcordanceItem for each of them.
        List items = new Vector();

        for (Iterator it = indexes.iterator(); it.hasNext();) {
            int position = ((Integer) it.next()).intValue();
            String[] context = contextAround(position);

            items.add(new ConcordanceItem(context[0], word, context[1]));
        }

        return items;
    }

}
