//*********************************************************************************************************
// © 2013 jakemdrew.com. All rights reserved. 
// This source code is licensed under The GNU General Public License (GPLv3):  
// http://opensource.org/licenses/gpl-3.0.html
//*********************************************************************************************************

//*********************************************************************************************************
//makeNgrams - Example n-gram creator.
//Created By - Jake Drew 
//Version -    1.0, 04/22/2013
//*********************************************************************************************************
/// <summary>
/// Splits <paramref name="text"/> into words and returns every run of
/// <paramref name="nGramSize"/> consecutive words, joined by single spaces.
/// </summary>
/// <remarks>
/// A word is a run of letters and digits. A punctuation or separator character
/// other than the plain space is kept inside a word only when the characters
/// immediately before and after it are letters or digits (e.g. "don't", "3.14").
/// Everything else acts as a word boundary and is dropped.
/// If the text contains at least one word but fewer than
/// <paramref name="nGramSize"/> words, a single shorter gram holding all of the
/// words is returned. Text without any word yields nothing.
/// This is an iterator: argument checks run when enumeration starts, not at the call.
/// </remarks>
/// <param name="text">The text to tokenize.</param>
/// <param name="nGramSize">Number of words per n-gram; must be at least 1.</param>
/// <returns>The n-grams in the order they occur in the text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="nGramSize"/> is less than 1.</exception>
public IEnumerable<string> makeNgrams(string text, int nGramSize)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }
    if (nGramSize < 1)
    {
        throw new ArgumentOutOfRangeException("nGramSize", nGramSize, "nGram size was not set; it must be at least 1");
    }

    //holds the words of the current window, separated by single spaces.
    StringBuilder nGram = new StringBuilder();
    //length of each word currently in the window, oldest first.
    Queue<int> wordLengths = new Queue<int>();

    int currentWordLen = 0;
    bool emittedAny = false;

    //i == text.Length is a virtual end-of-text boundary so that the final word
    //is completed by the same code path as every other word.
    for (int i = 0; i <= text.Length; i++)
    {
        bool isWordChar = false;

        if (i < text.Length)
        {
            char current = text[i];

            if (char.IsLetterOrDigit(current))
            {
                isWordChar = true;
            }
            else if (i > 0 && i < text.Length - 1)
            {
                //keep all punctuation that is surrounded by letters or numbers on both sides.
                //the first and last characters can never qualify: they have only one neighbour.
                isWordChar = current != ' '
                    && (char.IsSeparator(current) || char.IsPunctuation(current))
                    && char.IsLetterOrDigit(text[i - 1])
                    && char.IsLetterOrDigit(text[i + 1]);
            }
        }

        if (isWordChar)
        {
            //first character of a new word: separate it from the previous word.
            if (currentWordLen == 0 && nGram.Length > 0)
            {
                nGram.Append(' ');
            }

            nGram.Append(text[i]);
            currentWordLen++;
            continue;
        }

        //boundary character, but no word is in progress (e.g. repeated spaces).
        if (currentWordLen == 0)
        {
            continue;
        }

        //a word was just completed.
        wordLengths.Enqueue(currentWordLen);
        currentWordLen = 0;

        if (wordLengths.Count == nGramSize)
        {
            yield return nGram.ToString();
            emittedAny = true;

            //slide the window: drop the oldest word and the space that follows it.
            //a one-word window has no following space, so clamp to the builder length.
            int dropLength = wordLengths.Dequeue() + 1;
            nGram.Remove(0, Math.Min(dropLength, nGram.Length));
        }
    }

    //fewer words than nGramSize: return the words that exist as one shorter gram.
    if (!emittedAny && nGram.Length > 0)
    {
        yield return nGram.ToString();
    }
}