//*********************************************************************************************************
// © 2013 jakemdrew.com. All rights reserved.
// This source code is licensed under The GNU General Public License (GPLv3):
// http://opensource.org/licenses/gpl-3.0.html
//*********************************************************************************************************
//*********************************************************************************************************
//makeNgrams - Example n-gram creator.
//Created By - Jake Drew
//Version - 1.0, 04/22/2013
//*********************************************************************************************************
/// <summary>
/// Lazily splits <paramref name="text"/> into words and yields every run of
/// <paramref name="nGramSize"/> consecutive words, joined by single spaces.
/// </summary>
/// <remarks>
/// A character belongs to a word when it is a letter or digit, or when it is a
/// punctuation/separator character other than ' ' that has a letter or digit
/// directly on both sides (so "don't" and "3.14" stay intact, while a trailing
/// "." or "," is dropped). Every other character ends the current word.
/// If the text contains at least one word but fewer than
/// <paramref name="nGramSize"/> words, a single shorter gram holding all of the
/// words is yielded. Text with no words yields nothing.
/// Because this is an iterator, argument validation runs when enumeration
/// starts, not when the method is called.
/// </remarks>
/// <param name="text">The text to tokenize. Must not be null.</param>
/// <param name="nGramSize">Number of words per gram. Must be greater than zero.</param>
/// <returns>The n-grams in the order they occur in the text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="nGramSize"/> is zero or negative.</exception>
public IEnumerable<string> makeNgrams(string text, int nGramSize)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }
    if (nGramSize <= 0)
    {
        throw new ArgumentOutOfRangeException("nGramSize", nGramSize, "nGram size must be greater than zero");
    }

    StringBuilder currentWord = new StringBuilder();
    // Sliding window holding the most recent words (at most nGramSize of them).
    Queue<string> window = new Queue<string>();
    bool emittedAny = false;
    int lastIndex = text.Length - 1;

    // Iterate one position past the end so the final word is flushed by the
    // same code path as every other word.
    for (int i = 0; i <= text.Length; i++)
    {
        if (i < text.Length)
        {
            char current = text[i];
            bool isWordChar = char.IsLetterOrDigit(current);
            if (!isWordChar && current != ' ' && i > 0 && i < lastIndex)
            {
                // Keep punctuation that is surrounded by letters or numbers on both sides.
                isWordChar = (char.IsSeparator(current) || char.IsPunctuation(current))
                    && char.IsLetterOrDigit(text[i - 1])
                    && char.IsLetterOrDigit(text[i + 1]);
            }
            if (isWordChar)
            {
                currentWord.Append(current);
                continue;
            }
        }

        // Word boundary (or end of text): nothing to do between consecutive separators.
        if (currentWord.Length == 0)
        {
            continue;
        }

        window.Enqueue(currentWord.ToString());
        currentWord.Length = 0;

        if (window.Count == nGramSize)
        {
            yield return string.Join(" ", window.ToArray());
            emittedAny = true;
            // Slide the window forward by one word.
            window.Dequeue();
        }
    }

    // Fewer words than nGramSize: yield the one short gram that exists.
    if (!emittedAny && window.Count > 0)
    {
        yield return string.Join(" ", window.ToArray());
    }
}