//*********************************************************************************************************
// © 2014 jakemdrew.com. All rights reserved.
// This source code is licensed under The GNU General Public License (GPLv3):
// http://opensource.org/licenses/gpl-3.0.html
//*********************************************************************************************************
//*********************************************************************************************************
//MinHasher - Example minhashing engine.
//Created By - Jake Drew
//Version - 1.0, 05/08/2013
//*********************************************************************************************************
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace LSH_Blog_Examples
{
class MinHasher
{
public int signatureSize;
public Tuple[] minhashes;
public MinHasher(int SignatureSize)
{
signatureSize = SignatureSize;
minhashes = new Tuple[SignatureSize];
//Create our unique family of hashing function seeds.
createMinhashSeeds();
}
private void createMinhashSeeds()
{
HashSet skipDups = new HashSet();
Random r = new Random();
for (int i = 0; i < minhashes.Length; i++)
{
Tuple seed = new Tuple(r.Next(), r.Next());
if (skipDups.Add(seed.GetHashCode()))
minhashes[i] = seed;
else
i--; //duplicate seed, try again
}
}
public static int LSHHash(T inputData, int seedOne, int seedTwo)
{ //Faster, Does not throw exception for overflows during hashing.
unchecked // Overflow is fine, just wrap
{
int hash = (int)2166136261;
hash = hash * 16777619 ^ seedOne.GetHashCode();
hash = hash * 16777619 ^ seedTwo.GetHashCode();
hash = hash * 16777619 ^ inputData.GetHashCode();
return hash;
}
}
public static double calculateJaccard(int[] setA, int[] setB)
{
int intersection = setA.Intersect(setB).Count();
return intersection / (double)setA.Length;
}
public int[] getMinHashSignature(int[] tokens)
{
//Create a new signature initialized to all int max values
int[] minHashValues = Enumerable.Repeat(int.MaxValue, signatureSize).ToArray();
HashSet skipDups = new HashSet();
//Go through every single token
foreach (var token in tokens)
{ //We do not want to hash the same token value more than once...
if (skipDups.Add(token))
{ //Hash each unique token with each unique hashing function
for (int i = 0; i < signatureSize; i++)
{ //Use the same seeds everytime for each hashing function (this is very important!!!)
Tuple seeds = minhashes[i];
int currentHashValue = LSHHash(token, seeds.Item1, seeds.Item2);
//Only retain the minimum value produced by each unique hashing function.
if (currentHashValue < minHashValues[i])
minHashValues[i] = currentHashValue;
}
}
}
return minHashValues;
}
public Dictionary createMinhashCollection(Dictionary documents)
{
Dictionary minhashCollection = new Dictionary(documents.Count);
foreach (var document in documents)
{
int[] minhashSignature = getMinHashSignature(document.Value);
minhashCollection.Add(document.Key, minhashSignature);
}
return minhashCollection;
}
}
}