Skip to content

Commit 3a3b1c7

Browse files
authored
Add files via upload
1 parent 1867c76 commit 3a3b1c7

File tree

2 files changed

+268
-0
lines changed

2 files changed

+268
-0
lines changed

LSH.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created by Michail Nikolaos
4+
5+
This script finds the k-nearest neightboors with LSH from a list of minhashes
6+
and compares the results with actual Jaccard similarity of the sets.
7+
The minhashes are calculated from the shingles in shinglesMap.json
8+
located in execution path and created from LSH_Preproccessing.py
9+
10+
The algorithm parameters are
11+
-kNeigh Number of neighhboors
12+
-bands Number of bands for Minhashes
13+
-r Integers per band
14+
-jaccardThres Threshold for two sets to be similar by Jaccard similarity.
15+
If no threshold is implemented then we will have many false postives from
16+
Jaccard method
17+
"""
18+
19+
import numpy as np
20+
import operator
21+
import random
22+
import time
23+
import os
24+
import json
25+
import sys
26+
from tqdm import tqdm
27+
from collections import defaultdict
28+
29+
30+
# Command-line execution: LSH.py <docID> <kNeigh> <bands> <r> <jaccardThres %>
if len(sys.argv) > 5:
    targetDocID = str(sys.argv[1])          # id of the query document
    kNeigh = int(sys.argv[2])               # number of nearest neighbours to report
    bands = int(sys.argv[3])                # number of LSH bands
    r = int(sys.argv[4])                    # rows (minhash integers) per band
    jaccardThres = int(sys.argv[5]) / 100   # threshold given as a whole percentage
else:
    # Defaults used when not enough arguments are supplied.
    targetDocID = '10864'
    kNeigh = 10
    bands = 10
    r = 4
    jaccardThres = 0.2

print('LSH run for document ',targetDocID,' finding',kNeigh,' nearest neighbors with ',
      bands,' bands of ', r ,' integers')

# Total signature length: one minhash per (band, row) pair.
numHashes = bands * r
48+
49+
def findMinHash(myList, a, b, c):
    """Return the minimum of h(x) = (a*x + b) mod c over all x in myList."""
    return min((a * x + b) % c for x in myList)
52+
53+
def compute_jaccard_index(set_1, set_2):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two sets.

    Returns 0.0 when both sets are empty; the original raised
    ZeroDivisionError in that case.
    """
    n = len(set_1.intersection(set_2))
    # The union's cardinality is the sum of the sets' cardinalities minus
    # the intersection's cardinality (inclusion-exclusion).
    union_size = len(set_1) + len(set_2) - n
    return n / float(union_size) if union_size else 0.0
57+
58+
# Load the shingle map produced by LSH_Preproccessing.py from the current
# working directory; fall back to an empty map on malformed JSON.
path = os.getcwd()

# The 'with' statement closes the file automatically — the original's
# extra f.close() after the block was redundant.
with open(os.path.join(path, 'shinglesMap.json'), 'r') as f:
    try:
        shingleMap = json.load(f)
    except ValueError:
        shingleMap = {}
67+
68+
# Hash functions come from the family h(x) = (a*x + b) mod c where c is a
# large prime; a and b are drawn uniformly from [0, maxShingleID].
maxShingleID = 2**32-1

# Smallest prime greater than 2^24 (16777216).
# NOTE(review): shingle IDs are 32-bit values but hashes land in
# [0, bigPrime), a ~24-bit range — confirm this reduction is intended.
bigPrime = 16777259

print('\n---------------------------------------------------------')
# Fixed user-facing typo from the original ('Minihash').
print('Creating MinHash Signatures')
print('---------------------------------------------------------\n')

print('Calculating',numHashes,'MinHashes for',len(shingleMap),'documents')
signatures = defaultdict(list)

# Build the minhash signature matrix one hash function at a time: for each
# of the numHashes random hash functions, append every document's minhash.
for hashIndex in tqdm(range(numHashes)):

    a = random.randint(0, maxShingleID)
    b = random.randint(0, maxShingleID)

    for doc, docShingles in shingleMap.items():
        minHash = findMinHash(docShingles, a, b, bigPrime)
        signatures[doc].append(minHash)
90+
91+
92+
collisionMap = defaultdict(int)

if targetDocID not in signatures:
    print('Wrong Document id\nDocument id',targetDocID,'is not in the signatures map\n')
    sys.exit()

start_time = time.time()

# Band weight vector (1..r) is loop-invariant; hoisting it avoids rebuilding
# an np.arange for every (document, band) pair as the original did.
bandWeights = np.arange(1, r + 1)

def bandBucket(signature, band):
    """Hash one band (r consecutive minhashes) of a signature to a bucket id."""
    return sum(np.multiply(signature[band*r:(band+1)*r], bandWeights)) % bigPrime

# Bucket ids of the target document, one per band.
targetBuckets = {band: bandBucket(signatures[targetDocID], band) for band in range(bands)}

# If another document hashes into the target's bucket in some band, we have
# a collision and a candidate neighbour; count collisions per document.
for doc in signatures:

    for band in range(bands):

        if targetBuckets[band] == bandBucket(signatures[doc], band):
            collisionMap[doc] += 1

# Drop the target itself, then rank candidates by collision count.
neightMap = {k: v for k, v in collisionMap.items() if k != targetDocID}

neigthboorsSorted = sorted(neightMap.items(), key=operator.itemgetter(1), reverse=True)

topKNN_LSH = neigthboorsSorted[0:kNeigh]

LSH_exec_time = time.time() - start_time
125+
126+
# Report the LSH candidate neighbours and timing.
# Fixed user-facing typos from the original ('fot', 'MiniHashes',
# 'Neightboors', 'occured').
print('\n---------------------------------------------------------')
print('Results for Document:',targetDocID,)

print('\n---------------------------------------------------------')
print('Locality Sensitive Hashing with MinHashes')

print('LSH Neighbors for',bands,'bands of ',r,' integers are:\n' )

for n in topKNN_LSH:
    # n is a (docID, collision count) pair; also show the fraction of bands hit.
    print('-> Document:',n[0],' collision occurred in ',n[1], " of the ",bands,' bands (',round(n[1]/bands,2),')')

print('\nTotal Time for LSH is:', LSH_exec_time ," seconds")
138+
139+
start_time = time.time()

# Ground truth: exact Jaccard similarity of every document against the
# target.  The target's shingle set is hoisted out of the loop — the
# original rebuilt set(shingleMap[targetDocID]) on every iteration.
targetShingleSet = set(shingleMap[targetDocID])

jacSimMap = {}

for doc, docShingles in shingleMap.items():
    # Use the loop variable directly instead of re-indexing shingleMap[doc].
    jacSim = compute_jaccard_index(targetShingleSet, set(docShingles))
    # Keep only genuinely similar documents, excluding the target itself.
    if jacSim > jaccardThres and doc != targetDocID:
        jacSimMap[doc] = jacSim

jacNeigthboorsSorted = sorted(jacSimMap.items(), key=operator.itemgetter(1), reverse=True)

topKNN_JAC = jacNeigthboorsSorted[0:kNeigh]
151+
152+
# Report the exact-Jaccard neighbours and timing.
# Fixed user-facing typos from the original ('Neightboors', 'Similairy',
# 'simularity').
print('\n---------------------------------------------------------')
print('Actual Neighbors based on Jaccard Similarity of shingles are:\n')

for n in topKNN_JAC:
    # n is a (docID, similarity) pair.
    print('-> Document:',n[0],' similarity ',round(n[1],2))

print('\nTotal Time for Jaccard similarity calculation is:', time.time() - start_time ," seconds")
159+
160+
161+
# Compare the LSH candidates against the exact-Jaccard ground truth.
# The original used bitwise '&' on booleans; 'and' is the correct operator
# (it short-circuits and reads as intended).
if topKNN_JAC and topKNN_LSH:
    topKNN_JAC_docs = {x[0] for x in topKNN_JAC}
    topKNN_LSH_docs = {x[0] for x in topKNN_LSH}

    # topKNN_JAC is non-empty here, so there is at least one true neighbour
    # and the accuracy division below cannot divide by zero (the original's
    # inner length check was redundant).
    TrueNeightboors = len(topKNN_JAC_docs)

    TruePositives = len(topKNN_LSH_docs & topKNN_JAC_docs)    # found by both
    FalsePositives = len(topKNN_LSH_docs - topKNN_JAC_docs)   # LSH-only
    FalseNegatives = len(topKNN_JAC_docs - topKNN_LSH_docs)   # missed by LSH

    PercFoundNeigt = TruePositives / TrueNeightboors

    print('\n---------------------------------------------------------')
    # Fixed user-facing typo from the original ('Neightboors').
    print("Accuracy ", PercFoundNeigt, '\nTrue Neighbors are ', TrueNeightboors,
          '\nFound Neighbors from LSH are ', TruePositives,
          '\nNot Found Neighbors from LSH are ', FalseNegatives,
          '\nFalse Neighbors from LSH are ', FalsePositives)
181+

LSH_Preproccessing.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created by Michail Nikolaos
4+
5+
Preprocessing of the Reuters corpus for LSH technique
6+
7+
Find all the *.sgm files in data file located in current execution path
8+
and creates a single json document with the k-shingles of each document.
9+
The k-shingles are concatanated and hashed in order to be numbers.
10+
11+
shingleLength parameter changes the number of k conscutive words used for
12+
each shingle
13+
"""
14+
15+
from bs4 import BeautifulSoup
16+
import binascii
17+
import re
18+
import glob
19+
import os
20+
import json
21+
import sys
22+
from tqdm import tqdm
23+
24+
print('Creating k-shingles from data files')

# Parameters: an optional first CLI argument overrides the shingle length
# (number of consecutive words per shingle).
if len(sys.argv) > 1:
    shingleLength = int(sys.argv[1])
    print('Shingles Length used is '+str(shingleLength))
else:
    print('Default shingle Length used (3-shingles)')
    shingleLength = 3
33+
34+
textIDs, texts = [], []

# Reuters *.sgm files are expected under ./data relative to the execution path.
datapath = os.path.join(os.getcwd(), 'data')

filenames = glob.glob(os.path.join(datapath, '*.sgm'))

print('Parsing data files...')

for filename in tqdm(filenames):

    # 'with' guarantees the handle is closed — the original called open()
    # and never closed the file (resource leak across the whole loop).
    with open(filename, 'r') as infile:
        data = infile.read()

    soup = BeautifulSoup(data, 'html.parser')
    reutersTags = soup.find_all('reuters')

    for frag in reutersTags:
        body = frag.find('body')

        if body is not None:
            # Extract the body text once instead of twice per fragment.
            bodyText = body.get_text()
            # Keep only documents long enough to yield at least one shingle.
            if len(bodyText.split()) > shingleLength:
                texts.append(bodyText)
                textIDs.append(frag.get('newid'))
58+
59+
print('Converting to lowercase and removing special characters...')

# Strip newlines, periods, double/single quotes and commas.  The raw
# string fixes the invalid '\.' escape in the original pattern, which
# raises a SyntaxWarning on modern Python; inside a character class the
# dot needs no escaping, so the matched set is unchanged.
rx = re.compile(r'([\n.",\'])')

# Convert to lowercase, then remove the special characters.
textsLower = [x.lower() for x in texts]
textsNoSpecial = [rx.sub("", x) for x in textsLower]

# Map each document id to its list of cleaned words.
# NOTE(review): dict(zip(...)) silently keeps only the last text for a
# duplicated newid — confirm ids are unique in the corpus.
textMap = dict(zip(textIDs, textsNoSpecial))

textDict = {key: value.split() for (key, value) in textMap.items()}
69+
70+
# Map: document id -> list of 32-bit hashed shingle ids.
# (The original also created textDictUnHased, which was never used; removed.)
textDictHased = {}

print('Creating and hashing shingles...\n')

for (key, value) in textDict.items():
    # All runs of shingleLength consecutive words in this document.
    docShingleList = [value[i:i+shingleLength] for i in range(len(value) - shingleLength + 1)]
    # Concatenate each shingle's words into one token.
    joinedDocShingles = ["".join(shingle) for shingle in docShingleList]
    # CRC32 each token into a 32-bit shingle id (crc32 already returns an
    # unsigned value on Python 3; the mask keeps that explicit).
    hashedDocShingles = [(binascii.crc32(bytes(jShingle, 'utf-8')) & 0xffffffff) for jShingle in joinedDocShingles]
    textDictHased[key] = hashedDocShingles
80+
81+
# Persist the hashed shingles next to the script so LSH.py can load them.
exec_path = os.getcwd()
with open(os.path.join(exec_path, 'shinglesMap.json'), 'w') as f:
    json.dump(textDictHased, f)

# Fixed user-facing typos from the original ('documnets', 'succefully').
print(len(textDictHased), ' documents were read, transformed and cut into', shingleLength, 'shingles')
print('Saved '+str(shingleLength)+'-shingles of texts in shinglesMap.json successfully')
87+

0 commit comments

Comments
 (0)