mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 00:28:22 +00:00
Added SimHash library
This commit is contained in:
parent
60552bca4d
commit
0332f23579
3 changed files with 9 additions and 3 deletions
|
@ -74,9 +74,9 @@ if __name__ == "__main__":
|
|||
# Creating the bloom filter name: bloomyyyymm
|
||||
filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year +
|
||||
PST.p_date.month)
|
||||
|
||||
if os.path.exists(filebloompath):
|
||||
bloom = BloomFilter.open(filebloompath)
|
||||
bloop_path_set.add(filebloompath)
|
||||
else:
|
||||
bloom = BloomFilter(100000000, 0.01, filebloompath)
|
||||
bloop_path_set.add(filebloompath)
|
||||
|
@ -94,7 +94,6 @@ if __name__ == "__main__":
|
|||
for bloo in bloop_path_set:
|
||||
# Opening blooms
|
||||
opened_bloom.append(BloomFilter.open(bloo))
|
||||
|
||||
# For each hash of the paste
|
||||
for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
|
||||
nb_hash_current += 1
|
||||
|
@ -105,7 +104,6 @@ if __name__ == "__main__":
|
|||
r_serv1.sadd("HASHS", line_hash)
|
||||
# Adding the hash in the bloom of the month
|
||||
bloom.add(line_hash)
|
||||
|
||||
# Go through the database of the bloom filter (of the month)
|
||||
for bloo in opened_bloom:
|
||||
if line_hash in bloo:
|
||||
|
@ -148,6 +146,8 @@ if __name__ == "__main__":
|
|||
percentage = round((count/float(nb_hash_current))*100, 2)
|
||||
if percentage >= 50:
|
||||
dupl.append((paste, percentage))
|
||||
else:
|
||||
print 'percentage: ' + str(percentage)
|
||||
|
||||
# Create the object attribute and save it.
|
||||
to_print = 'Duplicate;{};{};{};'.format(
|
||||
|
@ -156,6 +156,7 @@ if __name__ == "__main__":
|
|||
PST.__setattr__("p_duplicate", dupl)
|
||||
PST.save_attribute_redis("p_duplicate", dupl)
|
||||
publisher.info('{}Detected {}'.format(to_print, len(dupl)))
|
||||
print '{}Detected {}'.format(to_print, len(dupl))
|
||||
|
||||
y = time.time()
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import hashlib
|
||||
import crcmod
|
||||
import mmh3
|
||||
import simhash
|
||||
|
||||
|
||||
class Hash(object):
|
||||
|
@ -32,4 +33,7 @@ class Hash(object):
|
|||
elif self.name == "murmur":
|
||||
hash = mmh3.hash(string)
|
||||
|
||||
elif self.name == "simhash":
|
||||
hash = Simhash(string)
|
||||
|
||||
return hash
|
||||
|
|
|
@ -17,6 +17,7 @@ nltk
|
|||
# Hashlib
|
||||
crcmod
|
||||
mmh3
|
||||
simhash
|
||||
|
||||
#Others
|
||||
python-magic
|
||||
|
|
Loading…
Reference in a new issue