mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-12 17:48:22 +00:00
Added SimHash library
This commit is contained in:
parent
60552bca4d
commit
0332f23579
3 changed files with 9 additions and 3 deletions
|
@ -74,9 +74,9 @@ if __name__ == "__main__":
|
||||||
# Creating the bloom filter name: bloomyyyymm
|
# Creating the bloom filter name: bloomyyyymm
|
||||||
filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year +
|
filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year +
|
||||||
PST.p_date.month)
|
PST.p_date.month)
|
||||||
|
|
||||||
if os.path.exists(filebloompath):
|
if os.path.exists(filebloompath):
|
||||||
bloom = BloomFilter.open(filebloompath)
|
bloom = BloomFilter.open(filebloompath)
|
||||||
|
bloop_path_set.add(filebloompath)
|
||||||
else:
|
else:
|
||||||
bloom = BloomFilter(100000000, 0.01, filebloompath)
|
bloom = BloomFilter(100000000, 0.01, filebloompath)
|
||||||
bloop_path_set.add(filebloompath)
|
bloop_path_set.add(filebloompath)
|
||||||
|
@ -94,7 +94,6 @@ if __name__ == "__main__":
|
||||||
for bloo in bloop_path_set:
|
for bloo in bloop_path_set:
|
||||||
# Opening blooms
|
# Opening blooms
|
||||||
opened_bloom.append(BloomFilter.open(bloo))
|
opened_bloom.append(BloomFilter.open(bloo))
|
||||||
|
|
||||||
# For each hash of the paste
|
# For each hash of the paste
|
||||||
for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
|
for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
|
||||||
nb_hash_current += 1
|
nb_hash_current += 1
|
||||||
|
@ -105,7 +104,6 @@ if __name__ == "__main__":
|
||||||
r_serv1.sadd("HASHS", line_hash)
|
r_serv1.sadd("HASHS", line_hash)
|
||||||
# Adding the hash in the bloom of the month
|
# Adding the hash in the bloom of the month
|
||||||
bloom.add(line_hash)
|
bloom.add(line_hash)
|
||||||
|
|
||||||
# Go through the Database of the bloom filter (of the month)
|
# Go through the Database of the bloom filter (of the month)
|
||||||
for bloo in opened_bloom:
|
for bloo in opened_bloom:
|
||||||
if line_hash in bloo:
|
if line_hash in bloo:
|
||||||
|
@ -148,6 +146,8 @@ if __name__ == "__main__":
|
||||||
percentage = round((count/float(nb_hash_current))*100, 2)
|
percentage = round((count/float(nb_hash_current))*100, 2)
|
||||||
if percentage >= 50:
|
if percentage >= 50:
|
||||||
dupl.append((paste, percentage))
|
dupl.append((paste, percentage))
|
||||||
|
else:
|
||||||
|
print 'percentage: ' + str(percentage)
|
||||||
|
|
||||||
# Creating the object attribute and save it.
|
# Creating the object attribute and save it.
|
||||||
to_print = 'Duplicate;{};{};{};'.format(
|
to_print = 'Duplicate;{};{};{};'.format(
|
||||||
|
@ -156,6 +156,7 @@ if __name__ == "__main__":
|
||||||
PST.__setattr__("p_duplicate", dupl)
|
PST.__setattr__("p_duplicate", dupl)
|
||||||
PST.save_attribute_redis("p_duplicate", dupl)
|
PST.save_attribute_redis("p_duplicate", dupl)
|
||||||
publisher.info('{}Detected {}'.format(to_print, len(dupl)))
|
publisher.info('{}Detected {}'.format(to_print, len(dupl)))
|
||||||
|
print '{}Detected {}'.format(to_print, len(dupl))
|
||||||
|
|
||||||
y = time.time()
|
y = time.time()
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import hashlib
|
import hashlib
|
||||||
import crcmod
|
import crcmod
|
||||||
import mmh3
|
import mmh3
|
||||||
|
import simhash
|
||||||
|
|
||||||
|
|
||||||
class Hash(object):
|
class Hash(object):
|
||||||
|
@ -32,4 +33,7 @@ class Hash(object):
|
||||||
elif self.name == "murmur":
|
elif self.name == "murmur":
|
||||||
hash = mmh3.hash(string)
|
hash = mmh3.hash(string)
|
||||||
|
|
||||||
|
elif self.name == "simhash":
|
||||||
|
hash = Simhash(string)
|
||||||
|
|
||||||
return hash
|
return hash
|
||||||
|
|
|
@ -17,6 +17,7 @@ nltk
|
||||||
# Hashlib
|
# Hashlib
|
||||||
crcmod
|
crcmod
|
||||||
mmh3
|
mmh3
|
||||||
|
simhash
|
||||||
|
|
||||||
#Others
|
#Others
|
||||||
python-magic
|
python-magic
|
||||||
|
|
Loading…
Reference in a new issue