chg: [Crawler] add priority queue, fix #263

This commit is contained in:
Terrtia 2019-01-29 16:08:59 +01:00
parent c1b34bd99c
commit 88eaaeae93
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 12 additions and 4 deletions

View file

@ -20,7 +20,7 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
# send this msg back in the queue # send this msg back in the queue
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain): if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain) r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message) r_onion.sadd('{}_crawler_priority_queue'.format(type_hidden_service), message)
def crawl_onion(url, domain, date, date_month, message): def crawl_onion(url, domain, date, date_month, message):
@ -166,6 +166,10 @@ if __name__ == '__main__':
while True: while True:
# Priority Queue - Recovering the streamed message informations.
message = r_onion.spop('{}_crawler_priority_queue'.format(type_hidden_service))
if message is None:
# Recovering the streamed message informations. # Recovering the streamed message informations.
message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service)) message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))

View file

@ -223,6 +223,10 @@ if __name__ == "__main__":
print('send to onion crawler') print('send to onion crawler')
r_onion.sadd('onion_domain_crawler_queue', domain) r_onion.sadd('onion_domain_crawler_queue', domain)
msg = '{};{}'.format(url,PST.p_path) msg = '{};{}'.format(url,PST.p_path)
if not r_onion.hexists('onion_metadata:{}'.format(domain), 'first_seen'):
r_onion.sadd('onion_crawler_priority_queue', msg)
print('send to priority queue)
else:
r_onion.sadd('onion_crawler_queue', msg) r_onion.sadd('onion_crawler_queue', msg)
#p.populate_set_out(msg, 'Crawler') #p.populate_set_out(msg, 'Crawler')