fix: [Mail module] fix dns caching + use redis queue

This commit is contained in:
Terrtia 2020-05-14 14:26:52 +02:00
parent 5475660785
commit f8ff232c3e
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0

View file

@ -14,6 +14,7 @@ It apply mail regexes on item content and warn if above a threshold.
import os import os
import re import re
import sys import sys
import uuid
import redis import redis
import time import time
import datetime import datetime
@ -22,7 +23,6 @@ import dns.resolver
import dns.exception import dns.exception
from multiprocessing import Process as Proc from multiprocessing import Process as Proc
from multiprocessing import Queue
from pubsublogger import publisher from pubsublogger import publisher
from Helper import Process from Helper import Process
@ -49,7 +49,7 @@ def is_mxdomain_in_cache(mxdomain):
return r_serv_cache.exists('mxdomain:{}'.format(mxdomain)) return r_serv_cache.exists('mxdomain:{}'.format(mxdomain))
def save_mxdomain_in_cache(mxdomain): def save_mxdomain_in_cache(mxdomain):
r_serv_cache.setex(mxdomain, 1, datetime.timedelta(days=1)) r_serv_cache.setex('mxdomain:{}'.format(mxdomain), 1, datetime.timedelta(days=1))
def check_mx_record(set_mxdomains, dns_server): def check_mx_record(set_mxdomains, dns_server):
"""Check if emails MX domains are responding. """Check if emails MX domains are responding.
@ -110,8 +110,14 @@ def check_mx_record(set_mxdomains, dns_server):
print(e) print(e)
return valid_mxdomain return valid_mxdomain
def extract_all_emails(queue, item_content): def extract_all_emails(redis_key, item_content):
queue.put(re.findall(email_regex, item_content)) all_emails = re.findall(email_regex, item_content)
if len(all_emails) > 1:
r_serv_cache.sadd(redis_key, *all_emails)
r_serv_cache.expire(redis_key, 360)
elif all_emails:
r_serv_cache.sadd(redis_key, all_emails[0])
r_serv_cache.expire(redis_key, 360)
if __name__ == "__main__": if __name__ == "__main__":
publisher.port = 6380 publisher.port = 6380
@ -132,10 +138,11 @@ if __name__ == "__main__":
email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}" email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
q = Queue() redis_key = 'mail_extracted:{}'.format(str(uuid.uuid4()))
while True: while True:
message = p.get_from_set() message = p.get_from_set()
message = 'urlextract/2020/05/06/throwbin.io_guNTS59b94413e7-a6e8-428a-b561-c9829ec77587.gz 2'
if message is not None: if message is not None:
item_id, score = message.split() item_id, score = message.split()
@ -144,9 +151,9 @@ if __name__ == "__main__":
item_content = Item.get_item_content(item_id) item_content = Item.get_item_content(item_id)
proc = Proc(target=extract_all_emails, args=(q, item_content)) proc = Proc(target=extract_all_emails, args=(redis_key, item_content, ))
proc.start()
try: try:
proc.start()
proc.join(max_execution_time) proc.join(max_execution_time)
if proc.is_alive(): if proc.is_alive():
proc.terminate() proc.terminate()
@ -156,15 +163,14 @@ if __name__ == "__main__":
publisher.info(err_mess) publisher.info(err_mess)
continue continue
else: else:
all_emails = q.get() all_emails = r_serv_cache.smembers(redis_key)
r_serv_cache.delete(redis_key)
proc.terminate()
except KeyboardInterrupt: except KeyboardInterrupt:
print("Caught KeyboardInterrupt, terminating workers") print("Caught KeyboardInterrupt, terminating workers")
proc.terminate() proc.terminate()
sys.exit(0) sys.exit(0)
# filtering duplicate
all_emails = set(all_emails)
# get MXdomains # get MXdomains
set_mxdomains = set() set_mxdomains = set()
dict_mxdomains_email = {} dict_mxdomains_email = {}