mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-30 09:47:17 +00:00
fix: [Mail module] fix dns caching + use redis queue
This commit is contained in:
parent
5475660785
commit
f8ff232c3e
1 changed files with 17 additions and 11 deletions
28
bin/Mail.py
28
bin/Mail.py
|
@ -14,6 +14,7 @@ It apply mail regexes on item content and warn if above a threshold.
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import uuid
|
||||||
import redis
|
import redis
|
||||||
import time
|
import time
|
||||||
import datetime
|
import datetime
|
||||||
|
@ -22,7 +23,6 @@ import dns.resolver
|
||||||
import dns.exception
|
import dns.exception
|
||||||
|
|
||||||
from multiprocessing import Process as Proc
|
from multiprocessing import Process as Proc
|
||||||
from multiprocessing import Queue
|
|
||||||
|
|
||||||
from pubsublogger import publisher
|
from pubsublogger import publisher
|
||||||
from Helper import Process
|
from Helper import Process
|
||||||
|
@ -49,7 +49,7 @@ def is_mxdomain_in_cache(mxdomain):
|
||||||
return r_serv_cache.exists('mxdomain:{}'.format(mxdomain))
|
return r_serv_cache.exists('mxdomain:{}'.format(mxdomain))
|
||||||
|
|
||||||
def save_mxdomain_in_cache(mxdomain):
|
def save_mxdomain_in_cache(mxdomain):
|
||||||
r_serv_cache.setex(mxdomain, 1, datetime.timedelta(days=1))
|
r_serv_cache.setex('mxdomain:{}'.format(mxdomain), 1, datetime.timedelta(days=1))
|
||||||
|
|
||||||
def check_mx_record(set_mxdomains, dns_server):
|
def check_mx_record(set_mxdomains, dns_server):
|
||||||
"""Check if emails MX domains are responding.
|
"""Check if emails MX domains are responding.
|
||||||
|
@ -110,8 +110,14 @@ def check_mx_record(set_mxdomains, dns_server):
|
||||||
print(e)
|
print(e)
|
||||||
return valid_mxdomain
|
return valid_mxdomain
|
||||||
|
|
||||||
def extract_all_emails(queue, item_content):
|
def extract_all_emails(redis_key, item_content):
|
||||||
queue.put(re.findall(email_regex, item_content))
|
all_emails = re.findall(email_regex, item_content)
|
||||||
|
if len(all_emails) > 1:
|
||||||
|
r_serv_cache.sadd(redis_key, *all_emails)
|
||||||
|
r_serv_cache.expire(redis_key, 360)
|
||||||
|
elif all_emails:
|
||||||
|
r_serv_cache.sadd(redis_key, all_emails[0])
|
||||||
|
r_serv_cache.expire(redis_key, 360)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
publisher.port = 6380
|
publisher.port = 6380
|
||||||
|
@ -132,10 +138,11 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
|
email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
|
||||||
|
|
||||||
q = Queue()
|
redis_key = 'mail_extracted:{}'.format(str(uuid.uuid4()))
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
message = p.get_from_set()
|
message = p.get_from_set()
|
||||||
|
message = 'urlextract/2020/05/06/throwbin.io_guNTS59b94413e7-a6e8-428a-b561-c9829ec77587.gz 2'
|
||||||
|
|
||||||
if message is not None:
|
if message is not None:
|
||||||
item_id, score = message.split()
|
item_id, score = message.split()
|
||||||
|
@ -144,9 +151,9 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
item_content = Item.get_item_content(item_id)
|
item_content = Item.get_item_content(item_id)
|
||||||
|
|
||||||
proc = Proc(target=extract_all_emails, args=(q, item_content))
|
proc = Proc(target=extract_all_emails, args=(redis_key, item_content, ))
|
||||||
proc.start()
|
|
||||||
try:
|
try:
|
||||||
|
proc.start()
|
||||||
proc.join(max_execution_time)
|
proc.join(max_execution_time)
|
||||||
if proc.is_alive():
|
if proc.is_alive():
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
|
@ -156,15 +163,14 @@ if __name__ == "__main__":
|
||||||
publisher.info(err_mess)
|
publisher.info(err_mess)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
all_emails = q.get()
|
all_emails = r_serv_cache.smembers(redis_key)
|
||||||
|
r_serv_cache.delete(redis_key)
|
||||||
|
proc.terminate()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print("Caught KeyboardInterrupt, terminating workers")
|
print("Caught KeyboardInterrupt, terminating workers")
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
# filtering duplicate
|
|
||||||
all_emails = set(all_emails)
|
|
||||||
|
|
||||||
# get MXdomains
|
# get MXdomains
|
||||||
set_mxdomains = set()
|
set_mxdomains = set()
|
||||||
dict_mxdomains_email = {}
|
dict_mxdomains_email = {}
|
||||||
|
|
Loading…
Reference in a new issue