mirror of
https://github.com/ail-project/ail-framework.git
synced 2025-01-18 08:26:15 +00:00
chg: [modules] crawl pasties domains
This commit is contained in:
parent
f05c7b6a93
commit
0cb7431e10
6 changed files with 177 additions and 72 deletions
|
@ -83,6 +83,7 @@ class ConfigLoader(object):
|
|||
else:
|
||||
return []
|
||||
|
||||
|
||||
# # # # Directory Config # # # #
|
||||
|
||||
config_loader = ConfigLoader()
|
||||
|
|
|
@ -113,6 +113,34 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30):
|
|||
proc.terminate()
|
||||
sys.exit(0)
|
||||
|
||||
def _regex_match(r_key, regex, content):
    """Worker body for regex_match().

    When ``regex`` matches at the start of ``content`` (``re.match``), the
    cache key ``r_key`` is set to 1 and given a 360 second expiry so the
    caller can pick the result up. Nothing is written when there is no match.
    """
    if not re.match(regex, content):
        return
    r_serv_cache.set(r_key, 1)
    r_serv_cache.expire(r_key, 360)
|
||||
|
||||
def regex_match(r_key, regex, item_id, content, max_time=30):
    """Run ``re.match(regex, content)`` in a ``Proc`` worker with a time limit.

    :param r_key: cache key the worker uses to report a successful match
    :param regex: pattern handed to the worker
    :param item_id: identifier used only in the timeout log message
    :param content: text to match against
    :param max_time: seconds to wait for the worker before terminating it
    :return: True if the worker reported a match, False on no match or timeout
    """
    proc = Proc(target=_regex_match, args=(r_key, regex, content))
    try:
        proc.start()
        proc.join(max_time)
        if not proc.is_alive():
            # Worker finished in time: the key's presence is the result.
            # The key is removed in both cases so it cannot leak into a
            # later call that reuses the same r_key.
            matched = bool(r_serv_cache.exists(r_key))
            r_serv_cache.delete(r_key)
            return matched

        # Worker is still running after max_time: give up on this item.
        proc.terminate()
        # Statistics.incr_module_timeout_statistic(r_key)
        err_mess = f"{r_key}: processing timeout: {item_id}"
        logger.info(err_mess)
        return False
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        proc.terminate()
        sys.exit(0)
|
||||
|
||||
def _regex_search(r_key, regex, content):
|
||||
if re.search(regex, content):
|
||||
r_serv_cache.set(r_key, 1)
|
||||
|
|
144
bin/modules/Pasties.py
Executable file
144
bin/modules/Pasties.py
Executable file
|
@ -0,0 +1,144 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
"""
|
||||
The Pasties Module
|
||||
======================
|
||||
This module spots domain-pasties services for further processing
|
||||
"""
|
||||
|
||||
##################################
|
||||
# Import External packages
|
||||
##################################
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
from pyfaup.faup import Faup
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
##################################
|
||||
# Import Project packages
|
||||
##################################
|
||||
from modules.abstract_module import AbstractModule
|
||||
from lib.ConfigLoader import ConfigLoader
|
||||
from lib import crawlers
|
||||
|
||||
# TODO add url validator
|
||||
|
||||
# NOTE(review): these module-level containers are not read by the Pasties class
# visible in this file, which keeps its own state in self.pasties and
# self.urls_blocklist - confirm whether they are still needed.
pasties_blocklist_urls = set()
|
||||
pasties_domains = {}
|
||||
|
||||
class Pasties(AbstractModule):
    """
    Pasties module for AIL framework

    Consumes ``"<url> <item_id>"`` messages and queues a crawler task for URLs
    hosted on one of the configured pasties domains, unless the URL is
    blocklisted or only points at the root of a configured service.
    """

    def __init__(self):
        super().__init__()
        self.faup = Faup()

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        # host -> set of path prefixes (each ending with '/'); an empty set
        # means the service is registered without a sub-path
        self.pasties = {}
        # '<host><path><query>' strings (scheme removed) that are never crawled
        self.urls_blocklist = set()
        self.load_pasties_domains()

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def _decode_url(self, url):
        """Parse *url* with faup and return ``(host, path, query)``.

        ``path`` and ``query`` are normalised to ``''`` when faup reports them
        as missing or None, so callers can use string methods and f-strings
        without ending up with ``AttributeError`` or a literal ``'None'``.
        ``host`` is returned exactly as faup reports it.
        """
        self.faup.decode(url)
        url_decoded = self.faup.get()
        host = url_decoded['host']
        # if url_decoded.get('port', ''):
        #     host = f'{host}:{url_decoded["port"]}'
        path = url_decoded.get('resource_path') or ''
        query = url_decoded.get('query_string') or ''
        return host, path, query

    @staticmethod
    def _strip_scheme(url):
        """Return *url* without a leading ``http://`` or ``https://``."""
        for scheme in ('http://', 'https://'):
            if url.startswith(scheme):
                return url[len(scheme):]
        return url

    def load_pasties_domains(self):
        """(Re)load the pasties domains and the URL blocklist from disk.

        Reads ``$AIL_HOME/files/domains_pasties`` into ``self.pasties`` and
        ``$AIL_HOME/files/domains_pasties_blacklist`` into
        ``self.urls_blocklist``. A missing file leaves the corresponding
        container empty. Blank lines and lines without a host are ignored.
        """
        self.pasties = {}
        self.urls_blocklist = set()

        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
        if os.path.exists(domains_pasties):
            with open(domains_pasties) as f:
                for line in f:
                    url = line.strip()
                    if not url:  # TODO validate line
                        continue
                    host, path, _ = self._decode_url(url)
                    if not host:
                        continue
                    # Register the host even when the line has no sub-path.
                    prefixes = self.pasties.setdefault(host, set())
                    if path and path != '/':
                        # Prefixes always end with '/' so that '/paste/' does
                        # not also match '/pastebin'.
                        if not path.endswith('/'):
                            path = f'{path}/'
                        prefixes.add(path)

        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
        if os.path.exists(url_blocklist):
            with open(url_blocklist) as f:
                for line in f:
                    url = line.strip()
                    if not url:
                        continue
                    host, path, query = self._decode_url(url)
                    if not host:
                        continue
                    self.urls_blocklist.add(f'{host}{path}{query}')

    def send_to_crawler(self, url, obj_id):
        """Queue a crawler task for *url* unless it was queued recently.

        A cache marker ``<module_name>:url:<url>`` with an 86400 second expiry
        prevents the same URL from being queued again while the marker exists.
        """
        cache_key = f'{self.module_name}:url:{url}'
        if self.r_cache.exists(cache_key):
            return
        self.r_cache.set(cache_key, int(time.time()))
        self.r_cache.expire(cache_key, 86400)
        crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)

    def compute(self, message):
        """Handle one ``"<url> <item_id>"`` message.

        The URL is sent to the crawler when its host is a known pasties
        domain, it is not blocklisted, and its path goes beyond the root of
        the service (or beyond the configured path prefix).
        """
        url, item_id = message.split()

        url_host, path, _ = self._decode_url(url)
        # print(url_decoded)
        if url_host not in self.pasties:
            return None

        if self._strip_scheme(url) in self.urls_blocklist:
            return None

        prefixes = self.pasties[url_host]
        if not prefixes:
            # Service registered without a sub-path: anything but the
            # landing page is a candidate paste.
            if path and path != '/':
                print('send to crawler', url_host, url)
                self.send_to_crawler(url, item_id)
            return None

        # Same path with the trailing '/' toggled, so the bare prefix is
        # recognised with or without it.
        path_end = path[:-1] if path.endswith('/') else f'{path}/'
        for url_path in prefixes:
            if path.startswith(url_path) and url_path != path and url_path != path_end:
                print('send to crawler', url_path, url)
                self.send_to_crawler(url, item_id)
                break
        return None
|
||||
|
||||
|
||||
# Script entry point: build the Pasties module and start it.
if __name__ == '__main__':
    module = Pasties()
    module.run()
|
|
@ -1,71 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
"""
|
||||
The Zerobins Module
|
||||
======================
|
||||
This module spots zerobins-like services for further processing
|
||||
"""
|
||||
|
||||
##################################
|
||||
# Import External packages
|
||||
##################################
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
##################################
|
||||
# Import Project packages
|
||||
##################################
|
||||
from modules.abstract_module import AbstractModule
|
||||
from lib import crawlers
|
||||
|
||||
|
||||
class Zerobins(AbstractModule):
    """
    Zerobins module for AIL framework

    Consumes ``"<url> <item_id>"`` messages and queues a crawler task for
    URLs that look like a zerobin/privatebin service.
    """

    def __init__(self):
        super(Zerobins, self).__init__()

        binz = [
            # historical ones
            # The original pattern '(zerobin||privatebin)' contained an empty
            # alternative, so 'https://.<anything>' matched as well. The group
            # is also non-capturing now: with a capturing group, re.findall
            # returns the group text instead of the full URL.
            # NOTE(review): presumably self.regex_findall behaves like
            # re.findall - confirm against the helper.
            r'^https:\/\/(?:zerobin|privatebin)\..*$',
        ]

        self.regex = re.compile('|'.join(binz))

        # Pending time between two computation (computeNone) in seconds
        self.pending_seconds = 10

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def computeNone(self):
        """
        Compute when no message in queue
        """
        self.logger.debug("No message in queue")

    def compute(self, message):
        """
        Compute a message in queue

        Every URL returned by the regex helper is sent to the crawler.
        """
        url, item_id = message.split()

        # Extract zerobins addresses
        matching_binz = self.regex_findall(self.regex, item_id, url)

        # Iterating an empty result is a no-op, so no length check is needed.
        for bin_url in matching_binz:
            print(f'send {bin_url} to crawler')
            # TODO Change priority ???
            crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
                                 parent='manual', priority=60)

        self.logger.debug("Compute message in queue")
|
||||
|
||||
|
||||
# Script entry point: build the Zerobins module and start it.
if __name__ == '__main__':
    module = Zerobins()
    module.run()
|
|
@ -92,6 +92,9 @@ class AbstractModule(ABC):
|
|||
def get_available_queues(self):
    """Return the output queues reported by this module's queue object."""
    out_queues = self.queue.get_out_queues()
    return out_queues
|
||||
|
||||
def regex_match(self, regex, obj_id, content):
    """Delegate to ``regex_helper.regex_match`` for this module.

    The module's ``r_cache_key`` is passed as the cache key and its
    ``max_execution_time`` as the ``max_time`` limit; the helper's result
    is returned unchanged.
    """
    matcher = regex_helper.regex_match
    cache_key = self.r_cache_key
    timeout = self.max_execution_time
    return matcher(cache_key, regex, obj_id, content, max_time=timeout)
|
||||
|
||||
def regex_search(self, regex, obj_id, content):
    """Delegate to ``regex_helper.regex_search`` for this module.

    The module's ``r_cache_key`` is passed as the cache key and its
    ``max_execution_time`` as the ``max_time`` limit; the helper's result
    is returned unchanged.
    """
    searcher = regex_helper.regex_search
    cache_key = self.r_cache_key
    timeout = self.max_execution_time
    return searcher(cache_key, regex, obj_id, content, max_time=timeout)
|
||||
|
||||
|
|
|
@ -162,7 +162,7 @@ publish = Importers,Tags
|
|||
subscribe = Item
|
||||
publish = Tags
|
||||
|
||||
[Zerobins]
|
||||
[Pasties]
|
||||
subscribe = Url
|
||||
|
||||
# [My_Module_Name]
|
||||
|
|
Loading…
Add table
Reference in a new issue