From 20727fff7737c404ca70404f577cb107f44245b7 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 19 May 2021 16:57:20 +0200 Subject: [PATCH] chg: [Categ] tests + docs --- bin/ApiKey.py | 4 +- bin/Categ.py | 95 ++++++++++++++++++------------------------- tests/test_modules.py | 15 ++++++- 3 files changed, 55 insertions(+), 59 deletions(-) diff --git a/bin/ApiKey.py b/bin/ApiKey.py index 06e7d6f2..2b946386 100755 --- a/bin/ApiKey.py +++ b/bin/ApiKey.py @@ -43,7 +43,7 @@ class ApiKey(AbstractModule): # Send module state to logs self.redis_logger.info(f"Module {self.module_name} initialized") - def compute(self, message, r_match=False): + def compute(self, message, r_result=False): id, score = message.split() item = Item(id) item_content = item.get_content() @@ -82,7 +82,7 @@ class ApiKey(AbstractModule): # Send to duplicate self.send_message_to_queue('Duplicate', item.get_id()) - if r_match: + if r_result: return (google_api_key, aws_access_key, aws_secret_key) if __name__ == "__main__": diff --git a/bin/Categ.py b/bin/Categ.py index a8efd12b..f0a24442 100755 --- a/bin/Categ.py +++ b/bin/Categ.py @@ -4,19 +4,16 @@ The ZMQ_PubSub_Categ Module ============================ -This module is consuming the Redis-list created by the ZMQ_PubSub_Tokenize_Q -Module. - Each words files created under /files/ are representing categories. This modules take these files and compare them to -the stream of data given by the ZMQ_PubSub_Tokenize_Q Module. +the content of an item. -When a word from a paste match one or more of these words file, the filename of -the paste is published/forwarded to the next modules. +When a word from an item matches one or more of these words file, the filename of +the item / the item id is published/forwarded to the next modules. Each category (each files) are representing a dynamic channel. 
This mean that if you create 1000 files under /files/ you'll have 1000 channels -where every time there is a matching word to a category, the paste containing +where every time there is a matching word to a category, the item containing this word will be pushed to this specific channel. ..note:: The channel will have the name of the file created. @@ -25,15 +22,11 @@ Implementing modules can start here, create your own category file, and then create your own module to treat the specific paste matching this category. -..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put -the same Subscriber name in both of them. - Requirements ------------ *Need running Redis instances. (Redis) *Categories files of words in /files/ need to be created -*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly. """ @@ -42,16 +35,13 @@ Requirements ################################## import os import argparse -import time import re ################################## # Import Project packages ################################## from module.abstract_module import AbstractModule -from pubsublogger import publisher -from packages import Paste -from Helper import Process +from packages.Item import Item class Categ(AbstractModule): @@ -59,73 +49,66 @@ class Categ(AbstractModule): Categ module for AIL framework """ - def __init__(self): + def __init__(self, categ_files_dir='../files/'): """ Init Categ """ super(Categ, self).__init__() + self.categ_files_dir = categ_files_dir + + # default = 1 string self.matchingThreshold = self.process.config.getint("Categ", "matchingThreshold") - # SCRIPT PARSER # - parser = argparse.ArgumentParser(description='Start Categ module on files.') - - parser.add_argument( - '-d', type=str, default="../files/", - help='Path to the directory containing the category files.', - action='store') - - args = parser.parse_args() - + self.reload_categ_words() self.redis_logger.info("Script Categ started") + # # TODO: trigger reload 
on change ( save last reload time, ...) + def reload_categ_words(self): categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve', 'ApiKey'] tmp_dict = {} for filename in categories: bname = os.path.basename(filename) tmp_dict[bname] = [] - with open(os.path.join(args.d, filename), 'r') as f: + with open(os.path.join(self.categ_files_dir, filename), 'r') as f: patterns = [r'%s' % ( re.escape(s.strip()) ) for s in f] tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE) + self.categ_words = tmp_dict.items() - self.categ_items = tmp_dict.items() + def compute(self, message, r_result=False): + # Create Item Object + item = Item(message) + # Get item content + content = item.get_content() + categ_found = [] - prec_filename = None - - - def compute(self, message): - # Cast message as paste - paste = Paste.Paste(message) - # Get paste content - content = paste.get_p_content() - - # init categories found - is_categ_found = False - - # Search for pattern categories in paste content - for categ, pattern in self.categ_items: + # Search for pattern categories in item content + for categ, pattern in self.categ_words: found = set(re.findall(pattern, content)) lenfound = len(found) if lenfound >= self.matchingThreshold: - is_categ_found = True - msg = '{} {}'.format(paste.p_rel_path, lenfound) + categ_found.append(categ) + msg = f'{item.get_id()} {lenfound}' - self.redis_logger.debug('%s;%s %s'%(self.module_name, msg, categ)) - # Export message to categ queue - self.process.populate_set_out(msg, categ) + print(msg, categ) + self.send_message_to_queue(categ, msg) self.redis_logger.info( - 'Categ;{};{};{};Detected {} as {};{}'.format( - paste.p_source, paste.p_date, paste.p_name, - lenfound, categ, paste.p_rel_path)) - - if not is_categ_found: - self.redis_logger.debug('No %s found in this paste: %s'%(self.module_name, paste.p_name)) - + f'Categ;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {lenfound} as {categ};{item.get_id()}') + 
if r_result: + return categ_found if __name__ == '__main__': - - module = Categ() + + # SCRIPT PARSER # + parser = argparse.ArgumentParser(description='Start Categ module on files.') + parser.add_argument( + '-d', type=str, default="../files/", + help='Path to the directory containing the category files.', + action='store') + args = parser.parse_args() + + module = Categ(categ_files_dir=args.d) module.run() diff --git a/tests/test_modules.py b/tests/test_modules.py index 73594a5c..cf9b1f11 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -9,6 +9,7 @@ sys.path.append(os.environ['AIL_BIN']) # Modules Classes from ApiKey import ApiKey +from Categ import Categ from Onion import Onion # project packages @@ -25,11 +26,23 @@ class Test_Module_ApiKey(unittest.TestCase): aws_access_key = 'AKIAIOSFODNN7EXAMPLE' aws_secret_key = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' - matches = self.module_obj.compute(f'{item_id} 3', r_match=True) + matches = self.module_obj.compute(f'{item_id} 3', r_result=True) self.assertCountEqual(matches[0], [google_api_key]) self.assertCountEqual(matches[1], [aws_access_key]) self.assertCountEqual(matches[2], [aws_secret_key]) +class Test_Module_Categ(unittest.TestCase): + + def setUp(self): + self.module_obj = Categ() + + def test_module(self): + item_id = 'tests/2021/01/01/categ.gz' + test_categ = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve'] + + result = self.module_obj.compute(item_id, r_result=True) + self.assertCountEqual(result, test_categ) + class Test_Module_Onion(unittest.TestCase): def setUp(self):