chg: [Categ] tests + docs

This commit is contained in:
Terrtia 2021-05-19 16:57:20 +02:00
parent 4a9bda2ee8
commit 20727fff77
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 55 additions and 59 deletions

View file

@ -43,7 +43,7 @@ class ApiKey(AbstractModule):
# Send module state to logs
self.redis_logger.info(f"Module {self.module_name} initialized")
def compute(self, message, r_match=False):
def compute(self, message, r_result=False):
id, score = message.split()
item = Item(id)
item_content = item.get_content()
@ -82,7 +82,7 @@ class ApiKey(AbstractModule):
# Send to duplicate
self.send_message_to_queue('Duplicate', item.get_id())
if r_match:
if r_result:
return (google_api_key, aws_access_key, aws_secret_key)
if __name__ == "__main__":

View file

@ -4,19 +4,16 @@
The ZMQ_PubSub_Categ Module
============================
This module is consuming the Redis-list created by the ZMQ_PubSub_Tokenize_Q
Module.
Each word file created under /files/ represents a category.
This module takes these files and compares them to
the stream of data given by the ZMQ_PubSub_Tokenize_Q Module.
the content of an item.
When a word from a paste match one or more of these words file, the filename of
the paste is published/forwarded to the next modules.
When a word from an item matches one or more of these word files, the filename of
the item / the item id is published/forwarded to the next modules.
Each category (each file) represents a dynamic channel.
This means that if you create 1000 files under /files/ you'll have 1000 channels
where every time there is a matching word to a category, the paste containing
where every time there is a matching word to a category, the item containing
this word will be pushed to this specific channel.
..note:: The channel will have the name of the file created.
@ -25,15 +22,11 @@ Implementing modules can start here, create your own category file,
and then create your own module to treat the specific paste matching this
category.
..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.
Requirements
------------
*Need running Redis instances. (Redis)
*Categories files of words in /files/ need to be created
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.
"""
@ -42,16 +35,13 @@ Requirements
##################################
import os
import argparse
import time
import re
##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from pubsublogger import publisher
from packages import Paste
from Helper import Process
from packages.Item import Item
class Categ(AbstractModule):
@ -59,73 +49,66 @@ class Categ(AbstractModule):
Categ module for AIL framework
"""
def __init__(self):
def __init__(self, categ_files_dir='../files/'):
"""
Init Categ
"""
super(Categ, self).__init__()
self.categ_files_dir = categ_files_dir
# default = 1 string
self.matchingThreshold = self.process.config.getint("Categ", "matchingThreshold")
# SCRIPT PARSER #
parser = argparse.ArgumentParser(description='Start Categ module on files.')
parser.add_argument(
'-d', type=str, default="../files/",
help='Path to the directory containing the category files.',
action='store')
args = parser.parse_args()
self.reload_categ_words()
self.redis_logger.info("Script Categ started")
# # TODO: trigger reload on change ( save last reload time, ...)
def reload_categ_words(self):
categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve', 'ApiKey']
tmp_dict = {}
for filename in categories:
bname = os.path.basename(filename)
tmp_dict[bname] = []
with open(os.path.join(args.d, filename), 'r') as f:
with open(os.path.join(self.categ_files_dir, filename), 'r') as f:
patterns = [r'%s' % ( re.escape(s.strip()) ) for s in f]
tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE)
self.categ_words = tmp_dict.items()
self.categ_items = tmp_dict.items()
def compute(self, message, r_result=False):
# Create Item Object
item = Item(message)
# Get item content
content = item.get_content()
categ_found = []
prec_filename = None
def compute(self, message):
# Cast message as paste
paste = Paste.Paste(message)
# Get paste content
content = paste.get_p_content()
# init categories found
is_categ_found = False
# Search for pattern categories in paste content
for categ, pattern in self.categ_items:
# Search for pattern categories in item content
for categ, pattern in self.categ_words:
found = set(re.findall(pattern, content))
lenfound = len(found)
if lenfound >= self.matchingThreshold:
is_categ_found = True
msg = '{} {}'.format(paste.p_rel_path, lenfound)
self.redis_logger.debug('%s;%s %s'%(self.module_name, msg, categ))
categ_found.append(categ)
msg = f'{item.get_id()} {lenfound}'
# Export message to categ queue
self.process.populate_set_out(msg, categ)
print(msg, categ)
self.send_message_to_queue(categ, msg)
self.redis_logger.info(
'Categ;{};{};{};Detected {} as {};{}'.format(
paste.p_source, paste.p_date, paste.p_name,
lenfound, categ, paste.p_rel_path))
if not is_categ_found:
self.redis_logger.debug('No %s found in this paste: %s'%(self.module_name, paste.p_name))
f'Categ;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {lenfound} as {categ};{item.get_id()}')
if r_result:
return categ_found
if __name__ == '__main__':
module = Categ()
# SCRIPT PARSER #
parser = argparse.ArgumentParser(description='Start Categ module on files.')
parser.add_argument(
'-d', type=str, default="../files/",
help='Path to the directory containing the category files.',
action='store')
args = parser.parse_args()
module = Categ(categ_files_dir=args.d)
module.run()

View file

@ -9,6 +9,7 @@ sys.path.append(os.environ['AIL_BIN'])
# Modules Classes
from ApiKey import ApiKey
from Categ import Categ
from Onion import Onion
# project packages
@ -25,11 +26,23 @@ class Test_Module_ApiKey(unittest.TestCase):
aws_access_key = 'AKIAIOSFODNN7EXAMPLE'
aws_secret_key = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'
matches = self.module_obj.compute(f'{item_id} 3', r_match=True)
matches = self.module_obj.compute(f'{item_id} 3', r_result=True)
self.assertCountEqual(matches[0], [google_api_key])
self.assertCountEqual(matches[1], [aws_access_key])
self.assertCountEqual(matches[2], [aws_secret_key])
class Test_Module_Categ(unittest.TestCase):
    """Check the categories reported by the Categ module for a test item."""

    def setUp(self):
        # unittest calls setUp before each test method, so every test
        # works on its own Categ instance.
        self.module_obj = Categ()

    def test_module(self):
        """compute() with r_result=True must return the expected categories."""
        expected_categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve']
        categ_item_id = 'tests/2021/01/01/categ.gz'

        found_categories = self.module_obj.compute(categ_item_id, r_result=True)

        # assertCountEqual compares contents regardless of order.
        self.assertCountEqual(found_categories, expected_categories)
class Test_Module_Onion(unittest.TestCase):
def setUp(self):