#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The ZMQ_PubSub_Categ Module
============================

Each word file created under /files/ represents a category.
This module takes these files and compares them to the stream of data.

When a word from a paste matches one or more of these word files, the filename
of the paste is published/forwarded to the next modules.

Each category (each file) represents a dynamic channel.
This means that if you create 1000 files under /files/ you'll have 1000
channels where, every time there is a word matching a category, the paste
containing this word will be pushed to this specific channel.

.. note:: The channel will have the name of the file created.

Implementing modules can start here: create your own category file, and then
create your own module to treat the specific paste matching this category.

Requirements
------------

*Category files of words in /files/ need to be created

"""
import os
import argparse
import time
import re

from pubsublogger import publisher

from packages import Paste
from Helper import Process


def _compile_category_regex(lines):
    """Build one case-insensitive regex matching any word in *lines*.

    Each line is stripped and escaped, so words are matched literally.
    Blank lines are ignored: an empty alternative in the pattern would match
    the empty string at every position and pollute the set of found words.

    :param lines: iterable of strings, one word per item (e.g. an open file).
    :return: the compiled pattern, or None when *lines* holds no usable word.
    """
    stripped = (line.strip() for line in lines)
    words = [re.escape(word) for word in stripped if word]
    if not words:
        return None
    return re.compile('|'.join(words), re.IGNORECASE)


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Categ'

    p = Process(config_section)
    matchingThreshold = p.config.getint("Categ", "matchingThreshold")

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(description='Start Categ module on files.')

    parser.add_argument(
        '-d', type=str, default="../files/",
        help='Path to the directory containing the category files.',
        action='store')

    args = parser.parse_args()

    # FUNCTIONS #
    publisher.info("Script Categ started")

    categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve', 'ApiKey']

    # Map each category name (also its output channel) to its compiled regex.
    tmp_dict = {}
    for filename in categories:
        bname = os.path.basename(filename)
        with open(os.path.join(args.d, filename), 'r') as f:
            pattern = _compile_category_regex(f)
        if pattern is None:
            # A category without words can never match; registering an empty
            # pattern would instead match every paste.
            publisher.warning(
                'Categ;Category file {} contains no words, skipped'.format(bname))
            continue
        tmp_dict[bname] = pattern

    while True:
        filename = p.get_from_set()
        if filename is None:
            # Nothing queued: idle before polling again.
            publisher.debug("Script Categ is Idling 10s")
            print('Sleeping')
            time.sleep(10)
            continue

        paste = Paste.Paste(filename)
        content = paste.get_p_content()

        for categ, pattern in tmp_dict.items():
            # Count distinct matched strings, not the raw number of matches.
            found = set(pattern.findall(content))
            if len(found) >= matchingThreshold:
                msg = '{} {}'.format(paste.p_path, len(found))

                print(msg, categ)
                p.populate_set_out(msg, categ)

                publisher.info(
                    'Categ;{};{};{};Detected {} as {};{}'.format(
                        paste.p_source, paste.p_date, paste.p_name,
                        len(found), categ, paste.p_path))