chg; [Duplicates module] refactor module + DB keys

Terrtia 2022-07-13 15:10:27 +02:00
parent 8672671e51
commit 2f8a5a333a
11 changed files with 480 additions and 290 deletions


@ -218,6 +218,8 @@ def item_submit_migration():
# # TODO: change db
def tags_migration():
    pass

def items_migration():


@ -1,198 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Duplicate module
====================
This huge module is, in short term, checking duplicates.
Its input comes from other modules, namely:
Credential, CreditCard, Keys, Mails, SQLinjectionDetection, CVE and Phone
This one differ from v1 by only using redis and not json file stored on disk
Perform comparisions with ssdeep and tlsh
Requirements:
-------------
"""
import redis
import os
import time
from datetime import datetime, timedelta
import json
import ssdeep
import tlsh
from packages import Paste
from pubsublogger import publisher
from Helper import Process
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'Duplicates'
p = Process(config_section)
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep"))
threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh"))
threshold_set = {}
threshold_set['ssdeep'] = threshold_duplicate_ssdeep
threshold_set['tlsh'] = threshold_duplicate_tlsh
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
# REDIS #
dico_redis = {}
date_today = datetime.today()
for year in range(2013, date_today.year+1):
for month in range(0, 13):
dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
host=p.config.get("ARDB_DB", "host"),
port=p.config.get("ARDB_DB", "port"),
db=str(year) + str(month),
decode_responses=True)
# FUNCTIONS #
publisher.info("Script duplicate started")
while True:
try:
hash_dico = {}
dupl = set()
dico_range_list = []
x = time.time()
message = p.get_from_set()
if message is not None:
path = message
PST = Paste.Paste(path)
else:
publisher.debug("Script Attribute is idling 10s")
print('sleeping')
time.sleep(10)
continue
# the paste is too small
if (PST._get_p_size() < min_paste_size):
continue
PST._set_p_hash_kind("ssdeep")
PST._set_p_hash_kind("tlsh")
# Assignate the correct redis connexion
r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
# Creating the dico name: yyyymm
# Get the date of the range
date_range = date_today - timedelta(days = maximum_month_range*30.4166666)
num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month)
for diff_month in range(0, num_of_month+1):
curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
dico_range_list.append(to_append)
# Use all dico in range
dico_range_list = dico_range_list[0:maximum_month_range]
# UNIQUE INDEX HASHS TABLE
yearly_index = str(date_today.year)+'00'
r_serv0 = dico_redis[yearly_index]
r_serv0.incr("current_index")
index = (r_serv0.get("current_index")) + str(PST.p_date)
# Open selected dico range
opened_dico = []
for dico_name in dico_range_list:
opened_dico.append([dico_name, dico_redis[dico_name]])
# retrieve hash from paste
paste_hashes = PST._get_p_hash()
# Go throught the Database of the dico (of the month)
for curr_dico_name, curr_dico_redis in opened_dico:
for hash_type, paste_hash in paste_hashes.items():
for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type):
try:
if hash_type == 'ssdeep':
percent = 100-ssdeep.compare(dico_hash, paste_hash)
else:
percent = tlsh.diffxlen(dico_hash, paste_hash)
if percent > 100:
percent = 100
threshold_duplicate = threshold_set[hash_type]
if percent < threshold_duplicate:
percent = 100 - percent if hash_type == 'ssdeep' else percent #recovert the correct percent value for ssdeep
# Go throught the Database of the dico filter (month)
r_serv_dico = dico_redis[curr_dico_name]
# index of paste
index_current = r_serv_dico.get(dico_hash)
index_current = index_current
paste_path = r_serv_dico.get(index_current)
paste_path = paste_path
paste_date = r_serv_dico.get(index_current+'_date')
paste_date = paste_date
paste_date = paste_date if paste_date != None else "No date available"
if paste_path != None:
paste_path = paste_path.replace(PASTES_FOLDER+'/', '', 1)
if paste_path != PST.p_rel_path:
hash_dico[dico_hash] = (hash_type, paste_path, percent, paste_date)
print('['+hash_type+'] '+'comparing: ' + str(PST.p_rel_path) + ' and ' + str(paste_path) + ' percentage: ' + str(percent))
except Exception:
print('hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash)
# Add paste in DB after checking to prevent its analysis twice
# hash_type_i -> index_i AND index_i -> PST.PATH
r_serv1.set(index, PST.p_rel_path)
r_serv1.set(index+'_date', PST._get_p_date())
r_serv1.sadd("INDEX", index)
# Adding hashes in Redis
for hash_type, paste_hash in paste_hashes.items():
r_serv1.set(paste_hash, index)
#bad hash
if paste_hash == '':
print('bad Hash: ' + hash_type)
else:
r_serv1.sadd("HASHS_"+hash_type, paste_hash)
##################### Similarity found #######################
# if there is data in this dictionnary
if len(hash_dico) != 0:
# paste_tuple = (hash_type, date, paste_path, percent)
for dico_hash, paste_tuple in hash_dico.items():
dupl.add(paste_tuple)
# Creating the object attribute and save it.
to_print = 'Duplicate;{};{};{};'.format(
PST.p_source, PST.p_date, PST.p_name)
if dupl != []:
dupl = list(dupl)
PST.__setattr__("p_duplicate", dupl)
PST.save_attribute_duplicate(dupl)
PST.save_others_pastes_attribute_duplicate(dupl)
publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_rel_path))
print('{}Detected {}'.format(to_print, len(dupl)))
print('')
y = time.time()
publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
except IOError:
to_print = 'Duplicate;{};{};{};'.format(
PST.p_source, PST.p_date, PST.p_name)
print("CRC Checksum Failed on :", PST.p_rel_path)
publisher.error('{}CRC Checksum Failed'.format(to_print))
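
For contrast with the new Kvrocks layout introduced below, here is a brief recap, reconstructed purely from the removed script above, of what the legacy module stored in its per-month ARDB databases (db index = YYYYMM):

# Reconstructed from the removed script above; key names are the ones it writes.
LEGACY_ARDB_KEYS = {
    'current_index':              'counter in the yearly db (YYYY00), concatenated with the item date to build an index',
    'INDEX':                      'set of all item indexes stored for the month',
    '<index>':                    'relative path of the item',
    '<index>_date':               'date of the item',
    'HASHS_ssdeep / HASHS_tlsh':  'sets of every ssdeep / tlsh hash seen during the month',
    '<hash>':                     'index of the item that produced the hash',
}
for key, meaning in LEGACY_ARDB_KEYS.items():
    print(f'{key:30} {meaning}')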

bin/lib/Duplicate.py (new executable file, 130 additions)

@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import ssdeep
import sys
import time
import tlsh

import datetime

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader

config_loader = ConfigLoader()
r_serv_db = config_loader.get_redis_conn("Kvrocks_DB")
MIN_ITEM_SIZE = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: RENAME ME
config_loader = None

#
# Hash != Duplicates => New correlation HASH => check if same hash if duplicate == 100
#
# Object Hash => correlation decoded => don't need correlation to exist
#
# New CORRELATION => HASH
#   -> compute/get (if it exists we have a correlation) hash -> get correlation with the same hash
#
# Duplicates between different objects ?????
# Diff Decoded -> Item => Diff Item decoded - Item
#
# Duplicate domains != duplicate items

def get_ssdeep_hash(content):
    return ssdeep.hash(content)

def get_ssdeep_similarity(obj_hash, other_hash):
    return ssdeep.compare(obj_hash, other_hash)

def get_tlsh_hash(content):
    return tlsh.hash(content)

def get_tlsh_similarity(obj_hash, other_hash):
    similarity = tlsh.diffxlen(obj_hash, other_hash)
    if similarity > 100:
        similarity = 100
    similarity = 100 - similarity
    return similarity

def get_algo_similarity(algo, obj_hash, other_hash):
    if algo == 'ssdeep':
        return get_ssdeep_similarity(obj_hash, other_hash)
    elif algo == 'tlsh':
        return get_tlsh_similarity(obj_hash, other_hash)

def get_algo_hashs_by_month(algo, date_ymonth):
    return r_serv_db.hkeys(f'duplicates:hashs:{algo}:{date_ymonth}')

def exists_algo_hash_by_month(algo, hash, date_ymonth):
    return r_serv_db.hexists(f'duplicates:hashs:{algo}:{date_ymonth}', hash)

def get_object_id_by_hash(algo, hash, date_ymonth):
    return r_serv_db.hget(f'duplicates:hashs:{algo}:{date_ymonth}', hash)

def save_object_hash(algo, date_ymonth, hash, obj_id):
    r_serv_db.hset(f'duplicates:hashs:{algo}:{date_ymonth}', hash, obj_id)

def get_duplicates(obj_type, subtype, id):
    dict_dup = {}
    duplicates = r_serv_db.smembers(f'obj:duplicates:{obj_type}:{subtype}:{id}')
    for str_dup in duplicates:
        similarity, algo, id = str_dup.split(':', 2)
        if not dict_dup.get(id):
            dict_dup[id] = []
        dict_dup[id].append({'algo': algo, 'similarity': int(similarity)})
    return dict_dup

def _add_obj_duplicate(algo, similarity, obj_type, subtype, id, id_2):
    r_serv_db.sadd(f'obj:duplicates:{obj_type}:{subtype}:{id}', f'{similarity}:{algo}:{id_2}')

def add_obj_duplicate(algo, hash, similarity, obj_type, subtype, id, date_ymonth):
    obj2_id = get_object_id_by_hash(algo, hash, date_ymonth)
    # same content: propagate the 100% duplicate to all known exact duplicates of this object
    if similarity == 100:
        dups = get_duplicates(obj_type, subtype, id)
        for dup_id in dups:
            for algo_dict in dups[dup_id]:
                if algo_dict['similarity'] == 100 and algo_dict['algo'] == algo:
                    _add_obj_duplicate(algo, similarity, obj_type, subtype, id, dup_id)
                    _add_obj_duplicate(algo, similarity, obj_type, subtype, dup_id, id)
    _add_obj_duplicate(algo, similarity, obj_type, subtype, id, obj2_id)
    _add_obj_duplicate(algo, similarity, obj_type, subtype, obj2_id, id)

def get_last_x_month_dates(nb_months):
    now = datetime.datetime.now()
    result = [now.strftime("%Y%m")]
    for x in range(0, nb_months):
        now = now.replace(day=1) - datetime.timedelta(days=1)
        result.append(now.strftime("%Y%m"))
    return result

if __name__ == '__main__':
    res = get_last_x_month_dates(7)
    print(res)

#################################
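
For orientation, the helpers above read and write two Kvrocks key families; the following minimal sketch (the item id, content and month are made up, and it assumes a configured AIL environment) shows how a caller would use them:

# Key families written by lib/Duplicate.py:
#   duplicates:hashs:{algo}:{YYYYMM}          hash field -> object id (one hash per object and month)
#   obj:duplicates:{obj_type}:{subtype}:{id}  set of '{similarity}:{algo}:{other_id}'
from lib import Duplicate

item_id = 'submitted/2022/07/13/example.gz'   # hypothetical item id
content = b'some item content'                # hypothetical content
date_ymonth = '202207'

ssdeep_hash = Duplicate.get_ssdeep_hash(content)
if Duplicate.exists_algo_hash_by_month('ssdeep', ssdeep_hash, date_ymonth):
    # exact same ssdeep hash already recorded this month -> 100% duplicate
    Duplicate.add_obj_duplicate('ssdeep', ssdeep_hash, 100, 'item', '', item_id, date_ymonth)
Duplicate.save_object_hash('ssdeep', date_ymonth, ssdeep_hash, item_id)

# {other_id: [{'algo': 'ssdeep', 'similarity': 100}, ...]}
print(Duplicate.get_duplicates('item', '', item_id))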


@ -54,6 +54,16 @@ def is_crawled(item_id):
def get_item_domain(item_id):
    return item_id[19:-36]

+def get_item_content_binary(item_id):
+    item_full_path = os.path.join(PASTES_FOLDER, item_id)
+    try:
+        with gzip.open(item_full_path, 'rb') as f:
+            item_content = f.read()
+    except Exception as e:
+        print(e)
+        item_content = ''
+    return item_content

def get_item_content(item_id):
    item_full_path = os.path.join(PASTES_FOLDER, item_id)
    try:


@ -91,11 +91,14 @@ class Item(AbstractObject):
        else:
            return filename

-    def get_content(self):
+    def get_content(self, binary=False):
        """
        Returns Item content
        """
-        return item_basic.get_item_content(self.id)
+        if binary:
+            return item_basic.get_item_content_binary(self.id)
+        else:
+            return item_basic.get_item_content(self.id)

    def get_raw_content(self):
        filepath = self.get_filename()
@ -110,15 +113,34 @@ class Item(AbstractObject):
            content = base64.b64encode(content)
        return content.decode()

+    def get_html2text_content(self, content=None, ignore_links=False):
+        if not content:
+            content = self.get_content()
+        h = html2text.HTML2Text()
+        h.ignore_links = ignore_links
+        h.ignore_images = ignore_links
+        return h.handle(content)

+    def get_size(self, str=False):
+        size = os.path.getsize(self.get_filename())/1024.0
+        if str:
+            size = round(size, 2)
+        return size

    def get_ail_2_ail_payload(self):
        payload = {'raw': self.get_gzip_content(b64=True)}
        return payload

-    def set_origin(self): # set_parent ?
-        pass
+    def set_father(self, father_id): # UPDATE KEYS ?????????????????????????????
+        r_serv_metadata.sadd(f'paste_children:{father_id}', self.id)
+        r_serv_metadata.hset(f'paste_metadata:{self.id}', 'father', father_id)

+    #f'obj:children:{obj_type}:{subtype}:{id}, {obj_type}:{subtype}:{id}
+    #f'obj:metadata:{obj_type}:{subtype}:{id}', 'father', fathe
+    # => ON Object LEVEL ?????????

+    def add_duplicate(self):
+        pass

    def sanitize_id(self):
        pass
@ -150,18 +172,25 @@ class Item(AbstractObject):
    # origin
    # duplicate -> all item iterations ???
    #
-    def create(self, content, tags, origin=None, duplicate=None):
-        self.save_on_disk(content, binary=True, compressed=False, base64=False)
+    def create(self, content, tags, father=None, duplicates=[], _save=True):
+        if _save:
+            self.save_on_disk(content, binary=True, compressed=False, base64=False)

        # # TODO:
        # for tag in tags:
        #     self.add_tag(tag)

-        if origin:
+        if father:
            pass
+        for obj_id in duplicates:
+            for dup in duplicates[obj_id]:
+                self.add_duplicate(obj_id, dup['algo'], dup['similarity'])
-        if duplicate:
-            pass

    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
    # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ...
@ -204,6 +233,80 @@ class Item(AbstractObject):
    def exist_correlation(self):
        pass

+    def is_crawled(self):
+        return self.id.startswith('crawled')

+    # if is_crawled
+    def get_domain(self):
+        return self.id[19:-36]

+    def get_screenshot(self):
+        s = r_serv_metadata.hget(f'paste_metadata:{self.id}', 'screenshot')
+        if s:
+            return os.path.join(s[0:2], s[2:4], s[4:6], s[6:8], s[8:10], s[10:12], s[12:])

+    def get_har(self):
+        har_path = os.path.join(har_directory, self.id) + '.json'
+        if os.path.isfile(har_path):
+            return har_path
+        else:
+            return None

+    def get_url(self):
+        return r_serv_metadata.hget(f'paste_metadata:{self.id}', 'real_link')

+    # options: set of optional meta fields
+    def get_meta(self, options=set()):
+        meta = {}
+        meta['id'] = self.id
+        meta['date'] = self.get_date(separator=True) ############################ # TODO:
+        meta['source'] = self.get_source()
+        meta['tags'] = self.get_tags()
+        # optional meta fields
+        if 'content' in options:
+            meta['content'] = self.get_content()
+        if 'crawler' in options:
+            if self.is_crawled():
+                tags = meta.get('tags')
+                meta['crawler'] = self.get_meta_crawler(tags=tags)
+        if 'duplicates' in options:
+            meta['duplicates'] = self.get_duplicates()
+        if 'lines' in options:
+            content = meta.get('content')
+            meta['lines'] = self.get_meta_lines(content=content)
+        if 'size' in options:
+            meta['size'] = self.get_size(str=True)
+        # # TODO: ADD GET FATHER
+        # meta['encoding'] = None
+        return meta

+    def get_meta_crawler(self, tags=[]):
+        crawler = {}
+        if self.is_crawled():
+            crawler['domain'] = self.get_domain()
+            crawler['har'] = self.get_har()
+            crawler['screenshot'] = self.get_screenshot()
+            crawler['url'] = self.get_url()
+            if not tags:
+                tags = self.get_tags()
+            crawler['is_tags_safe'] = Tag.is_tags_safe(tags)
+        return crawler

+    def get_meta_lines(self, content=None):
+        if not content:
+            content = self.get_content()
+        max_length = 0
+        line_id = 0
+        nb_line = 0
+        for line in content.splitlines():
+            length = len(line)
+            if length > max_length:
+                max_length = length
+            nb_line += 1
+        return {'nb': nb_line, 'max_length': max_length}

    ############################################################################
    ############################################################################
@ -547,7 +650,7 @@ def get_item_list_desc(list_item_id):
def is_crawled(item_id):
    return item_basic.is_crawled(item_id)

-def get_crawler_matadata(item_id, ltags=None):
+def get_crawler_matadata(item_id, tags=None):
    dict_crawler = {}
    if is_crawled(item_id):
        dict_crawler['domain'] = get_item_domain(item_id)
@ -759,5 +862,7 @@ def delete_domain_node(item_id):
if __name__ == '__main__':
    content = 'test file content'
+    duplicates = {'tests/2020/01/02/test.gz': [{'algo':'ssdeep', 'similarity':75}, {'algo':'tlsh', 'similarity':45}]}
    item = Item('tests/2020/01/02/test_save.gz')
-    item.save_on_disk(content, binary=False)
+    item.create(content, _save=False)
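
A short usage sketch of the reworked Item API (the item id is hypothetical; the options set mirrors the one used by the Flask blueprint further down in this diff):

# Illustrative only: exercising the new Item methods added above.
from lib.objects.Items import Item

item = Item('submitted/2022/07/13/example.gz')

raw = item.get_content(binary=True)    # gzip-decompressed bytes, suitable for ssdeep/tlsh hashing
text = item.get_content()              # decoded text, unchanged behaviour

# One call now gathers everything show_item.html needs
meta = item.get_meta(options={'content', 'crawler', 'duplicates', 'lines', 'size'})
print(meta['size'], meta['lines'])     # e.g. 1.42 {'nb': 42, 'max_length': 120}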


@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN'])
# Import Project packages
##################################
from packages import Tag
+from lib import Duplicate
from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations
from lib.Tracker import is_obj_tracked, get_obj_all_trackers, delete_obj_trackers
@ -69,6 +70,9 @@ class AbstractObject(ABC):
            tags = set(tags)
        return tags

+    def get_duplicates(self):
+        return Duplicate.get_duplicates(self.type, self.get_subtype(r_str=True), self.id)

    ## ADD TAGS ????
    #def add_tags(self):


@ -113,14 +113,39 @@ class AbstractSubtypeObject(AbstractObject):
        if date > last_seen:
            self.set_last_seen(date)

-    def add(self, date):
+    def add(self, date, item_id):
        self.update_correlation_daterange()
        # daily
        r_metadata.hincrby(f'{self.type}:{self.subtype}:{date}', self.id, 1)
        # all type
        r_metadata.zincrby(f'{self.type}_all:{self.subtype}', self.id, 1)

+        #######################################################################
+        #######################################################################
+        # REPLACE WITH CORRELATION ?????
+        # global set
+        r_serv_metadata.sadd(f'set_{self.type}_{self.subtype}:{self.id}', item_id)

+        ## object_metadata
+        # item
+        r_serv_metadata.sadd(f'item_{self.type}_{self.subtype}:{item_id}', self.id)

+        # new correlation
+        #
+        # How to filter by correlation type ????
+        #
+        # f'correlation:obj:{self.type}:{self.subtype}:{self.id}', f'{obj_type}:{obj_subtype}:{obj_id}'
+        # f'correlation:obj:{self.type}:{self.subtype}:{obj_type}:{self.id}', f'{obj_subtype}:{obj_id}'
+        #
+        #
+        #

    # # domain
@ -128,6 +153,9 @@ class AbstractSubtypeObject(AbstractObject):
    # domain = item_basic.get_item_domain(item_id)
    # self.save_domain_correlation(domain, subtype, obj_id)

+    def create(self, first_seen, last_seen):
+        pass

    def _delete(self):
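
To make the key shapes touched by add() concrete, a tiny illustration with made-up identifiers (the object type, subtype, ids and date are hypothetical):

# Illustrative only: the keys AbstractSubtypeObject.add(date, item_id) writes for one call.
obj_type, subtype, obj_id = 'pgp', 'key', '0x1234ABCD'
item_id = 'submitted/2022/07/13/example.gz'
date = '20220713'

print(f'{obj_type}:{subtype}:{date}')          # daily counter hash (hincrby)
print(f'{obj_type}_all:{subtype}')             # all-time sorted set (zincrby)
print(f'set_{obj_type}_{subtype}:{obj_id}')    # object -> items set (candidate for replacement by correlations)
print(f'item_{obj_type}_{subtype}:{item_id}')  # item -> objects set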

bin/modules/Duplicates.py (new executable file, 108 additions)

@ -0,0 +1,108 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Duplicates module
=====================

In short, this module checks incoming items for duplicates.
Its input comes from other modules, namely:
    Credential

Performs comparisons with ssdeep and tlsh.

"""

import redis
import os
import sys
import time
#from datetime import datetime, timedelta
import datetime

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import Duplicate
from lib.objects.Items import Item


class Duplicates(AbstractModule):
    """Duplicates module."""

    def __init__(self):
        super(Duplicates, self).__init__()

        config_loader = ConfigLoader()
        THRESHOLD_SSDEEP = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_ssdeep')
        THRESHOLD_TLSH = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_tlsh')
        self.min_item_size = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: # FIXME: rename me
        self.maximum_month_range = config_loader.get_config_int('Modules_Duplicates', 'maximum_month_range')

        self.algos = {
            "ssdeep": {"threshold": THRESHOLD_SSDEEP},
            "tlsh": {"threshold": THRESHOLD_TLSH}
        }

        self.redis_logger.info(f"Module: {self.module_name} Launched")

    def compute(self, message):
        # IOError: "CRC Checksum Failed on : {id}"

        item = Item(message)

        # Check file size
        if item.get_size() < self.min_item_size:
            return None

        # current month (yyyymm)
        curr_date_ymonth = datetime.datetime.now().strftime("%Y%m")
        last_month_dates = Duplicate.get_last_x_month_dates(self.maximum_month_range)

        x = time.time()

        # Get Hashes
        content = item.get_content(binary=True)
        self.algos['ssdeep']['hash'] = Duplicate.get_ssdeep_hash(content)
        self.algos['tlsh']['hash'] = Duplicate.get_tlsh_hash(content)

        # TODO: Handle computed duplicates

        nb_duplicates = 0

        for algo in self.algos:
            obj_hash = self.algos[algo]['hash']
            for date_ymonth in last_month_dates:
                if Duplicate.exists_algo_hash_by_month(algo, obj_hash, date_ymonth):
                    Duplicate.add_obj_duplicate(algo, obj_hash, 100, 'item', '', item.get_id(), date_ymonth)
                    nb_duplicates += 1
                else:
                    for hash in Duplicate.get_algo_hashs_by_month(algo, date_ymonth):
                        # # FIXME: try - catch 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
                        similarity = Duplicate.get_algo_similarity(algo, obj_hash, hash)
                        print(f'[{algo}] comparing: {obj_hash} and {hash} similarity: {similarity}') # DEBUG:
                        if similarity >= self.algos[algo]['threshold']:
                            Duplicate.add_obj_duplicate(algo, hash, similarity, 'item', '', item.get_id(), date_ymonth)
                            nb_duplicates += 1

            # Save Hashes
            Duplicate.save_object_hash(algo, curr_date_ymonth, self.algos[algo]['hash'], item.get_id())

        if nb_duplicates:
            self.redis_logger.info(f'Duplicate;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {nb_duplicates};{item.get_id()}')

        y = time.time()
        print(f'{item.get_id()} Processed in {y-x} sec')
        #self.redis_logger.debug('{}Processed in {} sec'.format(to_print, y-x))


if __name__ == "__main__":
    module = Duplicates()
    module.run()
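
As a self-contained illustration of the per-algorithm similarity test the module applies (the threshold values below are assumed examples; AIL reads the real ones from the Modules_Duplicates config section):

# Standalone sketch: compute ssdeep/tlsh similarity between two blobs and apply a threshold.
import random
import ssdeep
import tlsh

THRESHOLDS = {'ssdeep': 50, 'tlsh': 52}   # assumed example values, not taken from this diff

def similarity(algo, hash_a, hash_b):
    """Return a 0-100 similarity score between two already-computed hashes."""
    if algo == 'ssdeep':
        return ssdeep.compare(hash_a, hash_b)          # already 0-100, higher = more similar
    diff = min(tlsh.diffxlen(hash_a, hash_b), 100)     # tlsh: 0 = identical, larger = more different
    return 100 - diff                                  # invert so that higher = more similar

random.seed(0)
blob_a = bytes(random.getrandbits(8) for _ in range(2048))
blob_b = blob_a[:-64] + bytes(random.getrandbits(8) for _ in range(64))   # ~97% identical content

for algo, hash_func in (('ssdeep', ssdeep.hash), ('tlsh', tlsh.hash)):
    score = similarity(algo, hash_func(blob_a), hash_func(blob_b))
    verdict = 'duplicate' if score >= THRESHOLDS[algo] else 'not a duplicate'
    print(f'{algo}: similarity {score} -> {verdict}')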


@ -66,15 +66,15 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R
[CreditCards]
subscribe = Redis_CreditCards
-publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags
+publish = Redis_ModuleStats,Redis_Tags

[BankAccount]
subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[Mail]
subscribe = Redis_Mail
-publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags
+publish = Redis_ModuleStats,Redis_Tags

[Onion]
subscribe = Redis_Onion

@ -92,11 +92,11 @@ publish = Redis_Url

[LibInjection]
subscribe = Redis_Url
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[SQLInjectionDetection]
subscribe = Redis_Url
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[ModuleStats]
subscribe = Redis_ModuleStats

@ -128,31 +128,31 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags

[Cve]
subscribe = Redis_Cve
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[Phone]
subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[Keys]
subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_PgpDump,Redis_Tags
+publish = Redis_PgpDump,Redis_Tags

[PgpDump]
subscribe = Redis_PgpDump
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[ApiKey]
subscribe = Redis_ApiKey
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[Decoder]
subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[Bitcoin]
subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[submit_paste]
subscribe = Redis

@ -164,7 +164,8 @@ publish = Redis_Mixer,Redis_Tags

[IP]
subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

[Zerobins]
subscribe = Redis_Url


@ -15,12 +15,15 @@ from flask_login import login_required, current_user
# Import Role_Manager
from Role_Manager import login_admin, login_analyst, login_read_only

-sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
-import Item
-import Tag
-sys.path.append(os.path.join(os.environ['AIL_BIN'], 'export'))
-import Export
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from lib import item_basic
+from lib.objects.Items import Item
+from export import Export
+from packages import Tag

# ============ BLUEPRINT ============
objects_item = Blueprint('objects_item', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/item'))
@ -38,28 +41,22 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
@login_read_only
def showItem(): # # TODO: support post
    item_id = request.args.get('id')
-    if not item_id or not Item.exist_item(item_id):
+    if not item_id or not item_basic.exist_item(item_id):
        abort(404)

-    dict_item = {}
-    dict_item['id'] = item_id
-    dict_item['name'] = dict_item['id'].replace('/', ' / ')
-    dict_item['father'] = Item.get_item_parent(item_id)
-    dict_item['content'] = Item.get_item_content(item_id)
-    dict_item['metadata'] = Item.get_item_metadata(item_id, item_content=dict_item['content'])
-    dict_item['tags'] = Tag.get_obj_tag(item_id)
-    #dict_item['duplicates'] = Item.get_item_nb_duplicates(item_id)
-    dict_item['duplicates'] = Item.get_item_duplicates_dict(item_id)
-    dict_item['crawler'] = Item.get_crawler_matadata(item_id, ltags=dict_item['tags'])
+    item = Item(item_id)
+    meta = item.get_meta(options=set(['content', 'crawler', 'duplicates', 'lines', 'size']))
+    meta['name'] = meta['id'].replace('/', ' / ')
+    meta['father'] = item_basic.get_item_parent(item_id)

    ## EXPORT SECTION
    # # TODO: ADD in Export SECTION
-    dict_item['hive_case'] = Export.get_item_hive_cases(item_id)
+    meta['hive_case'] = Export.get_item_hive_cases(item_id)

    return render_template("show_item.html", bootstrap_label=bootstrap_label,
-                            modal_add_tags=Tag.get_modal_add_tags(dict_item['id'], object_type='item'),
+                            modal_add_tags=Tag.get_modal_add_tags(meta['id'], object_type='item'),
                            is_hive_connected=Export.get_item_hive_cases(item_id),
-                            dict_item=dict_item)
+                            meta=meta)

# kvrocks data
@ -74,24 +71,27 @@ def showItem(): # # TODO: support post
@login_read_only
def html2text(): # # TODO: support post
    item_id = request.args.get('id')
-    if not item_id or not Item.exist_item(item_id):
+    if not item_id or not item_basic.exist_item(item_id):
        abort(404)
-    return Item.get_item_content_html2text(item_id)
+    item = Item(item_id)
+    return item.get_html2text_content()

@objects_item.route("/object/item/raw_content")
@login_required
@login_read_only
def item_raw_content(): # # TODO: support post
    item_id = request.args.get('id')
-    if not item_id or not Item.exist_item(item_id):
+    if not item_id or not item_basic.exist_item(item_id):
        abort(404)
-    return Response(Item.get_item_content(item_id), mimetype='text/plain')
+    item = Item(item_id)
+    return Response(item.get_content(), mimetype='text/plain')

@objects_item.route("/object/item/download")
@login_required
@login_read_only
def item_download(): # # TODO: support post
    item_id = request.args.get('id')
-    if not item_id or not Item.exist_item(item_id):
+    if not item_id or not item_basic.exist_item(item_id):
        abort(404)
-    return send_file(Item.get_raw_content(item_id), attachment_filename=item_id, as_attachment=True)
+    item = Item(item_id)
+    return send_file(item.get_raw_content(), attachment_filename=item_id, as_attachment=True)


@ -38,7 +38,7 @@
<div class="card my-2 mx-2"> <div class="card my-2 mx-2">
<div class="card-header bg-dark"> <div class="card-header bg-dark">
<h3 class="text-white text-center" >{{ dict_item['name'] }}</h3> <h3 class="text-white text-center" >{{ meta['name'] }}</h3>
</div> </div>
<div class="card-body pb-1"> <div class="card-body pb-1">
<table class="table table-condensed"> <table class="table table-condensed">
@ -46,7 +46,7 @@
<tr> <tr>
<th>Date</th> <th>Date</th>
<th>Source</th> <th>Source</th>
<th>Encoding</th> <!-- <th>Encoding</th> -->
<th>Size (Kb)</th> <th>Size (Kb)</th>
<th>Number of lines</th> <th>Number of lines</th>
<th>Max line length</th> <th>Max line length</th>
@ -54,12 +54,12 @@
</thead> </thead>
<tbody> <tbody>
<tr> <tr>
<td>{{ dict_item['metadata']['date'] }}</td> <td>{{ meta['date'] }}</td>
<td>{{ dict_item['metadata']['source'] }}</td> <td>{{ meta['source'] }}</td>
<td>{{ dict_item['metadata']['encoding'] }}</td> <!-- <td>{{ meta['encoding'] }}</td> -->
<td>{{ dict_item['metadata']['size'] }}</td> <td>{{ meta['size'] }}</td>
<td>{{ dict_item['metadata']['lines']['nb'] }}</td> <td>{{ meta['lines']['nb'] }}</td>
<td>{{ dict_item['metadata']['lines']['max_length'] }}</td> <td>{{ meta['lines']['max_length'] }}</td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
@ -68,9 +68,9 @@
<h5> <h5>
<div> <div>
{% include 'modals/edit_tag.html' %} {% include 'modals/edit_tag.html' %}
{% for tag in dict_item['tags'] %} {% for tag in meta['tags'] %}
<button class="btn btn-{{ bootstrap_label[loop.index0 % 5] }}" data-toggle="modal" data-target="#edit_tags_modal" <button class="btn btn-{{ bootstrap_label[loop.index0 % 5] }}" data-toggle="modal" data-target="#edit_tags_modal"
data-tagid="{{ tag }}" data-objtype="item" data-objid="{{ dict_item['id'] }}"> data-tagid="{{ tag }}" data-objtype="item" data-objid="{{ meta['id'] }}">
{{ tag }} {{ tag }}
</button> </button>
@ -84,21 +84,21 @@
</h5>
</div>

-{% if dict_item['father'] %}
+{% if meta['father'] %}
<div class="mt-3">
-Father: <a href="{{ url_for('objects_item.showItem')}}?id={{dict_item['father']}}" target="_blank">{{dict_item['father']}}</a>
+Father: <a href="{{ url_for('objects_item.showItem')}}?id={{meta['father']}}" target="_blank">{{meta['father']}}</a>
</div>
{% endif %}

<div class="d-flex flex-row-reverse bd-highlight">
<div>
-<a href="{{ url_for('correlation.show_correlation')}}?object_type=paste&correlation_id={{ dict_item['id'] }}&correlation_objects=paste" target="_blank">
+<a href="{{ url_for('correlation.show_correlation')}}?object_type=paste&correlation_id={{ meta['id'] }}&correlation_objects=paste" target="_blank">
<button class="btn btn-lg btn-info"><i class="fas fa-project-diagram"></i> Correlations Graph
</button>
</a>
</div>
<div>
-{% with obj_type='item', obj_id=dict_item['id'], obj_subtype=''%}
+{% with obj_type='item', obj_id=meta['id'], obj_subtype=''%}
{% include 'modals/investigations_register_obj.html' %}
{% endwith %}
<div class="mr-2">

@ -108,7 +108,7 @@

</div>
</div>
<div class="mx-2">
-{% with obj_type='item', obj_id=dict_item['id'], obj_lvl=0%}
+{% with obj_type='item', obj_id=meta['id'], obj_lvl=0%}
{% include 'import_export/block_add_user_object_to_export.html' %}
{% endwith %}
</div>

@ -134,14 +134,14 @@

</div>
{% endif %}

-{% if dict_item['hive_case'] %}
+{% if meta['hive_case'] %}
<div class="list-group" id="misp_event">
<li class="list-group-item active">The Hive Case already Created</li>
<a target="_blank" href="{{ hive_url }}" class="list-group-item">{{ hive_url }}</a>
</div>
{% endif %}

-{% if dict_item['duplicates'] != 0 %}
+{% if meta['duplicates'] != 0 %}
<div id="accordionDuplicate" class="mb-2 mx-3">
<div class="card">
<div class="card-header py-1" id="headingDuplicate">

@ -149,7 +149,7 @@

<div class="col-11">
<div class="mt-2">
<i class="far fa-clone"></i> duplicates&nbsp;&nbsp;
-<div class="badge badge-warning">{{dict_item['duplicates']|length}}</div>
+<div class="badge badge-warning">{{meta['duplicates']|length}}</div>
</div>
</div>
<div class="col-1">
@ -173,19 +173,19 @@
</tr>
</thead>
<tbody>
-{% for duplicate_id in dict_item['duplicates'] %}
+{% for duplicate_id in meta['duplicates'] %}
<tr>
-<td>{{dict_item['duplicates'][duplicate_id]['date']}}</td>
+<td>{{meta['duplicates'][duplicate_id]['date']}}</td>
<td class="py-0">
<table class="table table-borderless table-sm my-0">
<tbody>
-{%for algo in dict_item['duplicates'][duplicate_id]['algo']|sort()%}
+{%for dict_algo in meta['duplicates'][duplicate_id]|sort(attribute='algo')%}
<tr>
-<td class="py-0">{{algo}}</td>
+<td class="py-0">{{dict_algo['algo']}}</td>
<td class="w-100 py-0">
<div class="progress mt-1">
-<div class="progress-bar progress-bar-striped {%if algo=='tlsh'%}bg-secondary{%endif%}" role="progressbar" style="width: {{dict_item['duplicates'][duplicate_id]['algo'][algo]}}%;" aria-valuenow="{{dict_item['duplicates'][duplicate_id]['algo'][algo]}}" aria-valuemin="0" aria-valuemax="100">
-{{dict_item['duplicates'][duplicate_id]['algo'][algo]}}%
+<div class="progress-bar progress-bar-striped {%if dict_algo['algo']=='tlsh'%}bg-secondary{%endif%}" role="progressbar" style="width: {{dict_algo['similarity']}}%;" aria-valuenow="{{dict_algo['similarity']}}" aria-valuemin="0" aria-valuemax="100">
+{{dict_algo['similarity']}}%
</div>
</div>
</td>

@ -200,7 +200,7 @@

</a>
</td>
<td>
-<a target="_blank" href="{{ url_for('showsavedpastes.showDiff') }}?s1={{dict_item['id']}}&s2={{duplicate_id}}" class="fa fa-columns" title="Show diff"></a>
+<a target="_blank" href="{{ url_for('showsavedpastes.showDiff') }}?s1={{meta['id']}}&s2={{duplicate_id}}" class="fa fa-columns" title="Show diff"></a>
</td>
</tr>
{% endfor %}

@ -261,7 +261,7 @@

{% endif %}

-{% if dict_item['crawler'] %}
+{% if meta['crawler'] %}
<div id="accordionCrawler" class="mb-3 mx-3">
<div class="card">
<div class="card-header py-1" id="headingCrawler">

@ -294,18 +294,18 @@

<tr>
<td><i class="far fa-file"></i></td>
<td>
-<a class="badge" target="_blank" href="{{ url_for('objects_item.showItem', paste=dict_item['father']) }}" />{{ dict_item['father'] }}</a>
+<a class="badge" target="_blank" href="{{ url_for('objects_item.showItem', paste=meta['father']) }}" />{{ meta['father'] }}</a>
</td>
</tr>
<td><i class="fab fa-html5"></i></td>
<td>
-<a class="badge" target="_blank" href="{{ url_for('crawler_splash.showDomain', domain=dict_item['crawler']['domain']) }}" />{{ dict_item['crawler']['domain'] }}</a>
+<a class="badge" target="_blank" href="{{ url_for('crawler_splash.showDomain', domain=meta['crawler']['domain']) }}" />{{ meta['crawler']['domain'] }}</a>
</td>
</tr>
<tr>
<td>url</td>
<td>
-{{ dict_item['crawler']['url'] }}
+{{ meta['crawler']['url'] }}
</td>
</tr>
</tbody>
@ -318,11 +318,11 @@
<div class="card-body py-2"> <div class="card-body py-2">
<div class="row"> <div class="row">
<div class="col-md-8"> <div class="col-md-8">
<input class="custom-range mt-2" id="blocks" type="range" min="1" max="50" value="{%if dict_item['crawler']['is_tags_safe']%}13{%else%}0{%endif%}"> <input class="custom-range mt-2" id="blocks" type="range" min="1" max="50" value="{%if meta['crawler']['is_tags_safe']%}13{%else%}0{%endif%}">
</div> </div>
<div class="col-md-4"> <div class="col-md-4">
<button class="btn {%if dict_item['crawler']['is_tags_safe']%}btn-primary{%else%}btn-danger{%endif%}" onclick="blocks.value=50;pixelate();"> <button class="btn {%if meta['crawler']['is_tags_safe']%}btn-primary{%else%}btn-danger{%endif%}" onclick="blocks.value=50;pixelate();">
{%if dict_item['crawler']['is_tags_safe']%} {%if meta['crawler']['is_tags_safe']%}
<i class="fas fas fa-plus-square"></i> <i class="fas fas fa-plus-square"></i>
{%else%} {%else%}
<i class="fas fa-exclamation-triangle"></i> <i class="fas fa-exclamation-triangle"></i>
@ -358,8 +358,8 @@
<li class="nav-item dropdown"> <li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" data-toggle="dropdown" href="#">Others</a> <a class="nav-link dropdown-toggle" data-toggle="dropdown" href="#">Others</a>
<div class="dropdown-menu"> <div class="dropdown-menu">
<a class="dropdown-item" href="{{ url_for('objects_item.item_raw_content', id=dict_item['id']) }}"><i class="far fa-file"></i> &nbsp;Raw Content</a> <a class="dropdown-item" href="{{ url_for('objects_item.item_raw_content', id=meta['id']) }}"><i class="far fa-file"></i> &nbsp;Raw Content</a>
<a class="dropdown-item" href="{{ url_for('objects_item.item_download', id=dict_item['id']) }}"><i class="fas fa-download"></i> &nbsp;Download</i></a> <a class="dropdown-item" href="{{ url_for('objects_item.item_download', id=meta['id']) }}"><i class="fas fa-download"></i> &nbsp;Download</i></a>
</div> </div>
</li> </li>
</ul> </ul>
@ -367,7 +367,7 @@
<div class="tab-content" id="pills-tabContent"> <div class="tab-content" id="pills-tabContent">
<div class="tab-pane fade show active" id="pills-content" role="tabpanel" aria-labelledby="pills-content-tab"> <div class="tab-pane fade show active" id="pills-content" role="tabpanel" aria-labelledby="pills-content-tab">
<p class="my-0"> <pre class="border">{{ dict_item['content'] }}</pre></p> <p class="my-0"> <pre class="border">{{ meta['content'] }}</pre></p>
</div> </div>
<div class="tab-pane fade" id="pills-html2text" role="tabpanel" aria-labelledby="pills-html2text-tab"> <div class="tab-pane fade" id="pills-html2text" role="tabpanel" aria-labelledby="pills-html2text-tab">
<p class="my-0"> <pre id="html2text-container" class="border"></pre></p> <p class="my-0"> <pre id="html2text-container" class="border"></pre></p>
@ -393,7 +393,7 @@
$('#pills-html2text-tab').on('shown.bs.tab', function (e) { $('#pills-html2text-tab').on('shown.bs.tab', function (e) {
if ($('#html2text-container').is(':empty')){ if ($('#html2text-container').is(':empty')){
$.get("{{ url_for('objects_item.html2text') }}?id={{ dict_item['id'] }}").done(function(data){ $.get("{{ url_for('objects_item.html2text') }}?id={{ meta['id'] }}").done(function(data){
$('#html2text-container').text(data); $('#html2text-container').text(data);
}); });
@ -401,7 +401,7 @@
}); });
</script> </script>
{% if dict_item['crawler'] %} {% if meta['crawler'] %}
<script> <script>
var ctx = canvas.getContext('2d'), img = new Image(); var ctx = canvas.getContext('2d'), img = new Image();
@ -413,7 +413,7 @@
img.addEventListener("error", img_error); img.addEventListener("error", img_error);
var draw_img = false; var draw_img = false;
img.src = "{{ url_for('showsavedpastes.screenshot', filename=dict_item['crawler']['screenshot']) }}"; img.src = "{{ url_for('showsavedpastes.screenshot', filename=meta['crawler']['screenshot']) }}";
function pixelate() { function pixelate() {