Improve Duplicate + tlsh + add timeout handler + debug + clean

Terrtia 2018-05-02 17:07:10 +02:00
parent 92977201fd
commit c52caebe7c
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
20 changed files with 133 additions and 64 deletions

View file

@@ -32,17 +32,8 @@ def search_api_key(message):
aws_secret_key = regex_aws_secret_key.findall(content)
google_api_key = regex_google_api_key.findall(content)
print(aws_access_key)
print(aws_secret_key)
print(google_api_key)
if(len(aws_access_key) > 0 or len(aws_secret_key) > 0 or len(google_api_key) > 0):
print('-------------------------------')
print(aws_access_key)
print(aws_secret_key)
print(google_api_key)
to_print = 'ApiKey;{};{};{};'.format(
paste.p_source, paste.p_date, paste.p_name)
if(len(google_api_key) > 0):
@@ -99,5 +90,4 @@ if __name__ == "__main__":
else:
publisher.debug("Script ApiKey is Idling 10s")
#print('Sleeping')
time.sleep(10)

View file

@@ -20,6 +20,16 @@ from hashlib import sha1
import magic
import json
import signal
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
def search_base64(content, message):
find = False
@@ -88,6 +98,7 @@ if __name__ == '__main__':
# Setup the I/O queues
p = Process(config_section)
max_execution_time = p.config.getint("Base64", "max_execution_time")
# Send a description of the module to the logger
publisher.info("Base64 started")
@@ -105,14 +116,21 @@ if __name__ == '__main__':
time.sleep(1)
continue
# Do something with the message from the queue
filename = message
paste = Paste.Paste(filename)
content = paste.get_p_content()
#print(filename)
search_base64(content,message)
signal.alarm(max_execution_time)
try:
# Do something with the message from the queue
#print(filename)
content = paste.get_p_content()
search_base64(content,message)
# (Optional) Send that thing to the next queue
#p.populate_set_out(something_has_been_done)
# (Optional) Send that thing to the next queue
#p.populate_set_out(something_has_been_done)
except TimeoutException:
print ("{0} processing timeout".format(paste.p_path))
continue
else:
signal.alarm(0)
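
The pattern added above is the standard Unix way to bound the execution time of a blocking call: register a SIGALRM handler that raises a custom exception, arm the alarm before the slow call, and disarm it afterwards. A minimal, self-contained sketch of the same idea (the wrapper function and the 60-second default are illustrative, not code from the commit):

import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

# SIGALRM can only be caught in the main thread
signal.signal(signal.SIGALRM, timeout_handler)

def run_with_timeout(func, arg, max_execution_time=60):
    signal.alarm(max_execution_time)  # arm: SIGALRM fires after N seconds
    try:
        return func(arg)
    except TimeoutException:
        print("{0} processing timeout".format(arg))
        return None
    finally:
        signal.alarm(0)  # disarm so a pending alarm cannot fire later

The commit disarms the alarm in an else: branch, which only runs on success; a finally: as above also clears it on the timeout path, which avoids any stray alarm when the loop continues.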

View file

@@ -74,18 +74,9 @@ if __name__ == "__main__":
filepath, count = message.split(' ')
#if count < minTopPassList:
# Less than 5 matches from the top password list, false positive.
#print("false positive:", count)
#continue
paste = Paste.Paste(filepath)
content = paste.get_p_content()
creds = set(re.findall(regex_cred, content))
print(len(creds))
print(creds)
print(content)
print('-----')
publisher.warning('to_print')

View file

@@ -24,10 +24,11 @@ def main():
config_section = 'DomClassifier'
p = Process(config_section)
addr_dns = p.config.get("DomClassifier", "dns")
publisher.info("""ZMQ DomainClassifier is Running""")
c = DomainClassifier.domainclassifier.Extract(rawtext="")
c = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns])
cc = p.config.get("DomClassifier", "cc")
cc_tld = p.config.get("DomClassifier", "cc_tld")
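
DomainClassifier accepts the resolver list directly, so the module now validates domains against the DNS server taken from the config instead of the library default. A short sketch of the wiring (the IP is the illustrative default this commit adds to the config file):

import DomainClassifier.domainclassifier

addr_dns = '8.8.8.8'  # normally p.config.get("DomClassifier", "dns")
c = DomainClassifier.domainclassifier.Extract(rawtext="",
                                              nameservers=[addr_dns])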

View file

@@ -62,7 +62,7 @@ if __name__ == "__main__":
while True:
try:
hash_dico = {}
dupl = []
dupl = set()
dico_range_list = []
x = time.time()
@@ -124,6 +124,8 @@ if __name__ == "__main__":
percent = 100-ssdeep.compare(dico_hash, paste_hash)
else:
percent = tlsh.diffxlen(dico_hash, paste_hash)
if percent > 100:
percent = 100
threshold_duplicate = threshold_set[hash_type]
if percent < threshold_duplicate:
@@ -163,14 +165,16 @@ if __name__ == "__main__":
if len(hash_dico) != 0:
# paste_tuple = (hash_type, date, paste_path, percent)
for dico_hash, paste_tuple in hash_dico.items():
dupl.append(paste_tuple)
dupl.add(paste_tuple)
# Creating the object attribute and save it.
to_print = 'Duplicate;{};{};{};'.format(
PST.p_source, PST.p_date, PST.p_name)
if dupl:
dupl = list(dupl)
PST.__setattr__("p_duplicate", dupl)
PST.save_attribute_redis("p_duplicate", dupl)
PST.save_others_pastes_attribute_duplicate("p_duplicate", dupl)
publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_path))
print('{}Detected {}'.format(to_print, len(dupl)))
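
Two fixes interact here. First, ssdeep.compare returns a 0-100 similarity score while tlsh.diffxlen returns an open-ended distance (0 = identical), so the tlsh value must be capped at 100 before it is treated as a percentage. Second, turning dupl into a set deduplicates paste tuples before they are saved. A sketch of the comparison step under those assumptions (package names assumed: ssdeep and python-tlsh):

import ssdeep
import tlsh

def distance_percent(hash_type, dico_hash, paste_hash):
    if hash_type == 'ssdeep':
        # ssdeep gives similarity (100 = identical); invert into a distance
        percent = 100 - ssdeep.compare(dico_hash, paste_hash)
    else:
        # tlsh gives an open-ended distance; cap it at 100
        percent = tlsh.diffxlen(dico_hash, paste_hash)
    return min(percent, 100)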

View file

@@ -29,8 +29,9 @@ from Helper import Process
import magic
import io
import gzip
#import gzip
'''
def gunzip_bytes_obj(bytes_obj):
in_ = io.BytesIO()
in_.write(bytes_obj)
@@ -38,7 +39,7 @@ def gunzip_bytes_obj(bytes_obj):
with gzip.GzipFile(fileobj=in_, mode='rb') as fo:
gunzipped_bytes_obj = fo.read()
return gunzipped_bytes_obj.decode()
return gunzipped_bytes_obj.decode()'''
if __name__ == '__main__':
publisher.port = 6380
@@ -80,7 +81,7 @@ if __name__ == '__main__':
# Creating the full filepath
filename = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "pastes"), paste)
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
@@ -89,7 +90,7 @@ if __name__ == '__main__':
with open(filename, 'wb') as f:
f.write(decoded)
try:
'''try:
decoded2 = gunzip_bytes_obj(decoded)
except:
decoded2 =''
@@ -101,8 +102,7 @@ if __name__ == '__main__':
print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
print(filename)
print(type)
print(decoded2)
print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
'''
p.populate_set_out(filename)
processed_paste+=1

View file

@@ -27,6 +27,7 @@ def search_key(paste):
find = False
if '-----BEGIN PGP MESSAGE-----' in content:
publisher.warning('{} has a PGP enc message'.format(paste.p_name))
find = True
if '-----BEGIN CERTIFICATE-----' in content:
@@ -35,18 +36,27 @@ def search_key(paste):
if '-----BEGIN RSA PRIVATE KEY-----' in content:
publisher.warning('{} has a RSA private key message'.format(paste.p_name))
print('rsa private key message found')
find = True
if '-----BEGIN PRIVATE KEY-----' in content:
publisher.warning('{} has a private key message'.format(paste.p_name))
print('private key message found')
find = True
if '-----BEGIN ENCRYPTED PRIVATE KEY-----' in content:
publisher.warning('{} has an encrypted private key message'.format(paste.p_name))
print('encrypted private key message found')
find = True
if '-----BEGIN OPENSSH PRIVATE KEY-----' in content:
publisher.warning('{} has an openssh private key message'.format(paste.p_name))
print('openssh private key message found')
find = True
if '-----BEGIN OpenVPN Static key V1-----' in content:
publisher.warning('{} has an OpenVPN static key message'.format(paste.p_name))
print('OpenVPN Static key message found')
find = True
if '-----BEGIN DSA PRIVATE KEY-----' in content:
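
As the growing chain of marker checks above suggests, the detection is plain substring matching on well-known PEM headers, so it could also be written table-driven. The following is an editorial sketch, not code from the commit (marker strings are taken from the diff; publisher is the pubsublogger instance the module already uses):

from pubsublogger import publisher

PEM_MARKERS = {
    '-----BEGIN PGP MESSAGE-----': 'PGP enc message',
    '-----BEGIN RSA PRIVATE KEY-----': 'RSA private key message',
    '-----BEGIN PRIVATE KEY-----': 'private key message',
    '-----BEGIN ENCRYPTED PRIVATE KEY-----': 'encrypted private key message',
    '-----BEGIN OPENSSH PRIVATE KEY-----': 'openssh private key message',
    '-----BEGIN OpenVPN Static key V1-----': 'OpenVPN static key message',
}

def search_key(paste):
    content = paste.get_p_content()
    find = False
    for marker, label in PEM_MARKERS.items():
        if marker in content:
            publisher.warning('{} has a {}'.format(paste.p_name, label))
            find = True
    return find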

View file

@@ -130,7 +130,7 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "Lines" bash -c 'python3 Lines.py; read x'
sleep 0.1
#screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c './DomClassifier.py; read x'
screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c './DomClassifier.py; read x'
sleep 0.1
screen -S "Script_AIL" -X screen -t "Categ" bash -c 'python3 Categ.py; read x'
sleep 0.1

View file

@@ -28,6 +28,7 @@ if __name__ == "__main__":
config_section = 'Mail'
p = Process(config_section)
addr_dns = p.config.get("Mail", "dns")
# REDIS #
r_serv2 = redis.StrictRedis(
@@ -56,7 +57,7 @@ if __name__ == "__main__":
if prec_filename is None or filename != prec_filename:
PST = Paste.Paste(filename)
MX_values = lib_refine.checking_MX_record(
r_serv2, PST.get_regex(email_regex))
r_serv2, PST.get_regex(email_regex), addr_dns)
if MX_values[0] >= 1:

View file

@@ -29,6 +29,7 @@ def get_date_range(num_day):
def compute_most_posted(server, message):
print(message)
module, num, keyword, paste_date = message.split(';')
redis_progression_name_set = 'top_'+ module +'_set_' + paste_date

View file

@@ -617,7 +617,9 @@ def fetchQueueData():
for moduleNum in server.smembers(keySet):
moduleNum = moduleNum.decode('utf8')
value = ( server.get(key + str(moduleNum)) ).decode('utf8')
complete_paste_path = ( server.get(key + str(moduleNum) + "_PATH") ).decode('utf8')
complete_paste_path = ( server.get(key + str(moduleNum) + "_PATH") )
if(complete_paste_path is not None):
complete_paste_path = complete_paste_path.decode('utf8')
COMPLETE_PASTE_PATH_PER_PID[moduleNum] = complete_paste_path
if value is not None:
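
The fix above is the usual redis-py defensive pattern: GET returns None for a missing key, so the bytes must only be decoded when a value actually came back. The same guard as a one-expression sketch:

raw = server.get(key + str(moduleNum) + "_PATH")
complete_paste_path = raw.decode('utf8') if raw is not None else None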

View file

@@ -76,9 +76,6 @@ if __name__ == "__main__":
dico_regex, dico_regexname_to_redis = refresh_dicos()
print('dico got refreshed')
print(dico_regex)
print(dico_regexname_to_redis)
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))

View file

@@ -6,6 +6,16 @@ from pubsublogger import publisher
from Helper import Process
import re
import signal
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
'''
This module takes its input from the global module.
It applies some regex and publish matched content
@@ -16,6 +26,7 @@ if __name__ == "__main__":
publisher.channel = "Script"
config_section = "Release"
p = Process(config_section)
max_execution_time = p.config.getint("Curve", "max_execution_time")
publisher.info("Release scripts to find release names")
movie = "[a-zA-Z0-9.]+\.[0-9]{4}.[a-zA-Z0-9.]+\-[a-zA-Z]+"
@@ -35,13 +46,22 @@ if __name__ == "__main__":
paste = Paste.Paste(filepath)
content = paste.get_p_content()
releases = set(re.findall(regex, content))
if len(releases) == 0:
continue
to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_path)
print(to_print)
if len(releases) > 30:
publisher.warning(to_print)
signal.alarm(max_execution_time)
try:
releases = set(re.findall(regex, content))
if len(releases) == 0:
continue
to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_path)
print(to_print)
if len(releases) > 30:
publisher.warning(to_print)
else:
publisher.info(to_print)
except TimeoutException:
print ("{0} processing timeout".format(paste.p_path))
continue
else:
publisher.info(to_print)
signal.alarm(0)

View file

@@ -329,6 +329,27 @@ class Paste(object):
else:
self.store.hset(self.p_path, attr_name, json.dumps(value))
def save_others_pastes_attribute_duplicate(self, attr_name, list_value):
"""
Save a new duplicate on others pastes
"""
for hash_type, path, percent, date in list_value:
print(hash_type, path, percent, date)
#get json
json_duplicate = self.store.hget(path, attr_name)
#json save on redis
if json_duplicate is not None:
list_duplicate = json.loads(json_duplicate.decode('utf8'))
# add new duplicate
list_duplicate.append([hash_type, self.p_path, percent, date])
self.store.hset(path, attr_name, json.dumps(list_duplicate))
else:
# create the new list
list_duplicate = [[hash_type, self.p_path, percent, date]]
self.store.hset(path, attr_name, json.dumps(list_duplicate))
def _get_from_redis(self, r_serv):
ans = {}
for hash_name, the_hash in self.p_hash:
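
The new save_others_pastes_attribute_duplicate makes duplicate links bidirectional: when the current paste records another paste as a duplicate, that paste's own p_duplicate field receives a back-reference. The field is a JSON-encoded list stored in a Redis hash, so the update is a read-modify-write. A standalone sketch of that pattern (connection parameters are illustrative):

import json
import redis

store = redis.StrictRedis(host='localhost', port=6379, db=0)  # illustrative

def append_duplicate(store, path, entry, attr_name='p_duplicate'):
    raw = store.hget(path, attr_name)  # JSON-encoded list, or None
    duplicates = json.loads(raw.decode('utf8')) if raw is not None else []
    duplicates.append(entry)  # e.g. [hash_type, p_path, percent, date]
    store.hset(path, attr_name, json.dumps(duplicates))

Note that this HGET/HSET read-modify-write is not atomic; if two modules updated the same paste concurrently, one write could be lost (a WATCH/MULTI transaction would close that window).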

View file

@@ -54,6 +54,10 @@ criticalNumberToAlert=8
#Will be considered as false positive if less than X matches from the top password list
minTopPassList=5
[Base64]
path = Base64/
max_execution_time = 60
[Modules_Duplicates]
#Number of month to look back
maximum_month_range = 3
@@ -145,6 +149,10 @@ cc_critical = DE
[DomClassifier]
cc = DE
cc_tld = r'\.de$'
dns = 8.8.8.8
[Mail]
dns = 8.8.8.8
# Indexer configuration
[Indexer]

View file

@@ -17,19 +17,18 @@ def is_luhn_valid(card_number):
return (sum(r[0::2]) + sum(sum(divmod(d*2, 10)) for d in r[1::2])) % 10 == 0
def checking_MX_record(r_serv, adress_set):
def checking_MX_record(r_serv, adress_set, addr_dns):
"""Check if emails MX domains are responding.
:param r_serv: -- Redis connexion database
:param adress_set: -- (set) This is a set of email addresses
:param addr_dns: -- (str) The DNS server address to query
:return: (int) Number of addresses with a responding and valid MX domain
This function will split the email addresses and try to resolve their domain
names: for example@gmail.com it will try to resolve gmail.com
"""
print('mails:')
print(adress_set)
#remove duplicate
adress_set = list(set(adress_set))
@@ -40,7 +39,7 @@ def checking_MX_record(r_serv, adress_set):
# Transforming the set into a string
MXdomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", str(adress_set).lower())
resolver = dns.resolver.Resolver()
resolver.nameservers = ['149.13.33.69']
resolver.nameservers = [addr_dns]
resolver.timeout = 5
resolver.lifetime = 2
if MXdomains != []:
@@ -64,21 +63,27 @@ def checking_MX_record(r_serv, adress_set):
except dns.resolver.NoNameservers:
publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
print('NoNameserver, No non-broken nameservers are available to answer the query.')
except dns.resolver.NoAnswer:
publisher.debug('NoAnswer, The response did not contain an answer to the question.')
print('NoAnswer, The response did not contain an answer to the question.')
except dns.name.EmptyLabel:
publisher.debug('SyntaxError: EmptyLabel')
print('SyntaxError: EmptyLabel')
except dns.resolver.NXDOMAIN:
r_serv.setex(MXdomain[1:], 1, timedelta(days=1))
publisher.debug('The query name does not exist.')
print('The query name does not exist.')
except dns.name.LabelTooLong:
publisher.debug('The Label is too long')
print('The Label is too long')
except dns.resolver.Timeout:
print('timeout')
r_serv.setex(MXdomain[1:], 1, timedelta(days=1))
except Exception as e:
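
Replacing the hard-coded nameserver 149.13.33.69 with the configured addr_dns makes the resolver deployment-specific. A minimal dnspython sketch of an MX lookup against an explicit nameserver, mirroring the resolver settings above (domain and server values are illustrative; the dnspython 1.x resolver.query API is assumed, as in the code of this era):

import dns.resolver

def has_mx(domain, addr_dns='8.8.8.8'):
    resolver = dns.resolver.Resolver()
    resolver.nameservers = [addr_dns]  # query this server, not the system default
    resolver.timeout = 5
    resolver.lifetime = 2
    try:
        return len(list(resolver.query(domain, 'MX'))) >= 1
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer,
            dns.resolver.NoNameservers, dns.resolver.Timeout):
        return False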

View file

@@ -81,7 +81,7 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month
to keep the timeline of the curve correct.
"""
threshold = 50
threshold = 30
first_day = date(year, month, 1)
last_day = date(year, month, calendar.monthrange(year, month)[1])
words = []
@@ -135,6 +135,7 @@ def create_curve_from_redis_set(server, csvfilename, set_to_plot, year, month):
redis_set_name = set_to_plot + "_set_" + str(year) + str(month).zfill(2)
words = list(server.smembers(redis_set_name))
words = [x.decode('utf-8') for x in words]
headers = ['Date'] + words
with open(csvfilename+'.csv', 'w') as f:
@@ -153,5 +154,5 @@ def create_curve_from_redis_set(server, csvfilename, set_to_plot, year, month):
row.append(0)
else:
# if the word have a value for the day
row.append(value)
row.append(value.decode('utf8'))
writer.writerow(row)

View file

@@ -62,5 +62,4 @@ r_serv_pasteName = redis.StrictRedis(
max_preview_char = int(cfg.get("Flask", "max_preview_char")) # Maximum number of character to display in the tooltip
max_preview_modal = int(cfg.get("Flask", "max_preview_modal")) # Maximum number of character to display in the modal
tlsh_to_percent = 1000.0 #Use to display the estimated percentage instead of a raw value
DiffMaxLineLength = int(cfg.get("Flask", "DiffMaxLineLength")) # Maximum line length to process when computing a diff between two pastes

View file

@@ -20,7 +20,6 @@ cfg = Flask_config.cfg
r_serv_pasteName = Flask_config.r_serv_pasteName
max_preview_char = Flask_config.max_preview_char
max_preview_modal = Flask_config.max_preview_modal
tlsh_to_percent = Flask_config.tlsh_to_percent
DiffMaxLineLength = Flask_config.DiffMaxLineLength
showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates')
@@ -48,8 +47,10 @@ def showpaste(content_range):
for dup_list in p_duplicate_full_list:
if dup_list[0] == "tlsh":
dup_list[2] = int(((tlsh_to_percent - float(dup_list[2])) / tlsh_to_percent)*100)
dup_list[2] = 100 - int(dup_list[2])
else:
print('dup_list')
print(dup_list)
dup_list[2] = int(dup_list[2])
#p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True)
@@ -64,12 +65,13 @@ def showpaste(content_range):
hash_types = []
comp_vals = []
for i in indices:
hash_types.append(p_duplicate_full_list[i][0].encode('utf8'))
hash_types.append(p_duplicate_full_list[i][0])
comp_vals.append(p_duplicate_full_list[i][2])
dup_list_removed.append(i)
hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)
if len(p_duplicate_full_list[dup_list_index]) > 3:
try:
date_paste = str(int(p_duplicate_full_list[dup_list_index][3]))
@@ -91,7 +93,6 @@ def showpaste(content_range):
if content_range != 0:
p_content = p_content[0:content_range]
return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list, date_list=p_date_list)
# ============ ROUTES ============
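
The display change follows from the Duplicate module change above: previously the stored tlsh value was raw and the template rescaled it with tlsh_to_percent = 1000.0; now the module stores a distance already capped at 100, so the view only needs its complement. A one-line sketch of the new conversion:

def tlsh_similarity(stored_distance):
    # distance is capped at 100 by the Duplicate module, so this stays in 0-100
    return 100 - int(stored_distance)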

View file

@@ -69,13 +69,12 @@
<tbody>
{% for dup_path in duplicate_list %}
<tr>
<td>{{ hashtype_list[i] }}</td>
<td>Similarity: {{ simil_list[i] }}%</td>
<td>{{ date_list[i] }}</td>
<td>{{ hashtype_list[loop.index - 1] }}</td>
<td>Similarity: {{ simil_list[loop.index - 1] }}%</td>
<td>{{ date_list[loop.index - 1] }}</td>
<td><a target="_blank" href="{{ url_for('showsavedpastes.showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></td>
<td><a target="_blank" href="{{ url_for('showsavedpastes.showDiff') }}?s1={{ request.args.get('paste') }}&s2={{ dup_path }}" class="fa fa-columns" title="Show differences"></a></td>
</tr>
{% set i = i + 1 %}
{% endfor %}
</tbody>
</table>
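
The template fix works because Jinja2's {% set %} inside a for loop does not carry across iterations (each pass re-enters the scope), so the manual counter i never advanced and every row showed the same list entry; loop.index is the built-in 1-based iteration counter. A small rendering sketch (values are illustrative):

from jinja2 import Template

tmpl = Template(
    "{% for dup_path in duplicate_list %}"
    "{{ hashtype_list[loop.index - 1] }};"
    "{{ simil_list[loop.index - 1] }}%;"
    "{{ dup_path }}\n"
    "{% endfor %}")

print(tmpl.render(duplicate_list=['paste_a', 'paste_b'],
                  hashtype_list=['ssdeep', 'tlsh'],
                  simil_list=[85, 92]))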