mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 00:28:22 +00:00
chg: [statistics] clean scripts
This commit is contained in:
parent
ea3d2c1977
commit
204e996fc3
6 changed files with 98 additions and 65 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -40,3 +40,4 @@ doc/all_modules.txt
|
|||
# auto generated
|
||||
doc/module-data-flow.png
|
||||
doc/data-flow.png
|
||||
doc/statistics
|
||||
|
|
|
@ -62,7 +62,7 @@ def check_all_iban(l_iban, paste, filename):
|
|||
if is_valid_iban(iban):
|
||||
print('------')
|
||||
nb_valid_iban = nb_valid_iban + 1
|
||||
server_statistics.hincrby('iban_by_tld:'+date, iban[0:2], 1)
|
||||
server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1)
|
||||
|
||||
if(nb_valid_iban > 0):
|
||||
to_print = 'Iban;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name)
|
||||
|
|
|
@ -10,8 +10,6 @@ the out output of the Global module.
|
|||
|
||||
"""
|
||||
import time
|
||||
import datetime
|
||||
import redis
|
||||
from packages import Paste
|
||||
from pubsublogger import publisher
|
||||
|
||||
|
@ -28,13 +26,6 @@ def main():
|
|||
p = Process(config_section)
|
||||
addr_dns = p.config.get("DomClassifier", "dns")
|
||||
|
||||
# ARDB #
|
||||
server_statistics = redis.StrictRedis(
|
||||
host=p.config.get("ARDB_Statistics", "host"),
|
||||
port=p.config.getint("ARDB_Statistics", "port"),
|
||||
db=p.config.getint("ARDB_Statistics", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
publisher.info("""ZMQ DomainClassifier is Running""")
|
||||
|
||||
c = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns])
|
||||
|
@ -55,31 +46,20 @@ def main():
|
|||
paste = PST.get_p_content()
|
||||
mimetype = PST._get_p_encoding()
|
||||
|
||||
nb_domain = 0
|
||||
nb_tld_domain = 0
|
||||
|
||||
if mimetype == "text/plain":
|
||||
c.text(rawtext=paste)
|
||||
c.potentialdomain()
|
||||
valid = c.validdomain(rtype=['A'], extended=True)
|
||||
nb_domain = len(set(valid))
|
||||
if nb_domain > 0:
|
||||
localizeddomains = c.include(expression=cc_tld)
|
||||
if localizeddomains:
|
||||
nb_tld_domain = len(set(localizeddomains))
|
||||
publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
|
||||
PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_path))
|
||||
|
||||
localizeddomains = c.localizedomain(cc=cc)
|
||||
if localizeddomains:
|
||||
nb_tld_domain = nb_tld_domain + len(set(localizeddomains))
|
||||
publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
|
||||
PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_path))
|
||||
|
||||
date = datetime.datetime.now().strftime("%Y%m")
|
||||
server_statistics.hincrby('domain_by_tld:'+date, 'ALL', nb_domain)
|
||||
if nb_tld_domain > 0:
|
||||
server_statistics.hincrby('domain_by_tld:'+date, cc, nb_tld_domain)
|
||||
c.validdomain(rtype=['A'], extended=True)
|
||||
localizeddomains = c.include(expression=cc_tld)
|
||||
if localizeddomains:
|
||||
print(localizeddomains)
|
||||
publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
|
||||
PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_path))
|
||||
localizeddomains = c.localizedomain(cc=cc)
|
||||
if localizeddomains:
|
||||
print(localizeddomains)
|
||||
publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
|
||||
PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_path))
|
||||
except IOError:
|
||||
print("CRC Checksum Failed on :", PST.p_path)
|
||||
publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
|
||||
|
|
70
bin/DomainSubject.py
Executable file
70
bin/DomainSubject.py
Executable file
|
@ -0,0 +1,70 @@
|
|||
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
DomainSubject module.

Consumes crawled onion domains from the module queue, gathers every paste
crawled for that domain, runs newspaper's NLP keyword extraction over each
paste's HTML content, and prints the aggregated keywords ordered by frequency.
"""

from packages import Paste
from Helper import Process
from pubsublogger import publisher

import time
import redis
import newspaper

from collections import defaultdict

from newspaper import fulltext

if __name__ == '__main__':

    # Port/channel of the redis instance used by pubsublogger.
    publisher.port = 6380
    publisher.channel = "Script"

    publisher.info("Script DomainSubject started")

    config_section = 'DomainSubject'
    p = Process(config_section)

    # ARDB Onion DB: holds the set of paste paths crawled per domain.
    r_onion = redis.StrictRedis(
        host=p.config.get("ARDB_Onion", "host"),
        port=p.config.getint("ARDB_Onion", "port"),
        db=p.config.getint("ARDB_Onion", "db"),
        decode_responses=True)

    while True:

        # message format: <domain>
        domain = p.get_from_set()
        # NOTE: removed leftover debug override that replaced every queued
        # message with the hard-coded domain 'easycoinsayj7p5l.onion',
        # which made the queue (and the None guard below) ineffective.

        if domain is not None:

            # Retrieve all pastes crawled for this domain.
            set_crawles_pastes = r_onion.smembers('temp:crawled_domain_pastes:{}'.format(domain))
            if set_crawles_pastes:
                # keyword -> number of pastes mentioning it
                dict_keyword = defaultdict(int)

                for paste_path in set_crawles_pastes:

                    paste = Paste.Paste(paste_path)
                    content = paste.get_p_content()

                    # Parse the raw crawled HTML and extract keywords via NLP.
                    article = newspaper.Article(url='')
                    article.set_html(content)
                    article.parse()
                    article.nlp()

                    for keyword in article.keywords:
                        dict_keyword[keyword] += 1

                if dict_keyword:
                    # Sort keywords by descending occurrence count.
                    res = [(k, dict_keyword[k]) for k in sorted(dict_keyword, key=dict_keyword.get, reverse=True)]
                    for item in res:
                        print(item)
                else:
                    print('no keywords found')

        else:
            # Queue empty: back off before polling again.
            time.sleep(5)
|
23
bin/Phone.py
23
bin/Phone.py
|
@ -11,9 +11,7 @@ It apply phone number regexes on paste content and warn if above a threshold.
|
|||
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import time
|
||||
import redis
|
||||
import re
|
||||
import phonenumbers
|
||||
from packages import Paste
|
||||
|
@ -25,10 +23,8 @@ def search_phone(message):
|
|||
paste = Paste.Paste(message)
|
||||
content = paste.get_p_content()
|
||||
# regex to find phone numbers, may raise many false positives (shalt thou seek optimization, upgrading is required)
|
||||
#reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})')
|
||||
#reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\(?\d{2,4}\)?){3,4})')
|
||||
# use non capturing group
|
||||
reg_phone = re.compile(r'(?:\+\d{1,4}(?:\(\d\))?\d?|0\d?)(?:\d{6,8}|(?:[-/\. ]{1}\(?\d{2,4}\)?){3,4})')
|
||||
reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})')
|
||||
reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\(?\d{2,4}\)?){3,4})')
|
||||
# list of the regex results in the Paste, may be null
|
||||
results = reg_phone.findall(content)
|
||||
|
||||
|
@ -49,23 +45,17 @@ def search_phone(message):
|
|||
for phone_number in results:
|
||||
try:
|
||||
x = phonenumbers.parse(phone_number, None)
|
||||
print(x)
|
||||
country_code = x.country_code
|
||||
if stats.get(country_code) is None:
|
||||
stats[country_code] = 1
|
||||
else:
|
||||
stats[country_code] = stats[country_code] + 1
|
||||
except Exception as e:
|
||||
#print(e)
|
||||
except:
|
||||
pass
|
||||
|
||||
date = datetime.datetime.now().strftime("%Y%m")
|
||||
for country_code in stats:
|
||||
print(country_code)
|
||||
if stats[country_code] > 4:
|
||||
publisher.warning('{} contains Phone numbers with country code {}'.format(paste.p_name, country_code))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
|
||||
# Port of the redis instance used by pubsublogger
|
||||
|
@ -82,13 +72,6 @@ if __name__ == '__main__':
|
|||
# Sent to the logging a description of the module
|
||||
publisher.info("Run Phone module")
|
||||
|
||||
# ARDB #
|
||||
server_statistics = redis.StrictRedis(
|
||||
host=p.config.get("ARDB_Statistics", "host"),
|
||||
port=p.config.getint("ARDB_Statistics", "port"),
|
||||
db=p.config.getint("ARDB_Statistics", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
# Endless loop getting messages from the input queue
|
||||
while True:
|
||||
# Get one message from the input queue
|
||||
|
|
|
@ -2,7 +2,9 @@
|
|||
# -*-coding:UTF-8 -*
|
||||
|
||||
'''
|
||||
lu
|
||||
Create statistics pie charts by tld
|
||||
|
||||
Default tld: lu
|
||||
'''
|
||||
|
||||
import os
|
||||
|
@ -64,7 +66,7 @@ def create_pie_chart(country ,db_key, date, pie_title, path, save_name):
|
|||
|
||||
ax1.set_title(pie_title)
|
||||
#plt.show()
|
||||
plt.savefig(os.path.join(path, save_name))
|
||||
plt.savefig(os.path.join(path,save_name))
|
||||
plt.close(fig1)
|
||||
|
||||
def create_donut_chart(db_key, date, pie_title, path, save_name):
|
||||
|
@ -126,7 +128,7 @@ if __name__ == '__main__':
|
|||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='''This script is a part of the Analysis Information Leak
|
||||
framework. It create pie charts on a country statistics".''',
|
||||
framework. Create statistics pie charts".''',
|
||||
epilog='Example: ./create_lu_graph.py 0 lu now, create_lu_graph.py 0 lu 201807')
|
||||
|
||||
parser.add_argument('type', type=int, default=0,
|
||||
|
@ -135,12 +137,11 @@ if __name__ == '__main__':
|
|||
1: credential_pie,
|
||||
2: mail_pie
|
||||
3: sqlinjection_pie,
|
||||
4: domain_pie,
|
||||
5: iban_pie,''',
|
||||
choices=[0, 1, 2, 3, 4, 5], action='store')
|
||||
4: iban_pie,''',
|
||||
choices=[0, 1, 2, 3, 4], action='store')
|
||||
|
||||
parser.add_argument('country', type=str, default="de",
|
||||
help='''The country code, de:default''',
|
||||
parser.add_argument('country', type=str, default="lu",
|
||||
help='''The country code, lu:default''',
|
||||
action='store')
|
||||
|
||||
parser.add_argument('date', type=str, default="now",
|
||||
|
@ -148,7 +149,7 @@ if __name__ == '__main__':
|
|||
|
||||
args = parser.parse_args()
|
||||
|
||||
path = os.path.join(os.environ['AIL_HOME'], 'doc') # path to module config file
|
||||
path = os.path.join(os.environ['AIL_HOME'], 'doc', 'statistics') # save path
|
||||
|
||||
config_section = 'ARDB_Statistics'
|
||||
|
||||
|
@ -171,7 +172,7 @@ if __name__ == '__main__':
|
|||
create_pie_chart(args.country, 'mail_by_tld:', date, "AIL: mail leak by tld", path, 'AIL_mail_by_tld.png')
|
||||
create_pie_chart(args.country, 'SQLInjection_by_tld:', date, "AIL: SQLInjection by tld", path, 'AIL_SQLInjection_by_tld.png')
|
||||
create_pie_chart(args.country.upper(), 'domain_by_tld:', date, "AIL: Domain by tld", path, 'AIL_domain_by_tld.png')
|
||||
create_pie_chart(args.country.upper(), 'iban_by_tld:', date, "AIL: Iban by tld", path, 'AIL_iban_by_tld.png')
|
||||
create_pie_chart(args.country.upper(), 'iban_by_country:', date, "AIL: Iban by country", path, 'AIL_iban_by_country.png')
|
||||
elif args.type == 1:
|
||||
create_pie_chart(args.country, 'credential_by_tld:', date, "AIL: Credential leak by tld", path, 'AIL_credential_by_tld.png')
|
||||
elif args.type == 2:
|
||||
|
@ -179,6 +180,4 @@ if __name__ == '__main__':
|
|||
elif args.type == 3:
|
||||
create_pie_chart(args.country, 'SQLInjection_by_tld:', date, "AIL: sqlInjection by tld", path, 'AIL_sqlInjectionl_by_tld.png')
|
||||
elif args.type == 4:
|
||||
create_pie_chart(args.country.upper(), 'domain_by_tld:', date, "AIL: Domain by tld", path, 'AIL_domain_by_tld.png')
|
||||
elif args.type == 5:
|
||||
create_pie_chart(args.country.upper(), 'iban_by_tld:', date, "AIL: Iban by tld", path, 'AIL_iban_by_tld.png')
|
||||
create_pie_chart(args.country.upper(), 'iban_by_country:', date, "AIL: Iban by country", path, 'AIL_iban_by_country.png')
|
Loading…
Reference in a new issue