From 204e996fc3e24187154a44e5ca0a29487e5324b7 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 12 Sep 2018 11:21:11 +0200 Subject: [PATCH] chg: [statistics] clean scripts --- .gitignore | 1 + bin/BankAccount.py | 2 +- bin/DomClassifier.py | 42 +++-------- bin/DomainSubject.py | 70 +++++++++++++++++++ bin/Phone.py | 23 +----- .../create_graph_by_tld.py} | 25 ++++--- 6 files changed, 98 insertions(+), 65 deletions(-) create mode 100755 bin/DomainSubject.py rename doc/{api/create_lu_graph.py => statistics/create_graph_by_tld.py} (87%) diff --git a/.gitignore b/.gitignore index 11c01083..c4bd48c9 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,4 @@ doc/all_modules.txt # auto generated doc/module-data-flow.png doc/data-flow.png +doc/statistics diff --git a/bin/BankAccount.py b/bin/BankAccount.py index 42baf535..06e86d06 100755 --- a/bin/BankAccount.py +++ b/bin/BankAccount.py @@ -62,7 +62,7 @@ def check_all_iban(l_iban, paste, filename): if is_valid_iban(iban): print('------') nb_valid_iban = nb_valid_iban + 1 - server_statistics.hincrby('iban_by_tld:'+date, iban[0:2], 1) + server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1) if(nb_valid_iban > 0): to_print = 'Iban;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name) diff --git a/bin/DomClassifier.py b/bin/DomClassifier.py index 268607e8..aed87a55 100755 --- a/bin/DomClassifier.py +++ b/bin/DomClassifier.py @@ -10,8 +10,6 @@ the out output of the Global module. """ import time -import datetime -import redis from packages import Paste from pubsublogger import publisher @@ -28,13 +26,6 @@ def main(): p = Process(config_section) addr_dns = p.config.get("DomClassifier", "dns") - # ARDB # - server_statistics = redis.StrictRedis( - host=p.config.get("ARDB_Statistics", "host"), - port=p.config.getint("ARDB_Statistics", "port"), - db=p.config.getint("ARDB_Statistics", "db"), - decode_responses=True) - publisher.info("""ZMQ DomainClassifier is Running""") c = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns]) @@ -55,31 +46,20 @@ def main(): paste = PST.get_p_content() mimetype = PST._get_p_encoding() - nb_domain = 0 - nb_tld_domain = 0 - if mimetype == "text/plain": c.text(rawtext=paste) c.potentialdomain() - valid = c.validdomain(rtype=['A'], extended=True) - nb_domain = len(set(valid)) - if nb_domain > 0: - localizeddomains = c.include(expression=cc_tld) - if localizeddomains: - nb_tld_domain = len(set(localizeddomains)) - publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format( - PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_path)) - - localizeddomains = c.localizedomain(cc=cc) - if localizeddomains: - nb_tld_domain = nb_tld_domain + len(set(localizeddomains)) - publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format( - PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_path)) - - date = datetime.datetime.now().strftime("%Y%m") - server_statistics.hincrby('domain_by_tld:'+date, 'ALL', nb_domain) - if nb_tld_domain > 0: - server_statistics.hincrby('domain_by_tld:'+date, cc, nb_tld_domain) + c.validdomain(rtype=['A'], extended=True) + localizeddomains = c.include(expression=cc_tld) + if localizeddomains: + print(localizeddomains) + publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format( + PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_path)) + localizeddomains = c.localizedomain(cc=cc) + if localizeddomains: + print(localizeddomains) + publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format( + PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_path)) except IOError: print("CRC Checksum Failed on :", PST.p_path) publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format( diff --git a/bin/DomainSubject.py b/bin/DomainSubject.py new file mode 100755 index 00000000..6db110da --- /dev/null +++ b/bin/DomainSubject.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + + +from packages import Paste +from Helper import Process +from pubsublogger import publisher + +import time +import redis +import newspaper + +from collections import defaultdict + +from newspaper import fulltext + +if __name__ == '__main__': + + publisher.port = 6380 + publisher.channel = "Script" + + publisher.info("Script DomainSubject started") + + config_section = 'DomainSubject' + p = Process(config_section) + + r_onion = redis.StrictRedis( + host=p.config.get("ARDB_Onion", "host"), + port=p.config.getint("ARDB_Onion", "port"), + db=p.config.getint("ARDB_Onion", "db"), + decode_responses=True) + + + while True: + + # format: + domain = p.get_from_set() + domain = 'easycoinsayj7p5l.onion' + + if domain is not None: + + #retrieve all crawled pastes + set_crawles_pastes = r_onion.smembers('temp:crawled_domain_pastes:{}'.format(domain)) + if set_crawles_pastes: + dict_keyword = defaultdict(int) + + for paste_path in set_crawles_pastes: + + paste = Paste.Paste(paste_path) + content = paste.get_p_content() + + article = newspaper.Article(url='') + article.set_html(content) + article.parse() + article.nlp() + + for keyword in article.keywords: + dict_keyword[keyword] += 1 + + + if dict_keyword: + res = [(k, dict_keyword[k]) for k in sorted(dict_keyword, key=dict_keyword.get, reverse=True)] + for item in res: + print(item) + else: + print('no keywords found') + time.sleep(60) + + else: + time.sleep(5) diff --git a/bin/Phone.py b/bin/Phone.py index 3d579f4e..213db2b3 100755 --- a/bin/Phone.py +++ b/bin/Phone.py @@ -11,9 +11,7 @@ It apply phone number regexes on paste content and warn if above a threshold. """ -import datetime import time -import redis import re import phonenumbers from packages import Paste @@ -25,10 +23,8 @@ def search_phone(message): paste = Paste.Paste(message) content = paste.get_p_content() # regex to find phone numbers, may raise many false positives (shalt thou seek optimization, upgrading is required) - #reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})') - #reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\(?\d{2,4}\)?){3,4})') - # use non capturing group - reg_phone = re.compile(r'(?:\+\d{1,4}(?:\(\d\))?\d?|0\d?)(?:\d{6,8}|(?:[-/\. ]{1}\(?\d{2,4}\)?){3,4})') + reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})') + reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\(?\d{2,4}\)?){3,4})') # list of the regex results in the Paste, may be null results = reg_phone.findall(content) @@ -49,23 +45,17 @@ def search_phone(message): for phone_number in results: try: x = phonenumbers.parse(phone_number, None) - print(x) country_code = x.country_code if stats.get(country_code) is None: stats[country_code] = 1 else: stats[country_code] = stats[country_code] + 1 - except Exception as e: - #print(e) + except: pass - - date = datetime.datetime.now().strftime("%Y%m") for country_code in stats: - print(country_code) if stats[country_code] > 4: publisher.warning('{} contains Phone numbers with country code {}'.format(paste.p_name, country_code)) - if __name__ == '__main__': # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) # Port of the redis instance used by pubsublogger @@ -82,13 +72,6 @@ if __name__ == '__main__': # Sent to the logging a description of the module publisher.info("Run Phone module") - # ARDB # - server_statistics = redis.StrictRedis( - host=p.config.get("ARDB_Statistics", "host"), - port=p.config.getint("ARDB_Statistics", "port"), - db=p.config.getint("ARDB_Statistics", "db"), - decode_responses=True) - # Endless loop getting messages from the input queue while True: # Get one message from the input queue diff --git a/doc/api/create_lu_graph.py b/doc/statistics/create_graph_by_tld.py similarity index 87% rename from doc/api/create_lu_graph.py rename to doc/statistics/create_graph_by_tld.py index c2a66769..e1fbbe97 100755 --- a/doc/api/create_lu_graph.py +++ b/doc/statistics/create_graph_by_tld.py @@ -2,7 +2,9 @@ # -*-coding:UTF-8 -* ''' -lu +Create statistics pie charts by tld + +Default tld: lu ''' import os @@ -64,7 +66,7 @@ def create_pie_chart(country ,db_key, date, pie_title, path, save_name): ax1.set_title(pie_title) #plt.show() - plt.savefig(os.path.join(path, save_name)) + plt.savefig(os.path.join(path,save_name)) plt.close(fig1) def create_donut_chart(db_key, date, pie_title, path, save_name): @@ -126,7 +128,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser( description='''This script is a part of the Analysis Information Leak - framework. It create pie charts on a country statistics".''', + framework. Create statistics pie charts".''', epilog='Example: ./create_lu_graph.py 0 lu now, create_lu_graph.py 0 lu 201807') parser.add_argument('type', type=int, default=0, @@ -135,12 +137,11 @@ if __name__ == '__main__': 1: credential_pie, 2: mail_pie 3: sqlinjection_pie, - 4: domain_pie, - 5: iban_pie,''', - choices=[0, 1, 2, 3, 4, 5], action='store') + 4: iban_pie,''', + choices=[0, 1, 2, 3, 4], action='store') - parser.add_argument('country', type=str, default="de", - help='''The country code, de:default''', + parser.add_argument('country', type=str, default="lu", + help='''The country code, lu:default''', action='store') parser.add_argument('date', type=str, default="now", @@ -148,7 +149,7 @@ if __name__ == '__main__': args = parser.parse_args() - path = os.path.join(os.environ['AIL_HOME'], 'doc') # path to module config file + path = os.path.join(os.environ['AIL_HOME'], 'doc', 'statistics') # save path config_section = 'ARDB_Statistics' @@ -171,7 +172,7 @@ if __name__ == '__main__': create_pie_chart(args.country, 'mail_by_tld:', date, "AIL: mail leak by tld", path, 'AIL_mail_by_tld.png') create_pie_chart(args.country, 'SQLInjection_by_tld:', date, "AIL: SQLInjection by tld", path, 'AIL_SQLInjection_by_tld.png') create_pie_chart(args.country.upper(), 'domain_by_tld:', date, "AIL: Domain by tld", path, 'AIL_domain_by_tld.png') - create_pie_chart(args.country.upper(), 'iban_by_tld:', date, "AIL: Iban by tld", path, 'AIL_iban_by_tld.png') + create_pie_chart(args.country.upper(), 'iban_by_country:', date, "AIL: Iban by country", path, 'AIL_iban_by_country.png') elif args.type == 1: create_pie_chart(args.country, 'credential_by_tld:', date, "AIL: Credential leak by tld", path, 'AIL_credential_by_tld.png') elif args.type == 2: @@ -179,6 +180,4 @@ if __name__ == '__main__': elif args.type == 3: create_pie_chart(args.country, 'SQLInjection_by_tld:', date, "AIL: sqlInjection by tld", path, 'AIL_sqlInjectionl_by_tld.png') elif args.type == 4: - create_pie_chart(args.country.upper(), 'domain_by_tld:', date, "AIL: Domain by tld", path, 'AIL_domain_by_tld.png') - elif args.type == 5: - create_pie_chart(args.country.upper(), 'iban_by_tld:', date, "AIL: Iban by tld", path, 'AIL_iban_by_tld.png') + create_pie_chart(args.country.upper(), 'iban_by_country:', date, "AIL: Iban by country", path, 'AIL_iban_by_country.png')