From 353b290899d7a398fb7e3bcb65afc70b22521f6b Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 24 May 2023 10:48:29 +0200 Subject: [PATCH] chg: [Phone module] Filter Invalid Phone numbers + UI Show extracted --- bin/LAUNCH.sh | 4 +- bin/lib/module_extractor.py | 2 + bin/lib/regex_helper.py | 43 ++++++++++++++++++--- bin/modules/Phone.py | 70 ++++++++++++++++++++++------------ bin/modules/abstract_module.py | 11 ++++++ configs/modules.cfg | 7 ++-- 6 files changed, 101 insertions(+), 36 deletions(-) diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index ba75d2fd..547cd76f 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -249,6 +249,8 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "PgpDump" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./PgpDump.py; read x" sleep 0.1 + screen -S "Script_AIL" -X screen -t "Phone" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Phone.py; read x" + sleep 0.1 screen -S "Script_AIL" -X screen -t "Telegram" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Telegram.py; read x" sleep 0.1 screen -S "Script_AIL" -X screen -t "Tools" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Tools.py; read x" @@ -290,8 +292,6 @@ function launching_scripts { ################################## # DISABLED MODULES # ################################## - # screen -S "Script_AIL" -X screen -t "Phone" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Phone.py; read x" - # sleep 0.1 # screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x" # sleep 0.1 # screen -S "Script_AIL" -X screen -t "Release" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Release.py; read x" diff --git a/bin/lib/module_extractor.py b/bin/lib/module_extractor.py index 5a111eb9..99d51a6b 100755 --- a/bin/lib/module_extractor.py +++ b/bin/lib/module_extractor.py @@ -25,6 +25,7 @@ from modules.CreditCards import CreditCards from modules.Iban import Iban from modules.Mail import Mail from modules.Onion import Onion +from 
## Phone Regexes ##
def _regex_phone_iter(r_key, country_code, content):
    """
    Worker: search phone numbers in content and push them to the redis cache.

    Used as the target of the child process started by regex_phone_iter.
    Each match is stored in the r_key list as 'start:end:raw_string'.

    :param r_key: redis cache key receiving the matches
    :param country_code: region given to phonenumbers.PhoneNumberMatcher (ex: 'US')
    :param content: text to search
    """
    for match in phonenumbers.PhoneNumberMatcher(content, country_code):
        # raw_string is the number exactly as written in the content, so the
        # stored value always matches content[start:end]
        r_serv_cache.rpush(r_key, f'{match.start}:{match.end}:{match.raw_string}')
        # refresh the key TTL on every push
        r_serv_cache.expire(r_key, 360)


def regex_phone_iter(r_key, country_code, item_id, content, max_time=30):
    """
    Extract phone numbers from content in a child process (force timeout).

    :param r_key: redis cache key used to collect the worker results
    :param country_code: region given to the phone number matcher
    :param item_id: object id, only used in the timeout log message
    :param content: text to search
    :param max_time: maximum time given to the worker (Process.join timeout)
    :return: list of (start, end, value) tuples, empty list on timeout
    """
    proc = Proc(target=_regex_phone_iter, args=(r_key, country_code, content))
    try:
        proc.start()
        proc.join(max_time)
        if proc.is_alive():
            proc.terminate()
            # Give the worker a moment to exit so it cannot push again after
            # the cleanup below (bounded: never block the caller here)
            proc.join(1)
            # Drop the partial results: the key is supplied by the caller and
            # may be reused for the next object, which would otherwise get
            # the matches of this timed-out object mixed with its own
            r_serv_cache.delete(r_key)
            # Statistics.incr_module_timeout_statistic(r_key)
            err_mess = f"{r_key}: processing timeout: {item_id}"
            logger.info(err_mess)
            return []

        res = r_serv_cache.lrange(r_key, 0, -1)
        r_serv_cache.delete(r_key)
        proc.terminate()
        all_match = []
        for entry in res:
            # the raw value may itself contain ':', only split the two offsets
            start, end, value = entry.split(':', 2)
            all_match.append((int(start), int(end), value))
        return all_match
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        proc.terminate()
        sys.exit(0)
def __init__(self, queue=True):
    """
    Phone module constructor.

    :param queue: forwarded unchanged to the parent class constructor
    """
    super(Phone, self).__init__(queue=queue)

    # Waiting time in seconds between two processed messages
    self.pending_seconds = 1


def extract(self, obj_id, content, tag):
    """
    Locate the phone numbers of an object content.

    :param obj_id: object id
    :param content: object content
    :param tag: tag attached to each extracted value
    :return: list of [start, end, value, 'tag:<tag>'], one entry per match
    """
    return [[start, end, value, f'tag:{tag}']
            for start, end, value in self.regex_phone_iter('US', obj_id, content)]


def compute(self, message):
    """
    Search phone numbers in an item and tag the item if any is found.

    :param message: value used to build the Item (item id)
    """
    item = Item(message)
    item_id = item.get_id()
    content = item.get_content()

    # TODO use language detection to choose the country code ?
    results = self.regex_phone_iter('US', item_id, content)
    for phone in results:
        # phone is a (start, end, value) tuple
        print(phone[2])

    if results:
        # TAGS
        msg = f'infoleak:automatic-detection="phone-number";{item_id}'
        self.add_message_to_queue(msg, 'Tags')

        # Report the number of matches: len() of a single match tuple is
        # always 3, whatever the number of phone numbers found
        self.redis_logger.warning(f'{item_id} contains {len(results)} Phone numbers')
# bin/modules/Phone.py - script entry point
if __name__ == '__main__':
    module = Phone()
    # Start the module endless process. Calling module.compute() on one
    # hard-coded item id here would handle that single item and exit.
    module.run()


# bin/modules/abstract_module.py - AbstractModule method
def regex_phone_iter(self, country_code, obj_id, content):
    """
    phone numbers extraction helper (force timeout)
    :param country_code: region given to the phone number matcher (ex: 'US')
    :param obj_id: object id
    :param content: object content
    :return: result of regex_helper.regex_phone_iter
    """
    return regex_helper.regex_phone_iter(self.r_cache_key, country_code, obj_id, content,
                                         max_time=self.max_execution_time)