From 6bc54baf74a026938316977dedb10a6d249f628d Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 11 Dec 2020 21:02:07 +0100 Subject: [PATCH] chg: [Languages]detect + search domains languages --- bin/LAUNCH.sh | 2 + bin/Languages.py | 33 +++ bin/lib/Domain.py | 124 ++++++++- bin/lib/Language.py | 240 ++++++++++++++++++ bin/packages/Item.py | 69 ++++- bin/packages/modules.cfg | 3 + bin/update-background.py | 24 ++ update/v3.4/Update.py | 37 +++ update/v3.4/Update.sh | 54 ++++ update/v3.4/Update_domain.py | 57 +++++ var/www/blueprints/crawler_splash.py | 37 +++ .../templates/Crawler_dashboard.html | 4 + .../crawler_splash/domain_explorer.html | 2 +- .../crawler/crawler_splash/showDomain.html | 6 + .../domains/block_languages_search.html | 73 ++++++ .../templates/domains/card_img_domain.html | 54 ++-- .../domains/domains_filter_languages.html | 192 ++++++++++++++ 17 files changed, 990 insertions(+), 21 deletions(-) create mode 100755 bin/Languages.py create mode 100755 bin/lib/Language.py create mode 100755 update/v3.4/Update.py create mode 100755 update/v3.4/Update.sh create mode 100755 update/v3.4/Update_domain.py create mode 100644 var/www/templates/domains/block_languages_search.html create mode 100644 var/www/templates/domains/domains_filter_languages.html diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 0c8a6a70..c4e4a538 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -216,6 +216,8 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "Tags" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tags.py; read x" sleep 0.1 + screen -S "Script_AIL" -X screen -t "Languages" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Languages.py; read x" + sleep 0.1 screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SentimentAnalysis.py; read x" sleep 0.1 screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DbCleaner.py; read x" diff --git a/bin/Languages.py b/bin/Languages.py new file mode 100755 index 00000000..f4785250 --- /dev/null +++ b/bin/Languages.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys + +import cld3 + +from packages import Item +from lib import Domain + +from pubsublogger import publisher +from Helper import Process + +if __name__ == '__main__': + publisher.port = 6380 + publisher.channel = 'Script' + # Section name in bin/packages/modules.cfg + config_section = 'Languages' + # Setup the I/O queues + p = Process(config_section) + + while True: + message = p.get_from_set() + if message is None: + publisher.debug("{} queue is empty, waiting".format(config_section)) + time.sleep(1) + continue + + item_id = Item.get_item_id(message) + if Item.is_crawled(item_id): + domain = Item.get_item_domain(item_id) + Domain.add_domain_languages_by_item_id(domain, item_id) diff --git a/bin/lib/Domain.py b/bin/lib/Domain.py index 52b5b7c6..5eca1943 100755 --- a/bin/lib/Domain.py +++ b/bin/lib/Domain.py @@ -9,6 +9,7 @@ The ``Domain`` import os import sys +import itertools import time import redis import random @@ -24,6 +25,7 @@ import Tag sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) import ConfigLoader import Correlate_object +import Language import Screenshot import Username @@ -66,6 +68,15 @@ def sanitize_domain_type(domain_type): else: return 'regular' +def sanitize_domain_types(l_domain_type): + all_domain_types = get_all_domains_type() + if not l_domain_type: + return all_domain_types + for domain_type in l_domain_type: + if domain_type not in all_domain_types: + return 
all_domain_types + return l_domain_type + ######## DOMAINS ######## def get_all_domains_type(): return ['onion', 'regular'] @@ -210,6 +221,15 @@ def get_domains_up_by_filers(domain_type, date_from=None, date_to=None, tags=[], else: return None + + +## TODO: filters: +# - tags +# - languages +# - daterange UP +def get_domains_by_filters(): + pass + def create_domains_metadata_list(list_domains, domain_type): l_domains = [] for domain in list_domains: @@ -218,9 +238,98 @@ def create_domains_metadata_list(list_domains, domain_type): else: dom_type = domain_type l_domains.append(get_domain_metadata(domain, dom_type, first_seen=True, last_ckeck=True, status=True, - ports=True, tags=True, screenshot=True, tags_safe=True)) + ports=True, tags=True, languages=True, screenshot=True, tags_safe=True)) return l_domains + +######## LANGUAGES ######## +def get_all_domains_languages(): + return r_serv_onion.smembers('all_domains_languages') + +def get_domains_by_languages(languages, l_domain_type=[]): + l_domain_type = sanitize_domain_types(l_domain_type) + if not languages: + return [] + elif len(languages) == 1: + return get_all_domains_by_language(languages[0], l_domain_type=l_domain_type) + else: + all_domains_t = [] + for domain_type in l_domain_type: + l_keys_name = [] + for language in languages: + l_keys_name.append('language:domains:{}:{}'.format(domain_type, language)) + res = r_serv_onion.sinter(l_keys_name[0], *l_keys_name[1:]) + if res: + all_domains_t.append(res) + return list(itertools.chain.from_iterable(all_domains_t)) + +def get_all_domains_by_language(language, l_domain_type=[]): + l_domain_type = sanitize_domain_types(l_domain_type) + if len(l_domain_type) == 1: + return r_serv_onion.smembers('language:domains:{}:{}'.format(l_domain_type[0], language)) + else: + l_keys_name = [] + for domain_type in l_domain_type: + l_keys_name.append('language:domains:{}:{}'.format(domain_type, language)) + return r_serv_onion.sunion(l_keys_name[0], *l_keys_name[1:]) + +def get_domain_languages(domain, r_list=False): + res = r_serv_onion.smembers('domain:language:{}'.format(domain)) + if r_list: + return list(res) + else: + return res + +def add_domain_language(domain, language): + language = language.split('-')[0] + domain_type = get_domain_type(domain) + r_serv_onion.sadd('all_domains_languages', language) + r_serv_onion.sadd('all_domains_languages:{}'.format(domain_type), language) + r_serv_onion.sadd('language:domains:{}:{}'.format(domain_type, language), domain) + r_serv_onion.sadd('domain:language:{}'.format(domain), language) + +def add_domain_languages_by_item_id(domain, item_id): + for lang in Item.get_item_languages(item_id, min_proportion=0.2, min_probability=0.8): + add_domain_language(domain, lang.language) + +def delete_domain_languages(domain): + domain_type = get_domain_type(domain) + for language in get_domain_languages(domain): + r_serv_onion.srem('language:domains:{}:{}'.format(domain_type, language), domain) + if not r_serv_onion.exists('language:domains:{}:{}'.format(domain_type, language)): + r_serv_onion.srem('all_domains_languages:{}'.format(domain_type), language) + exist_domain_type_lang = False + for domain_type in get_all_domains_type(): + if r_serv_onion.sismembers('all_domains_languages:{}'.format(domain_type), language): + exist_domain_type_lang = True + continue + if not exist_domain_type_lang: + r_serv_onion.srem('all_domains_languages', language) + r_serv_onion.delete('domain:language:{}'.format(domain)) + +def _delete_all_domains_languages(): + for language in 
get_all_domains_languages(): + for domain in get_all_domains_by_language(language): + delete_domain_languages(domain) + +## API ## +## TODO: verify domains type + languages list +## TODO: add pagination +def api_get_domains_by_languages(domains_types, languages, domains_metadata=False, page=1): + l_domains = sorted(get_domains_by_languages(languages, l_domain_type=domains_types)) + l_domains = paginate_iterator(l_domains, nb_obj=28, page=page) + if not domains_metadata: + return l_domains + else: + l_dict_domains = [] + for domain in l_domains['list_elem']: + l_dict_domains.append(get_domain_metadata(domain, get_domain_type(domain), first_seen=True, last_ckeck=True, + status=True, ports=True, tags=True, tags_safe=True, + languages=True, screenshot=True)) + l_domains['list_elem'] = l_dict_domains + return l_domains +####---- ----#### + ######## DOMAIN ######## def get_domain_type(domain): @@ -498,7 +607,7 @@ def get_domain_random_screenshot(domain): ''' return Screenshot.get_randon_domain_screenshot(domain) -def get_domain_metadata(domain, domain_type, first_seen=True, last_ckeck=True, status=True, ports=True, tags=False, tags_safe=False, screenshot=False): +def get_domain_metadata(domain, domain_type, first_seen=True, last_ckeck=True, status=True, ports=True, tags=False, tags_safe=False, languages=False, screenshot=False): ''' Get Domain basic metadata @@ -516,6 +625,7 @@ def get_domain_metadata(domain, domain_type, first_seen=True, last_ckeck=True, s ''' dict_metadata = {} dict_metadata['id'] = domain + dict_metadata['type'] = domain_type if first_seen: res = get_domain_first_seen(domain, domain_type=domain_type) if res is not None: @@ -535,6 +645,8 @@ def get_domain_metadata(domain, domain_type, first_seen=True, last_ckeck=True, s dict_metadata['is_tags_safe'] = Tag.is_tags_safe(dict_metadata['tags']) else: dict_metadata['is_tags_safe'] = Tag.is_tags_safe(get_domain_tags(domain)) + if languages: + dict_metadata['languages'] = Language.get_languages_from_iso(get_domain_languages(domain, r_list=True), sort=True) if screenshot: dict_metadata['screenshot'] = get_domain_random_screenshot(domain) return dict_metadata @@ -796,6 +908,14 @@ class Domain(object): ''' return get_domain_tags(self.domain) + def get_domain_languages(self): + ''' + Retun all languages of a given domain. + + :param domain: domain name + ''' + return get_domain_languages(self.domain) + def get_domain_correlation(self): ''' Retun all correlation of a given domain. 
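
A minimal sketch of the ARDB/Redis key layout that the new language helpers above (add_domain_language, get_domains_by_languages, get_domain_languages) read and write. The direct redis.Redis() connection below is illustrative only — AIL obtains r_serv_onion through ConfigLoader ("ARDB_Onion"):

    import redis

    # Illustrative connection only; AIL resolves "ARDB_Onion" through ConfigLoader.
    r_serv_onion = redis.Redis(host='localhost', port=6382, db=0, decode_responses=True)

    def add_domain_language(domain, domain_type, language):
        language = language.split('-')[0]  # 'en-Latn' -> 'en', as in the helper above
        r_serv_onion.sadd('all_domains_languages', language)                               # every language ever seen
        r_serv_onion.sadd('all_domains_languages:{}'.format(domain_type), language)        # languages seen per domain type
        r_serv_onion.sadd('language:domains:{}:{}'.format(domain_type, language), domain)  # domains per (type, language)
        r_serv_onion.sadd('domain:language:{}'.format(domain), language)                   # languages of one domain

    def domains_speaking_all(languages, domain_type='onion'):
        # Multi-language search is a plain set intersection, as in get_domains_by_languages()
        keys = ['language:domains:{}:{}'.format(domain_type, lang) for lang in languages]
        return r_serv_onion.sinter(keys[0], *keys[1:])

    add_domain_language('example.onion', 'onion', 'en')
    add_domain_language('example.onion', 'onion', 'ru')
    print(domains_speaking_all(['en', 'ru']))   # {'example.onion'}
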
diff --git a/bin/lib/Language.py b/bin/lib/Language.py new file mode 100755 index 00000000..6b5bd6a0 --- /dev/null +++ b/bin/lib/Language.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import redis + +dict_iso_languages = { + 'af': 'Afrikaans', + 'am': 'Amharic', + 'ar': 'Arabic', + 'bg': 'Bulgarian', + 'bn': 'Bangla', + 'bs': 'Bosnian', + 'ca': 'Catalan', + 'ceb': 'Cebuano', + 'co': 'Corsican', + 'cs': 'Czech', + 'cy': 'Welsh', + 'da': 'Danish', + 'de': 'German', + 'el': 'Greek', + 'en': 'English', + 'eo': 'Esperanto', + 'es': 'Spanish', + 'et': 'Estonian', + 'eu': 'Basque', + 'fa': 'Persian', + 'fi': 'Finnish', + 'fil': 'Filipino', + 'fr': 'French', + 'fy': 'Western Frisian', + 'ga': 'Irish', + 'gd': 'Scottish Gaelic', + 'gl': 'Galician', + 'gu': 'Gujarati', + 'ha': 'Hausa', + 'haw': 'Hawaiian', + 'hi': 'Hindi', + 'hmn': 'Hmong', + 'hr': 'Croatian', + 'ht': 'Haitian Creole', + 'hu': 'Hungarian', + 'hy': 'Armenian', + 'id': 'Indonesian', + 'ig': 'Igbo', + 'is': 'Icelandic', + 'it': 'Italian', + 'iw': 'Hebrew', + 'ja': 'Japanese', + 'jv': 'Javanese', + 'ka': 'Georgian', + 'kk': 'Kazakh', + 'km': 'Khmer', + 'kn': 'Kannada', + 'ko': 'Korean', + 'ku': 'Kurdish', + 'ky': 'Kyrgyz', + 'la': 'Latin', + 'lb': 'Luxembourgish', + 'lo': 'Lao', + 'lt': 'Lithuanian', + 'lv': 'Latvian', + 'mg': 'Malagasy', + 'mi': 'Maori', + 'mk': 'Macedonian', + 'ml': 'Malayalam', + 'mn': 'Mongolian', + 'mr': 'Marathi', + 'ms': 'Malay', + 'mt': 'Maltese', + 'my': 'Burmese', + 'ne': 'Nepali', + 'nl': 'Dutch', + 'no': 'Norwegian', + 'ny': 'Nyanja', + 'pa': 'Punjabi', + 'pl': 'Polish', + 'ps': 'Pashto', + 'pt': 'Portuguese', + 'ro': 'Romanian', + 'ru': 'Russian', + 'sd': 'Sindhi', + 'si': 'Sinhala', + 'sk': 'Slovak', + 'sl': 'Slovenian', + 'sm': 'Samoan', + 'sn': 'Shona', + 'so': 'Somali', + 'sq': 'Albanian', + 'sr': 'Serbian', + 'st': 'Southern Sotho', + 'su': 'Sundanese', + 'sv': 'Swedish', + 'sw': 'Swahili', + 'ta': 'Tamil', + 'te': 'Telugu', + 'tg': 'Tajik', + 'th': 'Thai', + 'tr': 'Turkish', + 'uk': 'Ukrainian', + 'ur': 'Urdu', + 'uz': 'Uzbek', + 'vi': 'Vietnamese', + 'xh': 'Xhosa', + 'yi': 'Yiddish', + 'yo': 'Yoruba', + 'zh': 'Chinese', + 'zu': 'Zulu' +} + +dict_languages_iso = { + 'Afrikaans': 'af', + 'Amharic': 'am', + 'Arabic': 'ar', + 'Bulgarian': 'bg', + 'Bangla': 'bn', + 'Bosnian': 'bs', + 'Catalan': 'ca', + 'Cebuano': 'ceb', + 'Corsican': 'co', + 'Czech': 'cs', + 'Welsh': 'cy', + 'Danish': 'da', + 'German': 'de', + 'Greek': 'el', + 'English': 'en', + 'Esperanto': 'eo', + 'Spanish': 'es', + 'Estonian': 'et', + 'Basque': 'eu', + 'Persian': 'fa', + 'Finnish': 'fi', + 'Filipino': 'fil', + 'French': 'fr', + 'Western Frisian': 'fy', + 'Irish': 'ga', + 'Scottish Gaelic': 'gd', + 'Galician': 'gl', + 'Gujarati': 'gu', + 'Hausa': 'ha', + 'Hawaiian': 'haw', + 'Hindi': 'hi', + 'Hmong': 'hmn', + 'Croatian': 'hr', + 'Haitian Creole': 'ht', + 'Hungarian': 'hu', + 'Armenian': 'hy', + 'Indonesian': 'id', + 'Igbo': 'ig', + 'Icelandic': 'is', + 'Italian': 'it', + 'Hebrew': 'iw', + 'Japanese': 'ja', + 'Javanese': 'jv', + 'Georgian': 'ka', + 'Kazakh': 'kk', + 'Khmer': 'km', + 'Kannada': 'kn', + 'Korean': 'ko', + 'Kurdish': 'ku', + 'Kyrgyz': 'ky', + 'Latin': 'la', + 'Luxembourgish': 'lb', + 'Lao': 'lo', + 'Lithuanian': 'lt', + 'Latvian': 'lv', + 'Malagasy': 'mg', + 'Maori': 'mi', + 'Macedonian': 'mk', + 'Malayalam': 'ml', + 'Mongolian': 'mn', + 'Marathi': 'mr', + 'Malay': 'ms', + 'Maltese': 'mt', + 'Burmese': 'my', + 'Nepali': 'ne', + 'Dutch': 'nl', + 'Norwegian': 'no', + 'Nyanja': 
'ny', + 'Punjabi': 'pa', + 'Polish': 'pl', + 'Pashto': 'ps', + 'Portuguese': 'pt', + 'Romanian': 'ro', + 'Russian': 'ru', + 'Sindhi': 'sd', + 'Sinhala': 'si', + 'Slovak': 'sk', + 'Slovenian': 'sl', + 'Samoan': 'sm', + 'Shona': 'sn', + 'Somali': 'so', + 'Albanian': 'sq', + 'Serbian': 'sr', + 'Southern Sotho': 'st', + 'Sundanese': 'su', + 'Swedish': 'sv', + 'Swahili': 'sw', + 'Tamil': 'ta', + 'Telugu': 'te', + 'Tajik': 'tg', + 'Thai': 'th', + 'Turkish': 'tr', + 'Ukrainian': 'uk', + 'Urdu': 'ur', + 'Uzbek': 'uz', + 'Vietnamese': 'vi', + 'Xhosa': 'xh', + 'Yiddish': 'yi', + 'Yoruba': 'yo', + 'Chinese': 'zh', + 'Zulu': 'zu' +} + +def get_language_from_iso(iso_language): + return dict_iso_languages.get(iso_language, None) + +def get_languages_from_iso(l_iso_languages, sort=False): + l_languages = [] + for iso_language in l_iso_languages: + language = get_language_from_iso(iso_language) + if language: + l_languages.append(language) + if sort: + l_languages = sorted(l_languages) + return l_languages + +def get_iso_from_language(language): + return dict_languages_iso.get(language, None) + +def get_iso_from_languages(l_languages, sort=False): + l_iso = [] + for language in l_languages: + iso_lang = get_iso_from_language(language) + if iso_lang: + l_iso.append(iso_lang) + if sort: + l_iso = sorted(l_iso) + return l_iso diff --git a/bin/packages/Item.py b/bin/packages/Item.py index 15993d7a..e2b08f7d 100755 --- a/bin/packages/Item.py +++ b/bin/packages/Item.py @@ -2,8 +2,10 @@ # -*-coding:UTF-8 -* import os +import re import sys import redis +import cld3 import html2text from io import BytesIO @@ -101,13 +103,62 @@ def add_item_parent(item_parent, item_id): def get_item_content(item_id): return item_basic.get_item_content(item_id) -def get_item_content_html2text(item_id, item_content=None): +def get_item_content_html2text(item_id, item_content=None, ignore_links=False): if not item_content: item_content = get_item_content(item_id) h = html2text.HTML2Text() - h.ignore_links = False + h.ignore_links = ignore_links + h.ignore_images = ignore_links return h.handle(item_content) +def remove_all_urls_from_content(item_id, item_content=None): + if not item_content: + item_content = get_item_content(item_id) + regex = r'\b(?:http://|https://)?(?:[a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*\b' + url_regex = re.compile(regex) + urls = url_regex.findall(item_content) + urls = sorted(urls, key=len, reverse=True) + for url in urls: + item_content = item_content.replace(url, '') + + regex_pgp_public_blocs = r'-----BEGIN PGP PUBLIC KEY BLOCK-----[\s\S]+?-----END PGP PUBLIC KEY BLOCK-----' + regex_pgp_signature = r'-----BEGIN PGP SIGNATURE-----[\s\S]+?-----END PGP SIGNATURE-----' + regex_pgp_message = r'-----BEGIN PGP MESSAGE-----[\s\S]+?-----END PGP MESSAGE-----' + re.compile(regex_pgp_public_blocs) + re.compile(regex_pgp_signature) + re.compile(regex_pgp_message) + + res = re.findall(regex_pgp_public_blocs, item_content) + for it in res: + item_content = item_content.replace(it, '') + res = re.findall(regex_pgp_signature, item_content) + for it in res: + item_content = item_content.replace(it, '') + res = re.findall(regex_pgp_message, item_content) + for it in res: + item_content = item_content.replace(it, '') + + return item_content + +def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7): + all_languages = [] + + ## CLEAN CONTENT ## + content = get_item_content_html2text(item_id, ignore_links=True) + content = 
remove_all_urls_from_content(item_id, item_content=content) + + # REMOVE USELESS SPACE + content = ' '.join(content.split()) + #- CLEAN CONTENT -# + + #print(content) + #print(len(content)) + if len(content) >= min_len: + for lang in cld3.get_frequent_languages(content, num_langs=num_langs): + if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable: + all_languages.append(lang) + return all_languages + # API def get_item(request_dict): if not request_dict: @@ -496,3 +547,17 @@ def delete_domain_node(item_id): domain_basic.delete_domain_item_core(item_id, domain, port) for child_id in get_all_domain_node_by_item_id(item_id): delete_item(child_id) + +# if __name__ == '__main__': +# import Domain +# domain = Domain.Domain('domain.onion') +# for domain_history in domain.get_domain_history(): +# domain_item = domain.get_domain_items_crawled(epoch=domain_history[1]) # item_tag +# if "items" in domain_item: +# for item_dict in domain_item['items']: +# item_id = item_dict['id'] +# print(item_id) +# for lang in get_item_languages(item_id, min_proportion=0.2, min_probability=0.8): +# print(lang) +# print() +# print(get_item_languages(item_id, min_proportion=0.2, min_probability=0.6)) # 0.7 ? diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 233b9066..cf65a126 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -46,6 +46,9 @@ publish = Redis_Tags subscribe = Redis_Global publish = Redis_Tags +[Languages] +subscribe = Redis_Global + [Categ] subscribe = Redis_Global publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey diff --git a/bin/update-background.py b/bin/update-background.py index d1ec6eaf..838ddf7b 100755 --- a/bin/update-background.py +++ b/bin/update-background.py @@ -17,6 +17,25 @@ import subprocess sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) import ConfigLoader +def launch_background_upgrade(version, script_name): + if r_serv.sismember('ail:to_update', version): + r_serv.delete('ail:update_error') + r_serv.set('ail:update_in_progress', version) + r_serv.set('ail:current_background_update', version) + r_serv.set('ail:current_background_script', 'domain tags update') + + update_file = os.path.join(os.environ['AIL_HOME'], 'update', version, script_name) + process = subprocess.run(['python' ,update_file]) + + update_progress = r_serv.get('ail:current_background_script_stat') + if update_progress: + if int(update_progress) == 100: + r_serv.delete('ail:update_in_progress') + r_serv.delete('ail:current_background_script') + r_serv.delete('ail:current_background_script_stat') + r_serv.delete('ail:current_background_update') + r_serv.srem('ail:to_update', new_version) + if __name__ == "__main__": config_loader = ConfigLoader.ConfigLoader() @@ -114,3 +133,8 @@ if __name__ == "__main__": r_serv.delete('ail:current_background_script_stat') r_serv.delete('ail:current_background_update') r_serv.srem('ail:to_update', new_version) + + launch_background_upgrade('v2.6', 'Update_screenshots.py') + launch_background_upgrade('v2.7', 'Update_domain_tags.py') + + launch_background_upgrade('v3.4', 'Update_domain.py') diff --git a/update/v3.4/Update.py b/update/v3.4/Update.py new file mode 100755 index 00000000..37ae0428 --- /dev/null +++ b/update/v3.4/Update.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import re +import sys +import time +import redis +import datetime + 
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) +import ConfigLoader + +new_version = 'v3.4' + +if __name__ == '__main__': + + start_deb = time.time() + + config_loader = ConfigLoader.ConfigLoader() + r_serv = config_loader.get_redis_conn("ARDB_DB") + r_serv_onion = config_loader.get_redis_conn("ARDB_Onion") + config_loader = None + + #Set current update_in_progress + r_serv.set('ail:update_in_progress', new_version) + r_serv.set('ail:current_background_update', new_version) + + r_serv_onion.sunionstore('domain_update_v3.4', 'full_onion_up', 'full_regular_up') + r_serv.set('update:nb_elem_to_convert', r_serv_onion.scard('domain_update_v3.4')) + r_serv.set('update:nb_elem_converted',0) + + #Set current ail version + r_serv.set('ail:version', new_version) + + #Set current ail version + r_serv.hset('ail:update_date', new_version, datetime.datetime.now().strftime("%Y%m%d")) diff --git a/update/v3.4/Update.sh b/update/v3.4/Update.sh new file mode 100755 index 00000000..a1eaeb5a --- /dev/null +++ b/update/v3.4/Update.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +[ -z "$AIL_HOME" ] && echo "Needs the env var AIL_HOME. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_REDIS" ] && echo "Needs the env var AIL_REDIS. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_ARDB" ] && echo "Needs the env var AIL_ARDB. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_BIN" ] && echo "Needs the env var AIL_ARDB. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_FLASK" ] && echo "Needs the env var AIL_FLASK. Run the script from the virtual environment." && exit 1; + +export PATH=$AIL_HOME:$PATH +export PATH=$AIL_REDIS:$PATH +export PATH=$AIL_ARDB:$PATH +export PATH=$AIL_BIN:$PATH +export PATH=$AIL_FLASK:$PATH + +GREEN="\\033[1;32m" +DEFAULT="\\033[0;39m" + +echo -e $GREEN"Shutting down AIL ..."$DEFAULT +bash ${AIL_BIN}/LAUNCH.sh -ks +wait + +bash ${AIL_BIN}/LAUNCH.sh -ldbv & +wait +echo "" + +# SUBMODULES # +git submodule update + +# echo "" +# echo -e $GREEN"installing KVORCKS ..."$DEFAULT +# cd ${AIL_HOME} +# test ! 
-d kvrocks/ && git clone https://github.com/bitleak/kvrocks.git +# pushd kvrocks/ +# make -j4 +# popd + +echo -e $GREEN"Installing html2text ..."$DEFAULT +pip3 install pycld3 + +echo "" +echo -e $GREEN"Updating AIL VERSION ..."$DEFAULT +echo "" +python ${AIL_HOME}/update/v3.4/Update.py +wait +echo "" +echo "" + + +echo "" +echo -e $GREEN"Shutting down ARDB ..."$DEFAULT +bash ${AIL_BIN}/LAUNCH.sh -ks +wait + +exit 0 diff --git a/update/v3.4/Update_domain.py b/update/v3.4/Update_domain.py new file mode 100755 index 00000000..092830c6 --- /dev/null +++ b/update/v3.4/Update_domain.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import re +import sys +import time +import redis +import datetime + +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) +import ConfigLoader +import Domain + +def update_update_stats(): + nb_updated = int(r_serv_db.get('update:nb_elem_converted')) + progress = int((nb_updated * 100) / nb_elem_to_update) + print('{}/{} updated {}%'.format(nb_updated, nb_elem_to_update, progress)) + r_serv_db.set('ail:current_background_script_stat', progress) + +def update_domain_language(domain_obj, item_id): + domain_name = domain_obj.get_domain_name() + Domain.add_domain_languages_by_item_id(domain_name, item_id) + +if __name__ == '__main__': + + start_deb = time.time() + + config_loader = ConfigLoader.ConfigLoader() + r_serv_db = config_loader.get_redis_conn("ARDB_DB") + r_serv_onion = config_loader.get_redis_conn("ARDB_Onion") + config_loader = None + + nb_elem_to_update = r_serv_db.get('update:nb_elem_to_convert') + if not nb_elem_to_update: + nb_elem_to_update = 1 + else: + nb_elem_to_update = int(nb_elem_to_update) + + #Domain._delete_all_domains_languages() + + while True: + domain = r_serv_onion.spop('domain_update_v3.4') + if domain is not None: + print(domain) + domain = Domain.Domain(domain) + for domain_history in domain.get_domain_history(): + domain_item = domain.get_domain_items_crawled(epoch=domain_history[1]) # item_tag + if "items" in domain_item: + for item_dict in domain_item['items']: + update_domain_language(domain, item_dict['id']) + + r_serv_db.incr('update:nb_elem_converted') + update_update_stats() + + else: + sys.exit(0) diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py index b2ef7f63..5da9b633 100644 --- a/var/www/blueprints/crawler_splash.py +++ b/var/www/blueprints/crawler_splash.py @@ -26,6 +26,7 @@ import Tag sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) import Domain import crawlers +import Language r_cache = Flask_config.r_cache r_serv_db = Flask_config.r_serv_db @@ -85,6 +86,9 @@ def send_to_spider(): return create_json_response(res[0], res[1]) return redirect(url_for('crawler_splash.manual')) + +#### Domains #### + # add route : /crawlers/show_domain @crawler_splash.route('/crawlers/showDomain', methods=['GET', 'POST']) @login_required @@ -111,6 +115,7 @@ def showDomain(): dict_domain = {**dict_domain, **domain.get_domain_correlation()} dict_domain['correlation_nb'] = Domain.get_domain_total_nb_correlation(dict_domain) dict_domain['father'] = domain.get_domain_father() + dict_domain['languages'] = Language.get_languages_from_iso(domain.get_domain_languages(), sort=True) dict_domain['tags'] = domain.get_domain_tags() dict_domain['tags_safe'] = Tag.is_tags_safe(dict_domain['tags']) dict_domain['history'] = domain.get_domain_history_with_status() @@ -198,6 +203,38 @@ def domains_explorer_web(): dict_data = Domain.get_domains_up_by_filers('regular', page=page, 
date_from=date_from, date_to=date_to) return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label, domain_type='regular') +@crawler_splash.route('/domains/languages/all/json', methods=['GET']) +@login_required +@login_read_only +def domains_all_languages_json(): + # # TODO: get domain type + iso = request.args.get('iso') + domain_types = request.args.getlist('domain_types') + return jsonify(Language.get_languages_from_iso(Domain.get_all_domains_languages(), sort=True)) + +@crawler_splash.route('/domains/languages/search_get', methods=['GET']) +@login_required +@login_read_only +def domains_search_languages_get(): + page = request.args.get('page') + try: + page = int(page) + except: + page = 1 + domains_types = request.args.getlist('domain_types') + if domains_types: + domains_types = domains_types[0].split(',') + languages = request.args.getlist('languages') + if languages: + languages = languages[0].split(',') + l_dict_domains = Domain.api_get_domains_by_languages(domains_types, Language.get_iso_from_languages(languages), domains_metadata=True, page=page) + return render_template("domains/domains_filter_languages.html", template_folder='../../', + l_dict_domains=l_dict_domains, bootstrap_label=bootstrap_label, + current_languages=languages, domains_types=domains_types) + +##-- --## + + ## Cookiejar ## @crawler_splash.route('/crawler/cookiejar/add', methods=['GET']) @login_required diff --git a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html b/var/www/modules/hiddenServices/templates/Crawler_dashboard.html index 9c0e1933..5cc7f987 100644 --- a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html +++ b/var/www/modules/hiddenServices/templates/Crawler_dashboard.html @@ -148,6 +148,10 @@ {% include 'tags/block_obj_tags_search.html' %} {% endwith %} + {% with object_type='domain' %} + {% include 'domains/block_languages_search.html' %} + {% endwith %} + diff --git a/var/www/templates/crawler/crawler_splash/domain_explorer.html b/var/www/templates/crawler/crawler_splash/domain_explorer.html index 16595014..629cd090 100644 --- a/var/www/templates/crawler/crawler_splash/domain_explorer.html +++ b/var/www/templates/crawler/crawler_splash/domain_explorer.html @@ -68,7 +68,7 @@ - {% with dict_data=dict_data, bootstrap_label=bootstrap_label %} + {% with l_dict_domains=dict_data['list_elem'], bootstrap_label=bootstrap_label %} {% include 'domains/card_img_domain.html' %} {% endwith %} diff --git a/var/www/templates/crawler/crawler_splash/showDomain.html b/var/www/templates/crawler/crawler_splash/showDomain.html index 2fba319f..c63dd5b5 100644 --- a/var/www/templates/crawler/crawler_splash/showDomain.html +++ b/var/www/templates/crawler/crawler_splash/showDomain.html @@ -67,6 +67,7 @@ First Seen Last Check Ports + Languages @@ -74,6 +75,11 @@ {%if "first_seen" in dict_domain%}{{ dict_domain['first_seen'] }}{%endif%} {%if "last_check" in dict_domain%}{{ dict_domain['last_check'] }}{%endif%} {%if dict_domain["ports"]%}{{ dict_domain["ports"] }}{%endif%} + + {% for languages in dict_domain['languages'] %} + {{languages}} + {% endfor %} + diff --git a/var/www/templates/domains/block_languages_search.html b/var/www/templates/domains/block_languages_search.html new file mode 100644 index 00000000..338e91d0 --- /dev/null +++ b/var/www/templates/domains/block_languages_search.html @@ -0,0 +1,73 @@ +
+ [73-line search block: card titled "Domains by Languages :" with a form whose language and
+  domain-type selections are submitted to crawler_splash.domains_search_languages_get;
+  markup and JavaScript omitted]
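
The search block above drives the two Flask routes added later in this patch (var/www/blueprints/crawler_splash.py). A rough usage sketch, assuming a local AIL instance and an already-authenticated session — both routes sit behind login_required, and the base URL below is an assumption:

    import requests

    # Assumed local AIL web UI; reuse an authenticated session cookie obtained
    # from the normal login form, since both routes require login.
    session = requests.Session()
    base_url = 'https://127.0.0.1:7000'

    # Full names of every language attached to at least one domain (sorted)
    r = session.get('{}/domains/languages/all/json'.format(base_url), verify=False)
    print(r.json())   # e.g. ["English", "French", "Russian"]

    # Domains filtered by language; both filters are comma-joined single parameters,
    # matching the split(',') done in domains_search_languages_get()
    params = {'languages': 'English,Russian', 'domain_types': 'onion', 'page': 1}
    r = session.get('{}/domains/languages/search_get'.format(base_url), params=params, verify=False)
    print(r.status_code)   # 200: renders domains/domains_filter_languages.html
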
diff --git a/var/www/templates/domains/card_img_domain.html b/var/www/templates/domains/card_img_domain.html
index eb7e8371..3df796e8 100644
--- a/var/www/templates/domains/card_img_domain.html
+++ b/var/www/templates/domains/card_img_domain.html
[card markup omitted; recovered Jinja changes only]
@@ -1,10 +1,10 @@
-{% for dict_domain in dict_data['list_elem'] %}
+{% for dict_domain in l_dict_domains %}
 {% if loop.index0 % 4 == 0 %}
 {% endif %}
@@ -13,24 +13,46 @@
       {{dict_domain["id"]}}
+      {% if dict_domain["status"] %}
+        UP
+      {% else %}
+        DOWN
+      {% endif %}
+      {{dict_domain["first_seen"]}}
+      {{dict_domain["first_seen"]}}
-      First seen: {{dict_domain["first_seen"]}}
-      Last_seen: {{dict_domain["first_seen"]}}
-      Ports: {{dict_domain["ports"]}}
+      Ports: {{dict_domain["ports"]}}
+      {% if dict_domain['languages'] %}
+        Languages:
+        {% for language in dict_domain['languages'] %}
+          {{ language }}
+        {% endfor %}
+      {% endif %}
-      Status:
-      {% if dict_domain["status"] %}
-        UP
-      {% else %}
-        DOWN
-      {% endif %}
       {% for tag in dict_domain['tags'] %}
       {% endfor %}
@@ -50,6 +72,6 @@
-{% if dict_data['list_elem']|length % 4 != 0 %}
+{% if l_dict_domains|length % 4 != 0 %}
 {% endif %}
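
For reference, each entry of l_dict_domains consumed by this card template comes from get_domain_metadata(..., languages=True, screenshot=True), as built by Domain.api_get_domains_by_languages(..., domains_metadata=True). A hypothetical example of its shape — all values are invented:

    # Hypothetical element of l_dict_domains; all values invented for illustration.
    dict_domain = {
        'id': 'example.onion',
        'type': 'onion',                       # added to get_domain_metadata() by this patch
        'first_seen': '20201211',
        'last_check': '20201211',
        'status': True,                        # crawler currently sees the domain as UP
        'ports': '80',
        'tags': ['infoleak:submission="crawler"'],
        'is_tags_safe': True,
        'languages': ['English', 'Russian'],   # full names via Language.get_languages_from_iso(..., sort=True)
        'screenshot': 'screenshot_reference',  # random screenshot of the domain, if any
    }
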
diff --git a/var/www/templates/domains/domains_filter_languages.html b/var/www/templates/domains/domains_filter_languages.html
new file mode 100644
index 00000000..b45236c6
--- /dev/null
+++ b/var/www/templates/domains/domains_filter_languages.html
@@ -0,0 +1,192 @@
[192-line results page titled "Show Domain - AIL"; head, stylesheet and layout markup omitted — recovered Jinja only]
+ {% include 'nav_bar.html' %}
+ {% include 'crawler/menu_sidebar.html' %}
+ {% include 'domains/block_languages_search.html' %}
+ {% with l_dict_domains=l_dict_domains['list_elem'], bootstrap_label=bootstrap_label %}
+   {% include 'domains/card_img_domain.html' %}
+ {% endwith %}
+ {%if l_dict_domains['list_elem']%}
+   {% with page=l_dict_domains['page'], nb_page_max=l_dict_domains['nb_pages'], nb_first_elem=l_dict_domains['nb_first_elem'], nb_last_elem=l_dict_domains['nb_last_elem'], nb_all_elem=l_dict_domains['nb_all_elem'] %}
+     {% set target_url=url_for('crawler_splash.domains_search_languages_get') + "?languages=" + ','.join(current_languages)%}
+     {%if domains_types %}
+       {% set target_url = target_url + '&domain_types=' + ','.join(domains_types)%}
+     {%endif%}
+     {% include 'pagination.html' %}
+   {% endwith %}
+ {%endif%}
[closing markup and page scripts omitted]
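
Finally, the detection step itself: Languages.py calls Item.get_item_languages(), which converts the crawled HTML to text, strips URLs and PGP blocks, and keeps only confident cld3 predictions. A self-contained sketch of that filtering with the same thresholds as the patch (the URL/PGP stripping of remove_all_urls_from_content() is omitted here; the sample text is made up):

    import cld3        # pycld3, installed by update/v3.4/Update.sh
    import html2text

    def detect_languages(raw_html, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
        # HTML -> plain text, dropping links and images (get_item_content_html2text(ignore_links=True))
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        content = h.handle(raw_html)
        content = ' '.join(content.split())      # collapse whitespace, as in get_item_languages()
        if len(content) < min_len:               # too little text to classify reliably
            return []
        languages = []
        for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
            # keep a prediction only if it covers enough of the text and cld3 trusts it
            if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
                languages.append(lang.language)  # ISO-style code, e.g. 'en' or 'fr'
        return languages

    sample = '<html><body><p>' + 'Bonjour, ceci est un long texte de test écrit en français. ' * 20 + '</p></body></html>'
    print(detect_languages(sample))   # ['fr'] with the thresholds above
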