diff --git a/.gitignore b/.gitignore
index b5755ee6..6973080f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,9 +11,10 @@ ardb
 faup
 tlsh
 Blooms
-LEVEL_DB_DATA
 PASTES
+CRAWLED_SCREENSHOT
 BASE64
+HASHS
 DATA_ARDB
 indexdir/
 logs/
diff --git a/bin/Bitcoin.py b/bin/Bitcoin.py
index 5ec2199f..1b7694b7 100755
--- a/bin/Bitcoin.py
+++ b/bin/Bitcoin.py
@@ -32,6 +32,7 @@ def decode_base58(bc, length):
     for char in bc:
         n = n * 58 + digits58.index(char)
     return n.to_bytes(length, 'big')
+
 def check_bc(bc):
     try:
         bcbytes = decode_base58(bc, 25)
diff --git a/bin/Crawler.py b/bin/Crawler.py
index ab74c64b..3660aa41 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -57,6 +57,12 @@ def crawl_onion(url, domain, date, date_month):
 
 if __name__ == '__main__':
 
+    if len(sys.argv) != 2:
+        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)')
+        exit(1)
+
+    type_hidden_service = sys.argv[1]
+
     publisher.port = 6380
     publisher.channel = "Script"
 
@@ -72,7 +78,6 @@ if __name__ == '__main__':
     url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
     re.compile(url_i2p)
 
-    type_hidden_service = 'onion'
     if type_hidden_service == 'onion':
         regex_hidden_service = url_onion
         splash_url = p.config.get("Crawler", "splash_url_onion")
@@ -89,8 +94,12 @@ if __name__ == '__main__':
         print('incorrect crawler type: {}'.format(type_hidden_service))
         exit(0)
 
+    print(type_hidden_service)
+
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
 
+    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
+
     #signal.signal(signal.SIGINT, signal_handler)
 
     r_serv_metadata = redis.StrictRedis(
@@ -113,8 +122,10 @@ if __name__ == '__main__':
 
     while True:
 
-        # Recovering the streamed message informations.
+        # Recovering the streamed message informations. http://eepsites.i2p
         message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
+        #message = 'http://i2pwiki.i2p;test'
+        #message = 'http://i2host.i2p;test' # # FIXME: remove
 
         if message is None:
@@ -122,13 +133,19 @@ if __name__ == '__main__':
             message = r_onion.spop('mess_onion')
 
         if message is not None:
+            print(message)
 
             splitted = message.split(';')
            if len(splitted) == 2:
                 url, paste = splitted
+                paste = paste.replace(PASTES_FOLDER+'/', '')
+                print(paste)
+                '''
                 if not '.onion' in url:
                     print('not onion')
                     continue
+                '''
+
 
                 url_list = re.findall(regex_hidden_service, url)[0]
                 if url_list[1] == '':
@@ -137,7 +154,7 @@ if __name__ == '__main__':
                 link, s, credential, subdomain, domain, host, port, \
                     resource_path, query_string, f1, f2, f3, f4 = url_list
                 domain = url_list[4]
-                r_onion.srem('onion_domain_crawler_queue', domain)
+                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
 
                 domain_url = 'http://{}'.format(domain)
@@ -157,6 +174,8 @@ if __name__ == '__main__':
                     crawl_onion(url, domain, date, date_month)
                     if url != domain_url:
+                        print(url)
+                        print(domain_url)
                         crawl_onion(domain_url, domain, date, date_month)
 
                     # save down onion
@@ -173,6 +192,17 @@ if __name__ == '__main__':
 
                     # last check
                     r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
 
+                    # last_father
+                    r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
+
+                    # add onion screenshot history
+                    # add crawled days
+                    if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
+                        r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
+                    # add crawled history by date
+                    r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
+
+
                     # check external onions links (full_scrawl)
                     external_domains = set()
                     for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
@@ -194,6 +224,12 @@ if __name__ == '__main__':
                     r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                     r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
+                    #send all crawled domain past
+                    msg = domain
+                    p.populate_set_out(msg, 'DomainSubject')
+
+                    #time.sleep(30)
+
             else:
                 continue
         else:
diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
index 5143553b..ca07bfd2 100755
--- a/bin/packages/HiddenServices.py
+++ b/bin/packages/HiddenServices.py
@@ -61,6 +61,7 @@ class HiddenServices(object):
 
         self.domain = domain
         self.type = type
+        self.tags = {}
 
         if type == 'onion':
             self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
@@ -74,6 +75,20 @@ class HiddenServices(object):
             ## TODO: # FIXME: add error
             pass
 
+    def get_origin_paste_name(self):
+        origin_paste = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
+        if origin_paste is None:
+            return ''
+        return origin_paste.replace(self.paste_directory+'/', '')
+
+    def get_domain_tags(self):
+        return self.tags
+
+    def update_domain_tags(self, children):
+        p_tags = self.r_serv_metadata.smembers('tag:'+children)
+        for tag in p_tags:
+            self.tags[tag] = self.tags.get(tag, 0) + 1
+
     #todo use the right paste
     def get_last_crawled_pastes(self):
         paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
@@ -81,8 +96,10 @@ class HiddenServices(object):
         return self.get_all_pastes_domain(paste_parent)
 
     def get_all_pastes_domain(self, father):
+        if father is None:
+            return []
         l_crawled_pastes = []
-        paste_parent = father.replace(self.paste_directory, '')[1:]
+        paste_parent = father.replace(self.paste_directory+'/', '')
         paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent))
         ## TODO: # FIXME: remove me
         paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
@@ -90,6 +107,7 @@
         for children in paste_childrens:
             if self.domain in children:
                 l_crawled_pastes.append(children)
+                self.update_domain_tags(children)
                 l_crawled_pastes.extend(self.get_all_pastes_domain(children))
 
         return l_crawled_pastes
@@ -97,7 +115,7 @@
         l_screenshot_paste = []
         for paste in l_crawled_pastes:
             ## FIXME: # TODO: remove me
-            paste= paste.replace(self.paste_directory, '')[1:]
+            paste= paste.replace(self.paste_directory+'/', '')
             paste = paste.replace(self.paste_crawled_directory_name, '')
 
             if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ):
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 135ad0a7..ffbc5da9 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -96,6 +96,7 @@ class TorSplashCrawler():
             yield SplashRequest(
                 self.start_urls,
                 self.parse,
+                errback=self.errback_catcher,
                 endpoint='render.json',
                 meta={'father': self.original_paste},
                 args={ 'html': 1,
@@ -121,6 +122,9 @@ class TorSplashCrawler():
             # save new paste on disk
             if self.save_crawled_paste(filename_paste, response.data['html']):
 
+                # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
+                self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
+
                 self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
                 self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
                 self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
@@ -129,10 +133,6 @@ class TorSplashCrawler():
                 if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
                     self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
                 self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
-                self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'paste_parent', self.original_paste)
-
-                # add onion screenshot history
-                self.r_serv_onion.sadd('{}_history:{}'.format(self.type, self.domains[0]), self.full_date)
 
                 #create paste metadata
                 self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
@@ -170,6 +170,7 @@ class TorSplashCrawler():
                     yield SplashRequest(
                         link.url,
                         self.parse,
+                        errback=self.errback_catcher,
                         endpoint='render.json',
                         meta={'father': relative_filename_paste},
                         args={ 'html': 1,
@@ -179,10 +180,13 @@
                             'wait': 10}
                         #errback=self.errback_catcher
                     )
-        '''
+
     def errback_catcher(self, failure):
         # catch all errback failures,
         self.logger.error(repr(failure))
+        print('failure')
+        print(failure)
+        print(failure.request.meta['item'])
 
         #if isinstance(failure.value, HttpError):
         if failure.check(HttpError):
@@ -196,14 +200,16 @@
             # this is the original request
             request = failure.request
             print(DNSLookupError)
+            print('DNSLookupError')
             self.logger.error('DNSLookupError on %s', request.url)
 
         #elif isinstance(failure.value, TimeoutError):
         elif failure.check(TimeoutError):
             request = failure.request
+            print('TimeoutError')
             print(TimeoutError)
             self.logger.error('TimeoutError on %s', request.url)
-        '''
+
 
     def save_crawled_paste(self, filename, content):
diff --git a/pip3_packages_requirement.txt b/pip3_packages_requirement.txt
index 53ec97e7..ddf60626 100644
--- a/pip3_packages_requirement.txt
+++ b/pip3_packages_requirement.txt
@@ -58,6 +58,9 @@ pycountry
 # To fetch Onion urls
 PySocks
 
+#extract subject
+newspaper3k
+
 # decompress files
 sflock
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 2c0c7e4a..5e63374b 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -39,6 +39,23 @@ def get_date_range(num_day):
 
     return list(reversed(date_list))
 
+def unpack_paste_tags(p_tags):
+    l_tags = []
+    for tag in p_tags:
+        complete_tag = tag
+        tag = tag.split('=')
+        if len(tag) > 1:
+            if tag[1] != '':
+                tag = tag[1][1:-1]
+            # no value
+            else:
+                tag = tag[0][1:-1]
+        # use for custom tags
+        else:
+            tag = tag[0]
+        l_tags.append( (tag, complete_tag) )
+    return l_tags
+
 def get_onion_status(domain, date):
     if r_serv_onion.sismember('onion_up:'+date , domain):
         return True
@@ -76,43 +93,39 @@ def onion_domain():
         # # TODO: FIXME return 404
 
     last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check')
+    last_check = '{}/{}/{}'.format(last_check[0:4], last_check[4:6], last_check[6:8])
     first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen')
-    domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
-    date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))
+    first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8])
+    origin_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
 
     h = HiddenServices(onion_domain, 'onion')
     l_pastes = h.get_last_crawled_pastes()
+    if l_pastes:
+        status = True
+    else:
+        status = False
     screenshot = h.get_domain_random_screenshot(l_pastes)
     if screenshot:
         screenshot = screenshot[0]
     else:
         screenshot = 'None'
 
+    domain_tags = h.get_domain_tags()
+
+    origin_paste_name = h.get_origin_paste_name()
+    origin_paste_tags = unpack_paste_tags(r_serv_metadata.smembers('tag:{}'.format(origin_paste)))
     paste_tags = []
     path_name = []
     for path in l_pastes:
-        path_name.append(path.replace(PASTES_FOLDER, ''))
+        path_name.append(path.replace(PASTES_FOLDER+'/', ''))
         p_tags = r_serv_metadata.smembers('tag:'+path)
-        l_tags = []
-        for tag in p_tags:
-            complete_tag = tag
-            tag = tag.split('=')
-            if len(tag) > 1:
-                if tag[1] != '':
-                    tag = tag[1][1:-1]
-                # no value
-                else:
-                    tag = tag[0][1:-1]
-            # use for custom tags
-            else:
-                tag = tag[0]
-            l_tags.append( (tag, complete_tag) )
-        paste_tags.append(l_tags)
+        paste_tags.append(unpack_paste_tags(p_tags))
 
     return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
-                            l_pastes=l_pastes, paste_tags=paste_tags, l_tags=l_tags, bootstrap_label=bootstrap_label,
-                            path_name=path_name,
-                            domain_paste=domain_paste, screenshot=screenshot)
+                            l_pastes=l_pastes, paste_tags=paste_tags, bootstrap_label=bootstrap_label,
+                            path_name=path_name, origin_paste_tags=origin_paste_tags, status=status,
+                            origin_paste=origin_paste, origin_paste_name=origin_paste_name,
+                            domain_tags=domain_tags, screenshot=screenshot)
 
 # ============= JSON ==============
 @hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
index 29aa821c..b89388aa 100644
--- a/var/www/modules/hiddenServices/templates/showDomain.html
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -36,35 +36,61 @@
[hunk body not recoverable: the HTML markup of this template change was stripped during extraction, leaving only diff markers and text content. The surviving text shows the panel's "Graph" heading replaced by an UP/DOWN status badge driven by the new status variable ({% if status %} ... {% else %} ... {% endif %}); the old Domain / First Seen / Last Check / Origin Paste ({{ domain_paste }}) table replaced by a two-column First Seen / Last Check table; the origin paste displayed as {{ origin_paste_name }} with its origin_paste_tags rendered as labels ({% for tag in origin_paste_tags %} ... {{ tag[0] }} ... {% endfor %}); and a new block listing each domain tag with its count ({% for tag in domain_tags %} ... {{ tag }} {{ domain_tags[tag] }} ... {% endfor %}).]
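
Reviewer note, not part of the patch: a minimal sketch of how the per-domain crawl history written by Crawler.py above can be read back. The key layout ('{type}_history:{domain}' as a list of crawl dates, '{type}_history:{domain}:{date}' as a list of root pastes, and the 'paste_parent' field in '{type}_metadata:{domain}') comes from the hunks above; the connection parameters and helper names are illustrative assumptions, not values taken from the AIL configuration.

import redis

# Illustrative connection; the real host/port/db come from the ARDB_Onion
# section of the AIL config, not from this sketch.
r_onion = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)

def get_crawl_history(domain, type_hidden_service='onion'):
    """Read back the history keys that Crawler.py now maintains for a crawled domain."""
    history = {}
    # '{type}_history:{domain}' holds one entry per crawl date (most recent first)
    for date in r_onion.lrange('{}_history:{}'.format(type_hidden_service, domain), 0, -1):
        # '{type}_history:{domain}:{date}' lists the root pastes produced on that date
        history[date] = r_onion.lrange('{}_history:{}:{}'.format(type_hidden_service, domain, date), 0, -1)
    return history

def get_origin_paste(domain, type_hidden_service='onion'):
    # 'paste_parent' is now written into the domain metadata hash by Crawler.py
    return r_onion.hget('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent')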