chg: [Crawler UI] display domain information

Terrtia 2018-09-12 09:55:49 +02:00
parent ca982e13e1
commit 6f0817365a
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
8 changed files with 164 additions and 60 deletions

.gitignore
View file

@@ -11,9 +11,10 @@ ardb
 faup
 tlsh
 Blooms
-LEVEL_DB_DATA
 PASTES
+CRAWLED_SCREENSHOT
 BASE64
+HASHS
 DATA_ARDB
 indexdir/
 logs/

View file

@@ -32,6 +32,7 @@ def decode_base58(bc, length):
     for char in bc:
         n = n * 58 + digits58.index(char)
     return n.to_bytes(length, 'big')
+
 def check_bc(bc):
     try:
         bcbytes = decode_base58(bc, 25)

View file

@@ -57,6 +57,12 @@ def crawl_onion(url, domain, date, date_month):
 if __name__ == '__main__':
 
+    if len(sys.argv) != 2:
+        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)')
+        exit(1)
+
+    type_hidden_service = sys.argv[1]
+
     publisher.port = 6380
     publisher.channel = "Script"
@@ -72,7 +78,6 @@ if __name__ == '__main__':
     url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
     re.compile(url_i2p)
 
-    type_hidden_service = 'onion'
     if type_hidden_service == 'onion':
         regex_hidden_service = url_onion
         splash_url = p.config.get("Crawler", "splash_url_onion")
@@ -89,8 +94,12 @@ if __name__ == '__main__':
         print('incorrect crawler type: {}'.format(type_hidden_service))
         exit(0)
 
+    print(type_hidden_service)
+
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
 
+    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
+
     #signal.signal(signal.SIGINT, signal_handler)
 
     r_serv_metadata = redis.StrictRedis(
@@ -113,8 +122,10 @@ if __name__ == '__main__':
     while True:
 
-        # Recovering the streamed message informations.
+        # Recovering the streamed message informations. http://eepsites.i2p
         message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
+        #message = 'http://i2pwiki.i2p;test'
+        #message = 'http://i2host.i2p;test'
 
         # # FIXME: remove
         if message is None:
@@ -122,13 +133,19 @@ if __name__ == '__main__':
             message = r_onion.spop('mess_onion')
 
         if message is not None:
+            print(message)
 
             splitted = message.split(';')
             if len(splitted) == 2:
                 url, paste = splitted
+                paste = paste.replace(PASTES_FOLDER+'/', '')
+                print(paste)
 
+                '''
                 if not '.onion' in url:
                     print('not onion')
                     continue
+                '''
 
                 url_list = re.findall(regex_hidden_service, url)[0]
                 if url_list[1] == '':
@@ -137,7 +154,7 @@ if __name__ == '__main__':
                 link, s, credential, subdomain, domain, host, port, \
                     resource_path, query_string, f1, f2, f3, f4 = url_list
                 domain = url_list[4]
-                r_onion.srem('onion_domain_crawler_queue', domain)
+                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
 
                 domain_url = 'http://{}'.format(domain)
@@ -157,6 +174,8 @@ if __name__ == '__main__':
                 crawl_onion(url, domain, date, date_month)
                 if url != domain_url:
+                    print(url)
+                    print(domain_url)
                     crawl_onion(domain_url, domain, date, date_month)
 
                 # save down onion
@@ -173,6 +192,17 @@ if __name__ == '__main__':
                     # last check
                     r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
 
+                    # last_father
+                    r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
+
+                    # add onion screenshot history
+                    # add crawled days
+                    if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
+                        r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
+                    # add crawled history by date
+                    r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
+
                     # check external onions links (full_scrawl)
                     external_domains = set()
                     for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
@@ -194,6 +224,12 @@ if __name__ == '__main__':
                     r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                     r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
+                    #send all crawled domain past
+                    msg = domain
+                    p.populate_set_out(msg, 'DomainSubject')
+
+                    #time.sleep(30)
+
             else:
                 continue
         else:
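Beyond queue handling, the hunks above change how crawl results are persisted: the origin paste is written into the domain metadata hash and every crawl is appended to per-domain history lists. A minimal read-back sketch of that layout, assuming a local ARDB/Redis instance and a hypothetical onion domain (host, port and db are illustrative; AIL normally takes them from its config file):

    import redis

    # hypothetical connection settings; AIL reads host/port/db from its config file
    r_onion = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)

    domain = 'examplexxxxxxxxxxxx.onion'   # hypothetical domain

    # one entry per crawl date, most recent first (lpush in Crawler.py above)
    for date in r_onion.lrange('onion_history:{}'.format(domain), 0, -1):
        # one entry per paste crawled on that date
        pastes = r_onion.lrange('onion_history:{}:{}'.format(domain, date), 0, -1)
        print(date, pastes)

    # paste_parent, first_seen, last_check, ... live in the metadata hash
    print(r_onion.hgetall('onion_metadata:{}'.format(domain)))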

View file

@@ -61,6 +61,7 @@ class HiddenServices(object):
         self.domain = domain
         self.type = type
+        self.tags = {}
 
         if type == 'onion':
             self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
@@ -74,6 +75,20 @@ class HiddenServices(object):
             ## TODO: # FIXME: add error
             pass
 
+    def get_origin_paste_name(self):
+        origin_paste = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
+        if origin_paste is None:
+            return ''
+        return origin_paste.replace(self.paste_directory+'/', '')
+
+    def get_domain_tags(self):
+        return self.tags
+
+    def update_domain_tags(self, children):
+        p_tags = self.r_serv_metadata.smembers('tag:'+children)
+        for tag in p_tags:
+            self.tags[tag] = self.tags.get(tag, 0) + 1
+
     #todo use the right paste
     def get_last_crawled_pastes(self):
         paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
@@ -81,8 +96,10 @@ class HiddenServices(object):
         return self.get_all_pastes_domain(paste_parent)
 
     def get_all_pastes_domain(self, father):
+        if father is None:
+            return []
         l_crawled_pastes = []
-        paste_parent = father.replace(self.paste_directory, '')[1:]
+        paste_parent = father.replace(self.paste_directory+'/', '')
         paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent))
         ## TODO: # FIXME: remove me
         paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
@@ -90,6 +107,7 @@ class HiddenServices(object):
         for children in paste_childrens:
             if self.domain in children:
                 l_crawled_pastes.append(children)
+                self.update_domain_tags(children)
                 l_crawled_pastes.extend(self.get_all_pastes_domain(children))
 
         return l_crawled_pastes
@@ -97,7 +115,7 @@ class HiddenServices(object):
         l_screenshot_paste = []
         for paste in l_crawled_pastes:
             ## FIXME: # TODO: remove me
-            paste= paste.replace(self.paste_directory, '')[1:]
+            paste= paste.replace(self.paste_directory+'/', '')
             paste = paste.replace(self.paste_crawled_directory_name, '')
             if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ):
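The tag bookkeeping added here is a plain occurrence counter: get_all_pastes_domain() calls update_domain_tags() for every crawled child paste, and get_domain_tags() then returns a tag-to-count mapping for the whole domain. The same accumulation, sketched outside the class with hypothetical tag sets standing in for the smembers('tag:<paste>') lookups:

    # hypothetical tag sets for three crawled pastes of one domain
    pastes_tags = [
        {'infoleak:automatic-detection="bitcoin-address"'},
        {'infoleak:automatic-detection="bitcoin-address"', 'infoleak:automatic-detection="credential"'},
        set(),
    ]

    domain_tags = {}
    for p_tags in pastes_tags:
        for tag in p_tags:
            # same logic as HiddenServices.update_domain_tags()
            domain_tags[tag] = domain_tags.get(tag, 0) + 1

    print(domain_tags)
    # {'infoleak:automatic-detection="bitcoin-address"': 2,
    #  'infoleak:automatic-detection="credential"': 1}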

View file

@@ -96,6 +96,7 @@ class TorSplashCrawler():
             yield SplashRequest(
                 self.start_urls,
                 self.parse,
+                errback=self.errback_catcher,
                 endpoint='render.json',
                 meta={'father': self.original_paste},
                 args={ 'html': 1,
@@ -121,6 +122,9 @@ class TorSplashCrawler():
             # save new paste on disk
             if self.save_crawled_paste(filename_paste, response.data['html']):
 
+                # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
+                self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
+
                 self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
                 self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
                 self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
@@ -129,10 +133,6 @@ class TorSplashCrawler():
                 if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
                     self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
                 self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
-                self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'paste_parent', self.original_paste)
-
-                # add onion screenshot history
-                self.r_serv_onion.sadd('{}_history:{}'.format(self.type, self.domains[0]), self.full_date)
 
                 #create paste metadata
                 self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
@@ -170,6 +170,7 @@ class TorSplashCrawler():
                     yield SplashRequest(
                         link.url,
                         self.parse,
+                        errback=self.errback_catcher,
                         endpoint='render.json',
                         meta={'father': relative_filename_paste},
                         args={ 'html': 1,
@@ -179,10 +180,13 @@ class TorSplashCrawler():
                             'wait': 10}
                             #errback=self.errback_catcher
                     )
 
-    '''
     def errback_catcher(self, failure):
         # catch all errback failures,
         self.logger.error(repr(failure))
+        print('failure')
+        print(failure)
+        print(failure.request.meta['item'])
 
         #if isinstance(failure.value, HttpError):
         if failure.check(HttpError):
@@ -196,14 +200,16 @@ class TorSplashCrawler():
             # this is the original request
             request = failure.request
             print(DNSLookupError)
+            print('DNSLookupError')
             self.logger.error('DNSLookupError on %s', request.url)
 
         #elif isinstance(failure.value, TimeoutError):
         elif failure.check(TimeoutError):
             request = failure.request
+            print('TimeoutError')
             print(TimeoutError)
             self.logger.error('TimeoutError on %s', request.url)
-    '''
 
     def save_crawled_paste(self, filename, content):
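Wiring errback=self.errback_catcher into both SplashRequest calls (and un-commenting the handler) means request failures are now logged instead of silently dropped. The handler follows the standard Scrapy errback recipe; a minimal self-contained sketch of that pattern with a plain scrapy.Request and a hypothetical URL, not the Splash setup used above:

    import scrapy
    from scrapy.spidermiddlewares.httperror import HttpError
    from twisted.internet.error import DNSLookupError, TimeoutError

    class ErrbackSketchSpider(scrapy.Spider):
        name = 'errback_sketch'

        def start_requests(self):
            # hypothetical URL; errback receives a twisted Failure on any error
            yield scrapy.Request('http://example.invalid/',
                                 callback=self.parse,
                                 errback=self.errback_catcher)

        def parse(self, response):
            self.logger.info('got %s', response.url)

        def errback_catcher(self, failure):
            self.logger.error(repr(failure))
            if failure.check(HttpError):
                # non-2xx response: the response object rides on the failure
                self.logger.error('HttpError on %s', failure.value.response.url)
            elif failure.check(DNSLookupError):
                self.logger.error('DNSLookupError on %s', failure.request.url)
            elif failure.check(TimeoutError):
                self.logger.error('TimeoutError on %s', failure.request.url)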

View file

@@ -58,6 +58,9 @@ pycountry
 # To fetch Onion urls
 PySocks
 
+#extract subject
+newspaper3k
+
 # decompress files
 sflock
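newspaper3k is added here for subject extraction (the Crawler above starts publishing crawled domains to a 'DomainSubject' queue, presumably the consumer of this library). A minimal sketch of the library's basic API against a hypothetical URL:

    from newspaper import Article

    # hypothetical page; in AIL the HTML would come from a crawled paste
    article = Article('http://example.com/some-page.html')
    article.download()
    article.parse()

    print(article.title)        # candidate "subject" for the page
    print(article.text[:200])   # extracted main text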

View file

@@ -39,6 +39,23 @@ def get_date_range(num_day):
     return list(reversed(date_list))
 
+def unpack_paste_tags(p_tags):
+    l_tags = []
+    for tag in p_tags:
+        complete_tag = tag
+        tag = tag.split('=')
+        if len(tag) > 1:
+            if tag[1] != '':
+                tag = tag[1][1:-1]
+            # no value
+            else:
+                tag = tag[0][1:-1]
+        # use for custom tags
+        else:
+            tag = tag[0]
+        l_tags.append( (tag, complete_tag) )
+    return l_tags
+
 def get_onion_status(domain, date):
     if r_serv_onion.sismember('onion_up:'+date , domain):
         return True
@@ -76,43 +93,39 @@ def onion_domain():
         # # TODO: FIXME return 404
 
     last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check')
+    last_check = '{}/{}/{}'.format(last_check[0:4], last_check[4:6], last_check[6:8])
     first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen')
-    domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
-    date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))
+    first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8])
+    origin_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
 
     h = HiddenServices(onion_domain, 'onion')
     l_pastes = h.get_last_crawled_pastes()
+    if l_pastes:
+        status = True
+    else:
+        status = False
     screenshot = h.get_domain_random_screenshot(l_pastes)
     if screenshot:
         screenshot = screenshot[0]
     else:
         screenshot = 'None'
 
+    domain_tags = h.get_domain_tags()
+
+    origin_paste_name = h.get_origin_paste_name()
+    origin_paste_tags = unpack_paste_tags(r_serv_metadata.smembers('tag:{}'.format(origin_paste)))
+
     paste_tags = []
     path_name = []
     for path in l_pastes:
-        path_name.append(path.replace(PASTES_FOLDER, ''))
+        path_name.append(path.replace(PASTES_FOLDER+'/', ''))
         p_tags = r_serv_metadata.smembers('tag:'+path)
-        l_tags = []
-        for tag in p_tags:
-            complete_tag = tag
-            tag = tag.split('=')
-            if len(tag) > 1:
-                if tag[1] != '':
-                    tag = tag[1][1:-1]
-                # no value
-                else:
-                    tag = tag[0][1:-1]
-            # use for custom tags
-            else:
-                tag = tag[0]
-            l_tags.append( (tag, complete_tag) )
-        paste_tags.append(l_tags)
+        paste_tags.append(unpack_paste_tags(p_tags))
 
     return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
-                            l_pastes=l_pastes, paste_tags=paste_tags, l_tags=l_tags, bootstrap_label=bootstrap_label,
-                            path_name=path_name,
-                            domain_paste=domain_paste, screenshot=screenshot)
+                            l_pastes=l_pastes, paste_tags=paste_tags, bootstrap_label=bootstrap_label,
+                            path_name=path_name, origin_paste_tags=origin_paste_tags, status=status,
+                            origin_paste=origin_paste, origin_paste_name=origin_paste_name,
+                            domain_tags=domain_tags, screenshot=screenshot)
 
 # ============= JSON ==============
 @hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
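The new unpack_paste_tags() helper centralizes the tag formatting that was previously inlined in the loop: it turns each machine-tag string into a (display value, full tag) pair, so the template can show a short label while keeping the complete tag for the Tags search link. A quick illustration with hypothetical tag values standing in for smembers('tag:<paste>'):

    p_tags = ['infoleak:automatic-detection="bitcoin-address"', 'my-custom-tag']

    unpack_paste_tags(p_tags)
    # -> [('bitcoin-address', 'infoleak:automatic-detection="bitcoin-address"'),
    #     ('my-custom-tag', 'my-custom-tag')]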

View file

@@ -36,35 +36,61 @@
     <div class="col-md-6">
       <div class="row">
 
-        <div class="row">
-          <div class="panel panel-default">
+        <div class="panel panel-info">
           <div class="panel-heading">
-            <i id="flash-tld" class="glyphicon glyphicon-flash " flash-tld=""></i> Graph
+            {% if status %}
+              <div class="pull-right" style="color:Green;">
+                <i class="fa fa-check-circle fa-2x"></i>
+                UP
               </div>
+            {% else %}
+              <div class="pull-right" style="color:Red;">
+                <i class="fa fa-times-circle fa-2x"></i>
+                DOWN
+              </div>
+            {% endif %}
+            <h3>{{ domain }} :</h3>
+            <ul class="list-group">
+              <li class="list-group-item">
-            <table class="table table-hover table-striped">
+                <table class="table table-condensed">
+                  <thead>
+                    <tr>
+                      <th>First Seen</th>
+                      <th>Last Check</th>
+                    </tr>
+                  </thead>
                   <tbody>
                     <tr>
-                <td>Domain</td>
-                <td>{{ domain }}</td>
-              </tr>
-              <tr>
-                <td>First Seen</td>
-                <td>{{ first_seen }}</td>
-              </tr>
-              <tr>
-                <td>Last Check</td>
-                <td>{{ last_check }}</td>
-              </tr>
-              <tr>
-                <td>Origin Paste</td>
-                <td>
-                  <a target="_blank" href="{{ url_for('showsavedpastes.showsavedpaste', paste=domain_paste) }}" />{{ domain_paste }}</a>
-                </td>
+                      <td class="panelText"><a href="#">{{ first_seen }}</a></td>
+                      <td class="panelText"><a href="#">{{ last_check }}</a></td>
                     </tr>
                   </tbody>
                 </table>
+              </li>
+              <li class="list-group-item">
+                Origin Paste: <a target="_blank" href="{{ url_for('showsavedpastes.showsavedpaste', paste=origin_paste) }}" />{{ origin_paste_name }}</a>
+                <div>
+                {% for tag in origin_paste_tags %}
+                  <a href="{{ url_for('Tags.get_tagged_paste') }}?ltags={{ tag[1] }}">
+                    <span class="label label-{{ bootstrap_label[loop.index0 % 5] }} pull-left">{{ tag[0] }}</span>
+                  </a>
+                {% endfor %}
+                <br>
                 </div>
+              </li>
+            </ul>
+          </div>
+        </div>
+
+        <div>
+        {% for tag in domain_tags %}
+          <a href="{{ url_for('Tags.get_tagged_paste') }}?ltags={{ tag }}">
+            <span class="label label-{{ bootstrap_label[loop.index0 % 5] }} pull-left">{{ tag }} <i>{{ domain_tags[tag] }}</i></span>
+          </a>
+        {% endfor %}
+        <br>
+        <br>
+        </div>
       </div>
 
       <table class="test table table-striped table-bordered table-hover table-responsive " id="myTable_">