Mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-10 00:28:22 +00:00)
chg: [Crawler UI] display domain information
Commit 6f0817365a (parent ca982e13e1)
8 changed files with 164 additions and 60 deletions
.gitignore (vendored): 3 changes
@@ -11,9 +11,10 @@ ardb
 faup
 tlsh
 Blooms
 LEVEL_DB_DATA
 PASTES
+CRAWLED_SCREENSHOT
 BASE64
 HASHS
 DATA_ARDB
 indexdir/
 logs/
Bitcoin address checker (decode_base58 / check_bc):

@@ -32,6 +32,7 @@ def decode_base58(bc, length):
     for char in bc:
         n = n * 58 + digits58.index(char)
     return n.to_bytes(length, 'big')
 
 def check_bc(bc):
     try:
         bcbytes = decode_base58(bc, 25)
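check_bc as shown decodes a candidate Bitcoin address into 25 bytes; the usual follow-up (not visible in this hunk) is to compare the last 4 bytes against a double-SHA256 checksum of the first 21. A self-contained sketch assuming that standard Base58Check rule:

import hashlib

digits58 = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'

def decode_base58(bc, length):
    n = 0
    for char in bc:
        n = n * 58 + digits58.index(char)
    return n.to_bytes(length, 'big')

def check_bc(bc):
    try:
        bcbytes = decode_base58(bc, 25)
        # last 4 bytes must equal the double-SHA256 checksum of the payload
        checksum = hashlib.sha256(hashlib.sha256(bcbytes[:-4]).digest()).digest()[:4]
        return bcbytes[-4:] == checksum
    except Exception:
        return False

print(check_bc('1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2'))  # True for this well-known sample address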
Crawler.py:

@@ -57,6 +57,12 @@ def crawl_onion(url, domain, date, date_month):
 if __name__ == '__main__':
 
+    if len(sys.argv) != 2:
+        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)')
+        exit(1)
+
+    type_hidden_service = sys.argv[1]
+
     publisher.port = 6380
     publisher.channel = "Script"

@@ -72,7 +78,6 @@ if __name__ == '__main__':
     url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
     re.compile(url_i2p)
 
-    type_hidden_service = 'onion'
     if type_hidden_service == 'onion':
         regex_hidden_service = url_onion
         splash_url = p.config.get("Crawler", "splash_url_onion")
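With the hardcoded override removed, the type passed on the command line actually drives the dispatch below. A minimal sketch of that selection; the i2p branch and the placeholder patterns are assumptions extrapolated from the onion branch shown:

import re
import sys

url_onion = r'([a-z2-7]{16}\.onion)'  # placeholder, not the full pattern above
url_i2p = r'([a-z0-9\-]+\.i2p)'       # placeholder, not the full pattern above

type_hidden_service = sys.argv[1] if len(sys.argv) == 2 else 'onion'
if type_hidden_service == 'onion':
    regex_hidden_service = url_onion
elif type_hidden_service == 'i2p':
    regex_hidden_service = url_i2p
else:
    sys.exit('incorrect crawler type: {}'.format(type_hidden_service))
print(type_hidden_service, re.compile(regex_hidden_service))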
@@ -89,8 +94,12 @@ if __name__ == '__main__':
         print('incorrect crawler type: {}'.format(type_hidden_service))
         exit(0)
 
+    print(type_hidden_service)
+
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
 
+    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
+
     #signal.signal(signal.SIGINT, signal_handler)
 
     r_serv_metadata = redis.StrictRedis(

@@ -113,8 +122,10 @@ if __name__ == '__main__':
     while True:
 
-        # Recovering the streamed message informations.
+        # Recovering the streamed message informations. http://eepsites.i2p
         message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
+        #message = 'http://i2pwiki.i2p;test'
+        #message = 'http://i2host.i2p;test'
 
         # # FIXME: remove
         if message is None:
@@ -122,13 +133,19 @@ if __name__ == '__main__':
             message = r_onion.spop('mess_onion')
 
         if message is not None:
             print(message)
 
             splitted = message.split(';')
             if len(splitted) == 2:
                 url, paste = splitted
+                paste = paste.replace(PASTES_FOLDER+'/', '')
                 print(paste)
                 '''
                 if not '.onion' in url:
                     print('not onion')
                     continue
                 '''
 
                 url_list = re.findall(regex_hidden_service, url)[0]
                 if url_list[1] == '':
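Each entry popped from the crawler queue is a 'url;paste_path' pair, and the new PASTES_FOLDER replace makes the paste path relative before it is used as a key. A runnable sketch with illustrative values (the install path and file names are hypothetical):

PASTES_FOLDER = '/opt/AIL/PASTES'  # hypothetical install path

message = 'http://example3xyz.onion;/opt/AIL/PASTES/crawled/2018/08/20/example3xyz.onion.gz'
splitted = message.split(';')
if len(splitted) == 2:
    url, paste = splitted
    paste = paste.replace(PASTES_FOLDER + '/', '')
    print(url)    # http://example3xyz.onion
    print(paste)  # crawled/2018/08/20/example3xyz.onion.gz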
@@ -137,7 +154,7 @@ if __name__ == '__main__':
                 link, s, credential, subdomain, domain, host, port, \
                     resource_path, query_string, f1, f2, f3, f4 = url_list
                 domain = url_list[4]
-                r_onion.srem('onion_domain_crawler_queue', domain)
+                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
 
                 domain_url = 'http://{}'.format(domain)
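The unpacking above works because re.findall on a pattern full of capture groups returns one tuple of groups per match, so url_list[4] picks out the domain group of the url_onion pattern. A reduced illustration with a toy pattern (not the real thirteen-group regex):

import re

toy = r'((http|https)://(([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(/[\w\-./]*)?)'
url_list = re.findall(toy, 'see http://example3xyz.onion/page for details')[0]
print(url_list[0])  # whole link: http://example3xyz.onion/page
print(url_list[2])  # domain group: example3xyz.onion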
@@ -157,6 +174,8 @@ if __name__ == '__main__':
                 crawl_onion(url, domain, date, date_month)
                 if url != domain_url:
+                    print(url)
+                    print(domain_url)
                     crawl_onion(domain_url, domain, date, date_month)
 
                 # save down onion
@@ -173,6 +192,17 @@ if __name__ == '__main__':
                 # last check
                 r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
 
+                # last_father
+                r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
+
+                # add onion screenshot history
+                # add crawled days
+                if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
+                    r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
+                # add crawled history by date
+                r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
 
                 # check external onions links (full_scrawl)
                 external_domains = set()
                 for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
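The bookkeeping added here gives every domain a metadata hash plus two history lists. A sketch of the resulting key layout, runnable against a throwaway Redis database; key names come from the hunk, values are fake:

import redis

r_onion = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
domain, date, paste = 'example3xyz.onion', '20180820', 'crawled/2018/08/20/example3xyz.onion.gz'

r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
r_onion.hset('onion_metadata:{}'.format(domain), 'paste_parent', paste)
# one entry per crawled day, newest first, deduplicated by the lindex guard
if r_onion.lindex('onion_history:{}'.format(domain), 0) != date:
    r_onion.lpush('onion_history:{}'.format(domain), date)
# per-day list of the pastes produced by that crawl
r_onion.lpush('onion_history:{}:{}'.format(domain, date), paste)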
@@ -194,6 +224,12 @@ if __name__ == '__main__':
                 r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                 r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
+                #send all crawled domain past
+                msg = domain
+                p.populate_set_out(msg, 'DomainSubject')
 
                 #time.sleep(30)
 
             else:
                 continue
         else:
HiddenServices class:

@@ -61,6 +61,7 @@ class HiddenServices(object):
         self.domain = domain
         self.type = type
+        self.tags = {}
 
         if type == 'onion':
             self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
@@ -74,6 +75,20 @@ class HiddenServices(object):
         ## TODO: # FIXME: add error
             pass
 
+    def get_origin_paste_name(self):
+        origin_paste = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
+        if origin_paste is None:
+            return ''
+        return origin_paste.replace(self.paste_directory+'/', '')
+
+    def get_domain_tags(self):
+        return self.tags
+
+    def update_domain_tags(self, children):
+        p_tags = self.r_serv_metadata.smembers('tag:'+children)
+        for tag in p_tags:
+            self.tags[tag] = self.tags.get(tag, 0) + 1
+
     #todo use the right paste
     def get_last_crawled_pastes(self):
         paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
@@ -81,8 +96,10 @@ class HiddenServices(object):
             return self.get_all_pastes_domain(paste_parent)
 
     def get_all_pastes_domain(self, father):
         if father is None:
             return []
         l_crawled_pastes = []
-        paste_parent = father.replace(self.paste_directory, '')[1:]
+        paste_parent = father.replace(self.paste_directory+'/', '')
         paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent))
+        ## TODO: # FIXME: remove me
+        paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
@@ -90,6 +107,7 @@ class HiddenServices(object):
         for children in paste_childrens:
             if self.domain in children:
                 l_crawled_pastes.append(children)
+                self.update_domain_tags(children)
                 l_crawled_pastes.extend(self.get_all_pastes_domain(children))
         return l_crawled_pastes
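get_all_pastes_domain walks the paste tree recursively: children of the parent paste whose names mention the domain are collected, and each one now also feeds update_domain_tags. A self-contained sketch of the same walk, with an in-memory dict standing in for the 'paste_children:<paste>' Redis sets and hypothetical paste names:

children_map = {
    'root.gz': {'unrelated.gz', 'crawled/2018/08/20/example3xyz.onion/a.gz'},
    'crawled/2018/08/20/example3xyz.onion/a.gz': {'crawled/2018/08/21/example3xyz.onion/b.gz'},
}
domain = 'example3xyz.onion'

def get_all_pastes_domain(father):
    if father is None:
        return []
    l_crawled_pastes = []
    for children in children_map.get(father, set()):
        if domain in children:  # keep only pastes crawled from this domain
            l_crawled_pastes.append(children)
            l_crawled_pastes.extend(get_all_pastes_domain(children))
    return l_crawled_pastes

print(get_all_pastes_domain('root.gz'))
# ['crawled/2018/08/20/example3xyz.onion/a.gz', 'crawled/2018/08/21/example3xyz.onion/b.gz']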
@@ -97,7 +115,7 @@ class HiddenServices(object):
         l_screenshot_paste = []
         for paste in l_crawled_pastes:
             ## FIXME: # TODO: remove me
-            paste= paste.replace(self.paste_directory, '')[1:]
+            paste= paste.replace(self.paste_directory+'/', '')
 
             paste = paste.replace(self.paste_crawled_directory_name, '')
             if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ):
TorSplashCrawler class:

@@ -96,6 +96,7 @@ class TorSplashCrawler():
             yield SplashRequest(
                 self.start_urls,
                 self.parse,
+                errback=self.errback_catcher,
                 endpoint='render.json',
                 meta={'father': self.original_paste},
                 args={ 'html': 1,
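Both SplashRequest call sites now pass errback=self.errback_catcher, so transport failures reach a handler instead of being dropped. A hedged, minimal spider skeleton showing the same wiring with scrapy-splash; everything except the SplashRequest arguments mirrored from the hunk is illustrative:

import scrapy
from scrapy_splash import SplashRequest

class MinimalSpider(scrapy.Spider):
    name = 'minimal_splash_spider'
    start_url = 'http://example3xyz.onion'  # placeholder

    def start_requests(self):
        yield SplashRequest(
            self.start_url,
            self.parse,
            errback=self.errback_catcher,  # the line this hunk adds
            endpoint='render.json',
            meta={'father': 'root.gz'},    # placeholder parent paste
            args={'html': 1, 'wait': 10})

    def parse(self, response):
        self.logger.info('rendered %d bytes of html', len(response.data['html']))

    def errback_catcher(self, failure):
        self.logger.error(repr(failure))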
@@ -121,6 +122,9 @@ class TorSplashCrawler():
         # save new paste on disk
         if self.save_crawled_paste(filename_paste, response.data['html']):
 
+            # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
+            self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
+
             self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
             self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
             self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
@@ -129,10 +133,6 @@ class TorSplashCrawler():
             if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
                 self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
                 self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
-                self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'paste_parent', self.original_paste)
-
-            # add onion screenshot history
-            self.r_serv_onion.sadd('{}_history:{}'.format(self.type, self.domains[0]), self.full_date)
 
             #create paste metadata
             self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
@@ -170,6 +170,7 @@ class TorSplashCrawler():
                 yield SplashRequest(
                     link.url,
                     self.parse,
+                    errback=self.errback_catcher,
                     endpoint='render.json',
                     meta={'father': relative_filename_paste},
                     args={ 'html': 1,
@@ -179,10 +180,13 @@ class TorSplashCrawler():
                         'wait': 10}
                     #errback=self.errback_catcher
                 )
-        '''
 
     def errback_catcher(self, failure):
         # catch all errback failures,
         self.logger.error(repr(failure))
         print('failure')
         print(failure)
         print(failure.request.meta['item'])
 
         #if isinstance(failure.value, HttpError):
         if failure.check(HttpError):
@@ -196,14 +200,16 @@ class TorSplashCrawler():
             # this is the original request
             request = failure.request
             print(DNSLookupError)
             print('DNSLookupError')
             self.logger.error('DNSLookupError on %s', request.url)
 
         #elif isinstance(failure.value, TimeoutError):
         elif failure.check(TimeoutError):
             request = failure.request
             print('TimeoutError')
             print(TimeoutError)
             self.logger.error('TimeoutError on %s', request.url)
-        '''
 
 
     def save_crawled_paste(self, filename, content):
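failure.check(...) is the standard Scrapy errback dispatch (it replaces the commented isinstance tests): it matches the wrapped Twisted exception type. A condensed, standalone version of the same pattern:

import logging
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError

logger = logging.getLogger(__name__)

def classify_failure(failure):
    if failure.check(HttpError):
        response = failure.value.response  # HttpError keeps the bad response
        logger.error('HttpError %s on %s', response.status, response.url)
    elif failure.check(DNSLookupError):
        logger.error('DNSLookupError on %s', failure.request.url)
    elif failure.check(TimeoutError):
        logger.error('TimeoutError on %s', failure.request.url)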
Python requirements:

@@ -58,6 +58,9 @@ pycountry
 # To fetch Onion urls
 PySocks
 
+#extract subject
+newspaper3k
+
 # decompress files
 sflock
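The new newspaper3k dependency covers the '#extract subject' comment: its Article API downloads a page and exposes the parsed title. A short sketch of that documented API (the URL is a placeholder):

from newspaper import Article

article = Article('https://example.com/some-post')  # placeholder URL
article.download()
article.parse()
print(article.title)  # the extracted subject/title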
Flask hiddenServices module:

@@ -39,6 +39,23 @@ def get_date_range(num_day):
     return list(reversed(date_list))
 
+def unpack_paste_tags(p_tags):
+    l_tags = []
+    for tag in p_tags:
+        complete_tag = tag
+        tag = tag.split('=')
+        if len(tag) > 1:
+            if tag[1] != '':
+                tag = tag[1][1:-1]
+            # no value
+            else:
+                tag = tag[0][1:-1]
+        # use for custom tags
+        else:
+            tag = tag[0]
+        l_tags.append( (tag, complete_tag) )
+    return l_tags
+
 def get_onion_status(domain, date):
     if r_serv_onion.sismember('onion_up:'+date , domain):
         return True
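A worked example of unpack_paste_tags as defined above: MISP-style tags with a quoted value reduce to the value, plain custom tags pass through, and the complete tag is kept alongside for building links:

p_tags = ['infoleak:automatic-detection="bitcoin-address"', 'custom-tag']
print(unpack_paste_tags(p_tags))
# [('bitcoin-address', 'infoleak:automatic-detection="bitcoin-address"'),
#  ('custom-tag', 'custom-tag')]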
@@ -76,43 +93,39 @@ def onion_domain():
     # # TODO: FIXME return 404
 
     last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check')
     last_check = '{}/{}/{}'.format(last_check[0:4], last_check[4:6], last_check[6:8])
     first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen')
-    domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
-    date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))
     first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8])
+    origin_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
 
     h = HiddenServices(onion_domain, 'onion')
     l_pastes = h.get_last_crawled_pastes()
+    if l_pastes:
+        status = True
+    else:
+        status = False
     screenshot = h.get_domain_random_screenshot(l_pastes)
     if screenshot:
         screenshot = screenshot[0]
     else:
         screenshot = 'None'
 
+    domain_tags = h.get_domain_tags()
 
+    origin_paste_name = h.get_origin_paste_name()
+    origin_paste_tags = unpack_paste_tags(r_serv_metadata.smembers('tag:{}'.format(origin_paste)))
     paste_tags = []
     path_name = []
     for path in l_pastes:
-        path_name.append(path.replace(PASTES_FOLDER, ''))
+        path_name.append(path.replace(PASTES_FOLDER+'/', ''))
         p_tags = r_serv_metadata.smembers('tag:'+path)
-        l_tags = []
-        for tag in p_tags:
-            complete_tag = tag
-            tag = tag.split('=')
-            if len(tag) > 1:
-                if tag[1] != '':
-                    tag = tag[1][1:-1]
-                # no value
-                else:
-                    tag = tag[0][1:-1]
-            # use for custom tags
-            else:
-                tag = tag[0]
-            l_tags.append( (tag, complete_tag) )
-        paste_tags.append(l_tags)
+        paste_tags.append(unpack_paste_tags(p_tags))
 
-    return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
-                            l_pastes=l_pastes, paste_tags=paste_tags, l_tags=l_tags, bootstrap_label=bootstrap_label,
-                            path_name=path_name,
-                            domain_paste=domain_paste, screenshot=screenshot)
+    return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
+                            l_pastes=l_pastes, paste_tags=paste_tags, bootstrap_label=bootstrap_label,
+                            path_name=path_name, origin_paste_tags=origin_paste_tags, status=status,
+                            origin_paste=origin_paste, origin_paste_name=origin_paste_name,
+                            domain_tags=domain_tags, screenshot=screenshot)
 
 # ============= JSON ==============
 @hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
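Two small idioms from the rewritten view, shown standalone: AIL stores dates as YYYYMMDD strings and reshapes them by slicing, and the new status flag is just the truthiness of the crawled-paste list:

last_check = '20180820'  # illustrative stored value
print('{}/{}/{}'.format(last_check[0:4], last_check[4:6], last_check[6:8]))  # 2018/08/20

l_pastes = []            # what get_last_crawled_pastes() might return
status = bool(l_pastes)  # equivalent to the if/else in the hunk
print(status)            # False -> the template renders DOWN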
showDomain.html template:

@@ -36,35 +36,61 @@
     <div class="col-md-6">
       <div class="row">
-        <div class="panel panel-default">
+        <div class="panel panel-info">
           <div class="panel-heading">
             <i id="flash-tld" class="glyphicon glyphicon-flash " flash-tld=""></i> Graph
           </div>
+          {% if status %}
+            <div class="pull-right" style="color:Green;">
+              <i class="fa fa-check-circle fa-2x"></i>
+              UP
+            </div>
+          {% else %}
+            <div class="pull-right" style="color:Red;">
+              <i class="fa fa-times-circle fa-2x"></i>
+              DOWN
+            </div>
+          {% endif %}
           <h3>{{ domain }} :</h3>
+          <ul class="list-group">
+            <li class="list-group-item">
 
-          <table class="table table-hover table-striped">
-            <tbody>
-              <tr>
-                <td>Domain</td>
-                <td>{{ domain }}</td>
-              </tr>
-              <tr>
-                <td>First Seen</td>
-                <td>{{ first_seen }}</td>
-              </tr>
-              <tr>
-                <td>Last Check</td>
-                <td>{{ last_check }}</td>
-              </tr>
-              <tr>
-                <td>Origin Paste</td>
-                <td>
-                  <a target="_blank" href="{{ url_for('showsavedpastes.showsavedpaste', paste=domain_paste) }}" />{{ domain_paste }}</a>
-                </td>
-              </tr>
-            </tbody>
-          </table>
-        </div>
+              <table class="table table-condensed">
+                <thead>
+                  <tr>
+                    <th>First Seen</th>
+                    <th>Last Check</th>
+                  </tr>
+                </thead>
+                <tbody>
+                  <tr>
+                    <td class="panelText"><a href="#">{{ first_seen }}</a></td>
+                    <td class="panelText"><a href="#">{{ last_check }}</a></td>
+                  </tr>
+                </tbody>
+              </table>
 
+            </li>
+            <li class="list-group-item">
+              Origin Paste: <a target="_blank" href="{{ url_for('showsavedpastes.showsavedpaste', paste=origin_paste) }}" />{{ origin_paste_name }}</a>
+              <div>
+                {% for tag in origin_paste_tags %}
+                  <a href="{{ url_for('Tags.get_tagged_paste') }}?ltags={{ tag[1] }}">
+                    <span class="label label-{{ bootstrap_label[loop.index0 % 5] }} pull-left">{{ tag[0] }}</span>
+                  </a>
+                {% endfor %}
+                <br>
+              </div>
+            </li>
+          </ul>
         </div>
       </div>
+      <div>
+        {% for tag in domain_tags %}
+          <a href="{{ url_for('Tags.get_tagged_paste') }}?ltags={{ tag }}">
+            <span class="label label-{{ bootstrap_label[loop.index0 % 5] }} pull-left">{{ tag }} <i>{{ domain_tags[tag] }}</i></span>
+          </a>
+        {% endfor %}
+        <br>
+        <br>
+      </div>
 
       <table class="test table table-striped table-bordered table-hover table-responsive " id="myTable_">
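Both tag loops color labels by cycling through bootstrap_label with loop.index0 % 5. The same logic in Python, with assumed list contents (the actual list is defined in the Flask module, outside this diff):

bootstrap_label = ['primary', 'success', 'danger', 'warning', 'default']  # assumed contents
for i, tag in enumerate(['tag-a', 'tag-b', 'tag-c']):
    print('label-{}'.format(bootstrap_label[i % 5]))  # label-primary, label-success, label-danger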