mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 08:38:28 +00:00
fix: [Crawler] limit max crawled pages
This commit is contained in:
parent
64ffdd52e8
commit
c1b34bd99c
3 changed files with 7 additions and 7 deletions
|
@@ -205,6 +205,11 @@ if __name__ == '__main__':
|
||||||
date_month = datetime.datetime.now().strftime("%Y%m")
|
date_month = datetime.datetime.now().strftime("%Y%m")
|
||||||
|
|
||||||
if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
|
if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
|
||||||
|
# last_father
|
||||||
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
|
||||||
|
|
||||||
|
# last check
|
||||||
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||||
|
|
||||||
crawl_onion(url, domain, date, date_month, message)
|
crawl_onion(url, domain, date, date_month, message)
|
||||||
if url != domain_url:
|
if url != domain_url:
|
||||||
|
@@ -226,12 +231,6 @@ if __name__ == '__main__':
|
||||||
msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
|
msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
|
||||||
p.populate_set_out(msg, 'Tags')
|
p.populate_set_out(msg, 'Tags')
|
||||||
|
|
||||||
# last check
|
|
||||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
|
||||||
|
|
||||||
# last_father
|
|
||||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
|
|
||||||
|
|
||||||
# add onion screenshot history
|
# add onion screenshot history
|
||||||
# add crawled days
|
# add crawled days
|
||||||
if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
|
if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
|
||||||
|
|
|
@@ -42,6 +42,7 @@ class TorSplashCrawler():
|
||||||
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
||||||
'HTTPERROR_ALLOW_ALL': True,
|
'HTTPERROR_ALLOW_ALL': True,
|
||||||
'RETRY_TIMES': 2,
|
'RETRY_TIMES': 2,
|
||||||
|
'CLOSESPIDER_PAGECOUNT': 1000,
|
||||||
'DEPTH_LIMIT': crawler_depth_limit
|
'DEPTH_LIMIT': crawler_depth_limit
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
@@ -426,7 +426,7 @@ function refresh_list_crawled(){
|
||||||
newCell.innerHTML = "<td><i class=\"fa fa-"+icon+"-circle fa-2x\" style=\"color:"+text_color+";\"></i>"+crawler['crawler_info']+"</td>";
|
newCell.innerHTML = "<td><i class=\"fa fa-"+icon+"-circle fa-2x\" style=\"color:"+text_color+";\"></i>"+crawler['crawler_info']+"</td>";
|
||||||
|
|
||||||
newCell = newRow.insertCell(1);
|
newCell = newRow.insertCell(1);
|
||||||
newCell.innerHTML = "<td>"+crawler['crawling_domain']+"</td>";
|
newCell.innerHTML = "<td><a target=\"_blank\" href=\"{{ url_for('hiddenServices.onion_domain') }}?onion_domain="+crawler['crawling_domain']+"\">"+crawler['crawling_domain']+"</a></td>";
|
||||||
|
|
||||||
newCell = newRow.insertCell(2);
|
newCell = newRow.insertCell(2);
|
||||||
newCell.innerHTML = "<td><div style=\"color:"+text_color+";\">"+crawler['status_info']+"</div></td>";
|
newCell.innerHTML = "<td><div style=\"color:"+text_color+";\">"+crawler['status_info']+"</div></td>";
|
||||||
|
|
Loading…
Reference in a new issue