2020-12-11 20:02:07 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
|
2022-11-23 10:02:33 +00:00
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
|
|
|
from lib import ConfigLoader
|
|
|
|
from lib.objects.Items import Item
|
|
|
|
|
|
|
|
def get_domain_type(domain_name):
|
|
|
|
if str(domain_name).endswith('.onion'):
|
|
|
|
return 'onion'
|
|
|
|
else:
|
|
|
|
return 'regular'
|
|
|
|
|
|
|
|
def add_domain_language(domain_name, language):
|
|
|
|
language = language.split('-')[0]
|
|
|
|
domain_type = get_domain_type(domain_name)
|
|
|
|
r_serv_onion.sadd('all_domains_languages', language)
|
|
|
|
r_serv_onion.sadd(f'all_domains_languages:{domain_type}', language)
|
|
|
|
r_serv_onion.sadd(f'language:domains:{domain_type}:{language}', domain_name)
|
|
|
|
r_serv_onion.sadd(f'domain:language:{domain_name}', language)
|
|
|
|
|
|
|
|
def add_domain_languages_by_item_id(domain_name, item_id):
|
|
|
|
item = Item(item_id)
|
|
|
|
for lang in item.get_languages():
|
|
|
|
add_domain_language(domain_name, lang.language)
|
2020-12-11 20:02:07 +00:00
|
|
|
|
|
|
|
def update_update_stats():
|
|
|
|
nb_updated = int(r_serv_db.get('update:nb_elem_converted'))
|
|
|
|
progress = int((nb_updated * 100) / nb_elem_to_update)
|
2022-11-23 10:02:33 +00:00
|
|
|
print(f'{nb_updated}/{nb_elem_to_update} updated {progress}%')
|
2020-12-11 20:02:07 +00:00
|
|
|
r_serv_db.set('ail:current_background_script_stat', progress)
|
|
|
|
|
|
|
|
def update_domain_language(domain_obj, item_id):
|
|
|
|
domain_name = domain_obj.get_domain_name()
|
2022-11-23 10:02:33 +00:00
|
|
|
add_domain_languages_by_item_id(domain_name, item_id)
|
|
|
|
|
|
|
|
def get_domain_history(domain_type, domain_name):
|
|
|
|
return r_serv_onion.zrange(f'crawler_history_{domain_type}:{domain_name}:80', 0, -1, withscores=True)
|
|
|
|
|
|
|
|
|
|
|
|
def get_item_children(item_id):
|
|
|
|
return r_serv_metadata.smembers(f'paste_children:{item_id}')
|
|
|
|
|
|
|
|
def get_domain_items(domain_name, root_item_id):
|
|
|
|
dom_item = get_domain_item_children(domain_name, root_item_id)
|
|
|
|
dom_item.append(root_item_id)
|
|
|
|
return dom_item
|
|
|
|
|
|
|
|
def is_item_in_domain(domain_name, item_id):
|
|
|
|
is_in_domain = False
|
|
|
|
domain_length = len(domain_name)
|
|
|
|
if len(item_id) > (domain_length+48):
|
|
|
|
if item_id[-36-domain_length:-36] == domain_name:
|
|
|
|
is_in_domain = True
|
|
|
|
return is_in_domain
|
|
|
|
|
|
|
|
def get_domain_item_children(domain_name, root_item_id):
|
|
|
|
all_items = []
|
|
|
|
for item_id in get_item_children(root_item_id):
|
|
|
|
if is_item_in_domain(domain_name, item_id):
|
|
|
|
all_items.append(item_id)
|
|
|
|
all_items.extend(get_domain_item_children(domain_name, item_id))
|
|
|
|
return all_items
|
|
|
|
|
|
|
|
def get_domain_crawled_item_root(domain_name, domain_type, epoch):
|
|
|
|
res = r_serv_onion.zrevrangebyscore(f'crawler_history_{domain_type}:{domain_name}:80', int(epoch), int(epoch))
|
|
|
|
return {"root_item": res[0], "epoch": int(epoch)}
|
|
|
|
|
|
|
|
def get_domain_items_crawled(domain_name, domain_type, epoch):
|
|
|
|
item_crawled = []
|
|
|
|
item_root = get_domain_crawled_item_root(domain_name, domain_type, epoch)
|
|
|
|
if item_root:
|
|
|
|
if item_root['root_item'] != str(item_root['epoch']) and item_root['root_item']:
|
|
|
|
for item_id in get_domain_items(domain_name, item_root['root_item']):
|
|
|
|
item_crawled.append(item_id)
|
|
|
|
return item_crawled
|
|
|
|
|
2020-12-11 20:02:07 +00:00
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
|
|
|
start_deb = time.time()
|
|
|
|
config_loader = ConfigLoader.ConfigLoader()
|
|
|
|
r_serv_db = config_loader.get_redis_conn("ARDB_DB")
|
|
|
|
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
|
2022-11-23 10:02:33 +00:00
|
|
|
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
|
2020-12-11 20:02:07 +00:00
|
|
|
config_loader = None
|
|
|
|
|
2021-06-08 14:46:36 +00:00
|
|
|
r_serv_db.set('ail:current_background_script', 'domain languages update')
|
2021-01-22 16:03:43 +00:00
|
|
|
|
2020-12-11 20:02:07 +00:00
|
|
|
nb_elem_to_update = r_serv_db.get('update:nb_elem_to_convert')
|
|
|
|
if not nb_elem_to_update:
|
|
|
|
nb_elem_to_update = 1
|
|
|
|
else:
|
|
|
|
nb_elem_to_update = int(nb_elem_to_update)
|
|
|
|
|
2022-11-23 10:02:33 +00:00
|
|
|
# _delete_all_domains_languages()
|
2020-12-11 20:02:07 +00:00
|
|
|
|
|
|
|
while True:
|
|
|
|
domain = r_serv_onion.spop('domain_update_v3.4')
|
|
|
|
if domain is not None:
|
|
|
|
print(domain)
|
2022-11-23 10:02:33 +00:00
|
|
|
domain = str(domain)
|
|
|
|
domain_t = get_domain_type(domain)
|
|
|
|
for domain_history in get_domain_history(domain_t, domain):
|
|
|
|
domain_items = get_domain_items_crawled(domain, domain_t, domain_history[1])
|
|
|
|
for id_item in domain_items:
|
|
|
|
update_domain_language(domain, id_item)
|
2020-12-11 20:02:07 +00:00
|
|
|
|
|
|
|
r_serv_db.incr('update:nb_elem_converted')
|
|
|
|
update_update_stats()
|
|
|
|
|
|
|
|
else:
|
2021-01-06 14:56:03 +00:00
|
|
|
r_serv_db.set('ail:current_background_script_stat', 100)
|
2020-12-11 20:02:07 +00:00
|
|
|
sys.exit(0)
|