ail-framework/update/v3.4/Update_domain.py

121 lines
4.3 KiB
Python
Executable file

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import time
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib import ConfigLoader
from lib.objects.Items import Item
def get_domain_type(domain_name):
    """Classify a domain as 'onion' or 'regular'.

    The argument is converted with ``str()`` first; anything whose string
    form ends with '.onion' is 'onion', everything else is 'regular'.
    """
    is_onion = str(domain_name).endswith('.onion')
    return 'onion' if is_onion else 'regular'
def add_domain_language(domain_name, language):
    """Register ``language`` for ``domain_name`` in the onion Redis sets.

    Only the part of ``language`` before the first '-' is stored. Four sets
    are updated, in this order: the global language set, the per-domain-type
    language set, the set of domains for that type/language, and the set of
    languages for the domain.
    """
    lang_code = language.partition('-')[0]
    dom_type = get_domain_type(domain_name)
    updates = (
        ('all_domains_languages', lang_code),
        (f'all_domains_languages:{dom_type}', lang_code),
        (f'language:domains:{dom_type}:{lang_code}', domain_name),
        (f'domain:language:{domain_name}', lang_code),
    )
    for set_key, member in updates:
        r_serv_onion.sadd(set_key, member)
def add_domain_languages_by_item_id(domain_name, item_id):
    """Add every language reported for item ``item_id`` to ``domain_name``.

    Each element returned by ``Item.get_languages()`` contributes its
    ``.language`` attribute.
    """
    detected_languages = Item(item_id).get_languages()
    for detected in detected_languages:
        add_domain_language(domain_name, detected.language)
def update_update_stats():
    """Print the update progress and store it as a percentage.

    Reads the converted-element counter from 'update:nb_elem_converted',
    computes the percentage against the module-level ``nb_elem_to_update``
    and writes it to 'ail:current_background_script_stat'.
    """
    converted = int(r_serv_db.get('update:nb_elem_converted'))
    percent = int(converted * 100 / nb_elem_to_update)
    print(f'{converted}/{nb_elem_to_update} updated {percent}%')
    r_serv_db.set('ail:current_background_script_stat', percent)
def update_domain_language(domain_obj, item_id):
    """Record the languages of one item against its domain.

    Args:
        domain_obj: either the domain name itself (a ``str``, which is what
            this script's main loop passes) or an object exposing
            ``get_domain_name()``.
        item_id: id of the item whose languages are added to the domain.
    """
    # A plain string has no get_domain_name(); use it as the name directly
    # instead of failing with AttributeError.
    if isinstance(domain_obj, str):
        domain_name = domain_obj
    else:
        domain_name = domain_obj.get_domain_name()
    add_domain_languages_by_item_id(domain_name, item_id)
def get_domain_history(domain_type, domain_name):
    """Return the full port-80 crawl history of a domain.

    The whole sorted set is fetched with scores, so each entry carries the
    stored member together with its score.
    """
    history_key = f'crawler_history_{domain_type}:{domain_name}:80'
    return r_serv_onion.zrange(history_key, 0, -1, withscores=True)
def get_item_children(item_id):
    """Return the members of the 'paste_children' set stored for ``item_id``."""
    children_key = f'paste_children:{item_id}'
    return r_serv_metadata.smembers(children_key)
def get_domain_items(domain_name, root_item_id):
    """Return all in-domain descendants of ``root_item_id`` plus the root.

    The root item id is always the last element of the returned list.
    """
    descendants = get_domain_item_children(domain_name, root_item_id)
    descendants += [root_item_id]
    return descendants
def is_item_in_domain(domain_name, item_id):
    """Tell whether ``item_id`` embeds ``domain_name`` just before its tail.

    The id must be longer than ``len(domain_name) + 48`` characters, and the
    characters immediately preceding its final 36 must equal the domain name.
    (Presumably the 36-character tail is a UUID - confirm against the id
    format used by the crawler.)
    """
    name_len = len(domain_name)
    if len(item_id) <= name_len + 48:
        return False
    return item_id[-36 - name_len:-36] == domain_name
def get_domain_item_children(domain_name, root_item_id):
    """Collect, recursively, the children of an item that belong to a domain.

    A child that is not in the domain is skipped together with its whole
    subtree. Each accepted child is followed in the result by its own
    in-domain descendants.
    """
    collected = []
    for child_id in get_item_children(root_item_id):
        if not is_item_in_domain(domain_name, child_id):
            continue
        collected.append(child_id)
        collected += get_domain_item_children(domain_name, child_id)
    return collected
def get_domain_crawled_item_root(domain_name, domain_type, epoch):
    """Return the root item recorded in the crawl history at ``epoch``.

    Queries the domain's port-80 history sorted set for members whose score
    is exactly ``int(epoch)``.

    Args:
        domain_name: domain whose history is queried.
        domain_type: domain type used in the history key.
        epoch: score to look up; converted with ``int()``.

    Returns:
        ``{"root_item": <first match>, "epoch": <int>}``, or ``None`` when
        no history entry has that score.
    """
    epoch = int(epoch)
    history_key = f'crawler_history_{domain_type}:{domain_name}:80'
    res = r_serv_onion.zrevrangebyscore(history_key, epoch, epoch)
    if not res:
        # No entry at this score: return a falsy value (which callers can
        # test for) rather than raising IndexError on res[0].
        return None
    return {"root_item": res[0], "epoch": epoch}
def get_domain_items_crawled(domain_name, domain_type, epoch):
    """List the item ids crawled for a domain at a given history epoch.

    Returns an empty list when there is no root entry, when the root item is
    empty, or when the root item equals the epoch written as a string.
    """
    root = get_domain_crawled_item_root(domain_name, domain_type, epoch)
    if not root:
        return []
    root_item = root['root_item']
    if root_item == str(root['epoch']) or not root_item:
        return []
    return list(get_domain_items(domain_name, root_item))
if __name__ == '__main__':
    # Background update: for every domain queued in 'domain_update_v3.4',
    # walk its crawl history and register the languages of each crawled item,
    # publishing progress in 'ail:current_background_script_stat'.
    start_deb = time.time()  # start timestamp; not reported anywhere in this script

    config_loader = ConfigLoader.ConfigLoader()
    r_serv_db = config_loader.get_redis_conn("ARDB_DB")
    r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
    r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
    config_loader = None

    r_serv_db.set('ail:current_background_script', 'domain languages update')

    # Denominator of the progress percentage in update_update_stats().
    # A missing key and a stored zero both fall back to 1: the string '0' is
    # truthy, so a bare truthiness test would let 0 through and the progress
    # computation would raise ZeroDivisionError.
    nb_elem_to_update = int(r_serv_db.get('update:nb_elem_to_convert') or 0)
    if nb_elem_to_update <= 0:
        nb_elem_to_update = 1

    # _delete_all_domains_languages()

    while True:
        domain = r_serv_onion.spop('domain_update_v3.4')
        if domain is None:
            # Queue drained: report completion and stop.
            r_serv_db.set('ail:current_background_script_stat', 100)
            sys.exit(0)

        print(domain)
        domain = str(domain)
        domain_t = get_domain_type(domain)
        for domain_history in get_domain_history(domain_t, domain):
            # History entries are fetched with scores; index 1 is the score,
            # which is used as the crawl epoch.
            domain_items = get_domain_items_crawled(domain, domain_t, domain_history[1])
            for id_item in domain_items:
                # `domain` is a plain name here, so register the languages
                # directly rather than through update_domain_language(),
                # which calls get_domain_name() on its first argument.
                add_domain_languages_by_item_id(domain, id_item)

        r_serv_db.incr('update:nb_elem_converted')
        update_update_stats()