ail-framework/update/v5.9/reprocess_dom_hash.py
terrtia b988f46c90
Some checks are pending
CI / ail_test (3.10) (push) Waiting to run
CI / ail_test (3.7) (push) Waiting to run
CI / ail_test (3.8) (push) Waiting to run
CI / ail_test (3.9) (push) Waiting to run
chg: [dom-hash] add dom-hash object compute dom-hash for domains and crawled items
2024-10-17 12:14:48 +02:00

39 lines
1.2 KiB
Python
Executable file

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import gzip
import os
import sys
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib import ail_updates
from lib.objects import ail_objects
from lib.objects import DomHashs
from lib.objects.Domains import Domain
def main():
    """Run the v5.9 background update: build DOM-hash objects for crawled items.

    Walks every 'item' object whose source filter is 'crawled'. For each item
    that has non-empty content and whose domain exists, a DomHash object is
    created from the content, the item is added to it under the item's date,
    and the hash is correlated with the domain. Progress is reported through
    an ``AILBackgroundUpdate`` tracker, refreshed once every 100 items.
    """
    tracker = ail_updates.AILBackgroundUpdate('v5.9')

    # Register the amount of work up front so the tracker can report progress.
    total_items = ail_objects.card_obj_iterator('item', filters={'sources': ['crawled']})
    tracker.set_nb_to_update(total_items)

    crawled_items = ail_objects.obj_iterator('item', filters={'sources': ['crawled']})
    for processed, item in enumerate(crawled_items, start=1):
        domain = Domain(item.get_domain())
        content = item.get_content()

        if domain.exists() and content:
            item_date = item.get_date()
            # DOM-HASH
            dom_hash = DomHashs.create(content)
            dom_hash.add(item_date, item)
            dom_hash.add_correlation('domain', '', domain.id)
            print(domain.id, item.id, dom_hash.id)

        # Every visited item counts as handled, whether or not it was hashed.
        tracker.inc_nb_updated()

        # Refresh the reported progress only once per batch of 100 items.
        if processed % 100 == 0:
            tracker.update_progress()


if __name__ == '__main__':
    main()