2022-10-25 14:25:19 +00:00
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
"""
|
|
|
|
Base Class for AIL Objects
|
|
|
|
"""
|
|
|
|
|
|
|
|
##################################
|
|
|
|
# Import External packages
|
|
|
|
##################################
|
|
|
|
import os
|
2023-05-25 12:33:12 +00:00
|
|
|
import re
|
2022-10-25 14:25:19 +00:00
|
|
|
import sys
|
|
|
|
from abc import abstractmethod, ABC
|
|
|
|
|
2022-11-29 15:01:01 +00:00
|
|
|
# from flask import url_for
|
2022-10-25 14:25:19 +00:00
|
|
|
|
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
|
|
|
from lib.objects.abstract_object import AbstractObject
|
|
|
|
from lib.ConfigLoader import ConfigLoader
|
|
|
|
from lib.item_basic import is_crawled, get_item_domain
|
2022-12-19 15:38:20 +00:00
|
|
|
from lib.data_retention_engine import update_obj_date
|
2022-10-25 14:25:19 +00:00
|
|
|
|
|
|
|
from packages import Date
|
|
|
|
|
|
|
|
# LOAD CONFIG
# Open the "Kvrocks_Objects" database connection shared by every class below.
# The loader is only needed for this one call, so no reference to it is kept;
# the module-level name `config_loader` is still left bound to None.
r_object = ConfigLoader().get_db_conn("Kvrocks_Objects")
config_loader = None
|
|
|
|
|
|
|
|
class AbstractDaterangeObject(AbstractObject, ABC):
    """
    Abstract Daterange Object

    Base class for AIL objects that keep a first_seen / last_seen date range
    and a per-day counter. In the objects database:

    - ``meta:<type>:<id>``    hash holding the ``first_seen`` / ``last_seen`` fields
    - ``<type>:date:<date>``  sorted set, score = number of times seen that day
    - ``<type>:all``          set of every known object ID of this type
    """

    def __init__(self, obj_type, id):
        """ Abstract for all the AIL object

        :param obj_type: object type (item, ...)
        :param id: Object ID
        """
        super().__init__(obj_type, id)

    def exists(self):
        """Return the result of EXISTS on this object's meta hash key."""
        return r_object.exists(f'meta:{self.type}:{self.id}')

    def _get_field(self, field):
        """Read one field of this object's meta hash.

        :param field: hash field name
        :return: the stored value as returned by the database client
        """
        return r_object.hget(f'meta:{self.type}:{self.id}', field)

    def _set_field(self, field, value):
        """Write one field of this object's meta hash.

        :param field: hash field name
        :param value: value to store
        :return: the result of the HSET call
        """
        return r_object.hset(f'meta:{self.type}:{self.id}', field, value)

    def get_first_seen(self, r_int=False):
        """Return the stored ``first_seen`` date.

        :param r_int: when True, return an int; a missing value is then
            reported as 99999999 so that any real date compares lower
        :return: raw stored value, or an int when ``r_int`` is set
        """
        first_seen = self._get_field('first_seen')
        if not r_int:
            return first_seen
        return int(first_seen) if first_seen else 99999999

    def get_last_seen(self, r_int=False):
        """Return the stored ``last_seen`` date.

        :param r_int: when True, return an int; a missing value is then
            reported as 0 so that any real date compares higher
        :return: raw stored value, or an int when ``r_int`` is set
        """
        last_seen = self._get_field('last_seen')
        if not r_int:
            return last_seen
        return int(last_seen) if last_seen else 0

    def get_nb_seen(self):
        """Return the number of 'item' correlations of this object."""
        return self.get_nb_correlation('item')

    def get_nb_seen_by_date(self, date):
        """Return how many times this object was counted on ``date``.

        :param date: day used in the ``<type>:date:<date>`` key
        :return: the object's score in that sorted set as an int, 0 if absent
        """
        nb = r_object.zscore(f'{self.type}:date:{date}', self.id)
        return 0 if nb is None else int(nb)

    def _get_meta(self, options=None):
        """Build the base metadata dict shared by daterange objects.

        :param options: optional container of extra fields to include;
            'sparkline' adds the recent per-day counters
        :return: dict with 'first_seen', 'last_seen', 'nb_seen' and, on
            request, 'sparkline'
        """
        # None instead of a mutable default: a shared list would be visible
        # to every call that relies on the default.
        if options is None:
            options = ()
        meta_dict = {
            'first_seen': self.get_first_seen(),
            'last_seen': self.get_last_seen(),
            'nb_seen': self.get_nb_seen(),
        }
        if 'sparkline' in options:
            meta_dict['sparkline'] = self.get_sparkline()
        return meta_dict

    def set_first_seen(self, first_seen):
        """Store ``first_seen`` in the meta hash."""
        self._set_field('first_seen', first_seen)

    def set_last_seen(self, last_seen):
        """Store ``last_seen`` in the meta hash."""
        self._set_field('last_seen', last_seen)

    def update_daterange(self, date):
        """Widen the stored first_seen / last_seen range to include ``date``.

        :param date: date convertible to int (presumably YYYYMMDD - confirm
            against callers)
        """
        date = int(date)
        # The object does not exist yet: the range is this single date.
        if not self.exists():
            self.set_first_seen(date)
            self.set_last_seen(date)
            return
        # The r_int sentinels (99999999 / 0) make a missing bound always lose
        # the comparison, so it gets initialised here as well.
        if date < self.get_first_seen(r_int=True):
            self.set_first_seen(date)
        if date > self.get_last_seen(r_int=True):
            self.set_last_seen(date)

    def get_sparkline(self):
        """Return the per-day counters for the dates of
        ``Date.get_previous_date_list(6)``, in that order."""
        return [self.get_nb_seen_by_date(date) for date in Date.get_previous_date_list(6)]

    def get_content(self, r_type='str'):
        """Return the object content, which for this base class is its ID.

        :param r_type: 'str' for the ID itself, 'bytes' for the encoded ID
        :return: the ID as str or bytes, None for any other ``r_type``
        """
        if r_type == 'str':
            return self.id
        if r_type == 'bytes':
            return self.id.encode()
        # Unsupported r_type: keep the historical behaviour of returning None.
        return None

    def _add_create(self):
        """Register this object's ID in the ``<type>:all`` set."""
        r_object.sadd(f'{self.type}:all', self.id)

    # TODO don't increase nb if same hash in item with different encoding
    # if hash already in item
    def _add(self, date, item_id):
        """Record that this object was seen in ``item_id`` on ``date``.

        Updates the date range, the per-day counter and the correlations.

        :param date: day the item was seen
        :param item_id: ID of the item containing this object
        """
        if not self.exists():
            self._add_create()
            self.set_first_seen(date)
            self.set_last_seen(date)
        else:
            self.update_daterange(date)
        update_obj_date(date, self.type)

        # NB Object seen by day
        # Only count once per item: checked before the correlation is added.
        if not self.is_correlated('item', '', item_id):  # if decoded not already in object
            r_object.zincrby(f'{self.type}:date:{date}', 1, self.id)

        # Correlations
        self.add_correlation('item', '', item_id)
        if is_crawled(item_id):  # Domain
            domain = get_item_domain(item_id)
            self.add_correlation('domain', '', domain)

    # TODO:ADD objects + Stats
    def _create(self, first_seen=None, last_seen=None):
        """Create the object, optionally with an initial date range.

        :param first_seen: stored as first_seen when truthy
        :param last_seen: stored as last_seen when truthy
        """
        if first_seen:
            self.set_first_seen(first_seen)
        if last_seen:
            self.set_last_seen(last_seen)
        r_object.sadd(f'{self.type}:all', self.id)

    # TODO
    def _delete(self):
        """Not implemented yet: currently a no-op."""
        pass
|
2023-05-25 12:33:12 +00:00
|
|
|
|
|
|
|
|
|
|
|
class AbstractDaterangeObjects(ABC):
    """
    Abstract Daterange Objects

    Collection-level helpers (listing, per-date lookups, regex search) for
    one type of daterange object.
    """

    def __init__(self, obj_type, obj_class):
        """ Abstract for Daterange Objects

        :param obj_type: object type (item, ...)
        :param obj_class: object python class (Item, ...)
        """
        self.type = obj_type
        self.obj_class = obj_class

    def get_ids(self):
        """Return every member of the ``<type>:all`` set."""
        return r_object.smembers(f'{self.type}:all')

    # def get_ids_iterator(self):
    #     return r_object.sscan_iter(r_object, f'{self.type}:all')

    def get_by_date(self, date):
        """Return all members of the ``<type>:date:<date>`` sorted set."""
        return r_object.zrange(f'{self.type}:date:{date}', 0, -1)

    def get_nb_by_date(self, date):
        """Return the cardinality of the ``<type>:date:<date>`` sorted set."""
        return r_object.zcard(f'{self.type}:date:{date}')

    def get_by_daterange(self, date_from, date_to):
        """Return the set of object IDs seen on any date of the range.

        :param date_from: range start, passed to ``Date.substract_date``
        :param date_to: range end, passed to ``Date.substract_date``
        :return: set of object IDs
        """
        obj_ids = set()
        for date in Date.substract_date(date_from, date_to):
            # In-place update: avoids rebuilding the whole set for every day.
            obj_ids.update(self.get_by_date(date))
        return obj_ids

    def get_metas(self, obj_ids, options=None):
        """Return the metadata of several objects.

        :param obj_ids: iterable of object IDs
        :param options: options forwarded to each object's ``get_meta``;
            defaults to a new empty set
        :return: dict mapping each object ID to its ``get_meta`` result
        """
        # None instead of a mutable default: a shared set would leak any
        # mutation done by a get_meta implementation into later calls.
        if options is None:
            options = set()
        return {obj_id: self.obj_class(obj_id).get_meta(options=options) for obj_id in obj_ids}

    @abstractmethod
    def sanitize_id_to_search(self, id_to_search):
        """Turn a user-supplied ID search into the regex to run.

        Subclasses must override this; the default body returns the input
        unchanged.
        """
        return id_to_search

    def search_by_id(self, name_to_search, r_pos=False, case_sensitive=True):
        """Regex-search the object IDs.

        :param name_to_search: user-supplied search string
        :param r_pos: when True, also report the match span as
            'hl-start' / 'hl-end'
        :param case_sensitive: when False, match with ``re.IGNORECASE``
        :return: dict mapping each matching ID to a (possibly empty) dict
        """
        objs = {}
        flags = 0 if case_sensitive else re.IGNORECASE
        # for subtype in subtypes:
        r_name = self.sanitize_id_to_search(name_to_search)
        # Guard on the sanitized value as well, like search_by_content does:
        # a sanitizer returning None would otherwise make re.compile raise
        # TypeError. A dict result is treated as "nothing to search"
        # (presumably an error payload - confirm against the subclasses).
        if not name_to_search or not r_name or isinstance(r_name, dict):
            return objs
        r_name = re.compile(r_name, flags=flags)
        for obj_id in self.get_ids():  # TODO REPLACE ME WITH AN ITERATOR
            res = r_name.search(obj_id)
            if res:
                objs[obj_id] = {}
                if r_pos:
                    objs[obj_id]['hl-start'] = res.start()
                    objs[obj_id]['hl-end'] = res.end()
        return objs

    def sanitize_content_to_search(self, content_to_search):
        """Turn a user-supplied content search into the regex to run.

        The default returns the input unchanged.
        """
        return content_to_search

    def search_by_content(self, content_to_search, r_pos=False, case_sensitive=True):
        """Regex-search the content of every object.

        :param content_to_search: user-supplied search string
        :param r_pos: when True, also report the match span as
            'hl-start' / 'hl-end' together with the matched 'content'
        :param case_sensitive: when False, match with ``re.IGNORECASE``
        :return: dict mapping each matching ID to a (possibly empty) dict
        """
        objs = {}
        flags = 0 if case_sensitive else re.IGNORECASE
        # for subtype in subtypes:
        r_search = self.sanitize_content_to_search(content_to_search)
        if not r_search or isinstance(r_search, dict):
            return objs
        r_search = re.compile(r_search, flags=flags)
        for obj_id in self.get_ids():  # TODO REPLACE ME WITH AN ITERATOR
            content = self.obj_class(obj_id).get_content()
            res = r_search.search(content)
            if res:
                objs[obj_id] = {}
                if r_pos:  # TODO ADD CONTENT ????
                    objs[obj_id]['hl-start'] = res.start()
                    objs[obj_id]['hl-end'] = res.end()
                    objs[obj_id]['content'] = content
        return objs

    def api_get_chart_nb_by_daterange(self, date_from, date_to):
        """Return one chart point per day of the range.

        :return: list of dicts with a 'date' key (YYYY-MM-DD, built by
            slicing each YYYYMMDD date string) and a key named after the
            object type holding that day's number of objects
        """
        return [
            {'date': f'{date[0:4]}-{date[4:6]}-{date[6:8]}',
             self.type: self.get_nb_by_date(date)}
            for date in Date.substract_date(date_from, date_to)
        ]

    def api_get_meta_by_daterange(self, date_from, date_to):
        """Return the metadata, sparkline included, of every object seen in
        the range, after passing the bounds through
        ``Date.sanitise_date_range``."""
        date = Date.sanitise_date_range(date_from, date_to)
        obj_ids = self.get_by_daterange(date['date_from'], date['date_to'])
        return self.get_metas(obj_ids, options={'sparkline'})
|