2016-06-30 12:38:28 +00:00
|
|
|
#!/usr/bin/env python2
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
"""
|
|
|
|
Template for new modules
|
|
|
|
"""
|
|
|
|
|
|
|
|
import time
|
2016-07-01 14:59:08 +00:00
|
|
|
import datetime
|
2016-06-30 12:38:28 +00:00
|
|
|
import redis
|
|
|
|
import os
|
2016-07-01 14:59:08 +00:00
|
|
|
from packages import lib_words
|
2016-07-05 14:53:03 +00:00
|
|
|
from packages.Date import Date
|
2016-06-30 12:38:28 +00:00
|
|
|
from pubsublogger import publisher
|
|
|
|
from Helper import Process
|
2016-07-01 14:59:08 +00:00
|
|
|
from pyfaup.faup import Faup
|
2016-06-30 12:38:28 +00:00
|
|
|
|
2016-07-05 14:53:03 +00:00
|
|
|
# Config Var
# Tuning knobs for the "top progression" detection in compute_progression().
threshold_total_sum = 200 # Above this value, a keyword is eligible for a progression
threshold_increase = 1.0 # The percentage representing the keyword occurence since num_day_to_look
max_set_cardinality = 10 # The cardinality of the progression set
num_day_to_look = 5 # the detection of the progression start num_day_to_look in the past
|
|
|
def analyse(server, field_name, date, url_parsed):
    """Increment the per-day counter of one component of a parsed URL.

    :param server: redis connection holding the trending counters
    :param field_name: key of ``url_parsed`` to analyse ('scheme', 'tld', 'domain', ...)
    :param date: day string (YYYYMMDD) used as the hash field
    :param url_parsed: dict-like result of ``faup.get()``
    """
    field = url_parsed[field_name]
    if field is not None:
        # One redis hash per observed value, one field per day.
        server.hincrby(field, date, 1)
        if field_name == "domain": #save domain in a set for the monthly plot
            domain_set_name = "domain_set_" + date[0:6]
            server.sadd(domain_set_name, field)
            # Parenthesised single-expression print: identical output under
            # Python 2, and also valid syntax under Python 3.
            print("added in " + domain_set_name + ": " + field)
|
def get_date_range(num_day):
    """Return today and the ``num_day`` previous days as YYYYMMDD strings.

    The list starts with today and walks backwards, one entry per day
    (``num_day + 1`` entries in total).
    """
    today = datetime.date.today()
    start = Date('{}{:02d}{:02d}'.format(today.year, today.month, today.day))
    return [start.substract_day(offset) for offset in range(num_day + 1)]
|
def compute_progression(server, field_name, num_day, url_parsed):
    """Maintain the top-N fastest-growing values of one URL field.

    The top-N is stored as a redis set (membership) plus a redis hash
    (keyword -> progression score). A keyword becomes eligible when its
    total occurrences over the last ``num_day`` days exceed
    ``threshold_total_sum`` and its cumulated day-over-day ratio exceeds
    ``threshold_increase``.

    :param server: redis connection holding the per-day counters
    :param field_name: ``url_parsed`` key ('scheme', 'tld', 'domain', ...)
    :param num_day: how many past days to look at
    :param url_parsed: dict-like result of ``faup.get()``
    """
    redis_progression_name = 'top_progression_'+field_name
    redis_progression_name_set = 'top_progression_'+field_name+'_set'

    keyword = url_parsed[field_name]
    if keyword is None:
        return

    date_range = get_date_range(num_day)

    # check if this keyword is eligible for progression
    keyword_total_sum = 0
    value_list = []
    for date in date_range: # get value up to date_range
        curr_value = server.hget(keyword, date)
        value_list.append(int(curr_value if curr_value is not None else 0))
        keyword_total_sum += int(curr_value) if curr_value is not None else 0

    # The progression is based on the ratio: value[i] / value[i-1]
    # NOTE(review): the counters are ints, so under Python 2 this is floor
    # division -- confirm whether a float ratio was intended (threshold_increase
    # is a float).
    keyword_increase = 0
    value_list_reversed = value_list[:]
    value_list_reversed.reverse()
    for i in range(1, len(value_list_reversed)):
        divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1
        keyword_increase += value_list_reversed[i] / divisor

    # filter
    if (keyword_total_sum > threshold_total_sum) and (keyword_increase > threshold_increase):

        if server.sismember(redis_progression_name_set, keyword): #if keyword is in the set
            server.hset(redis_progression_name, keyword, keyword_increase) #update its value

        elif (server.scard(redis_progression_name_set) < max_set_cardinality):
            server.sadd(redis_progression_name_set, keyword)
            # BUGFIX: also record the score. Without this hset, the
            # int(server.hget(...)) below raises TypeError (int(None)) for
            # this member once the set reaches max_set_cardinality.
            server.hset(redis_progression_name, keyword, keyword_increase)

        else: #not in the set
            #Check value for all members
            member_set = []
            for keyw in server.smembers(redis_progression_name_set):
                member_set.append((keyw, int(server.hget(redis_progression_name, keyw))))
            member_set.sort(key=lambda tup: tup[1])
            if member_set[0][1] < keyword_increase:
                # Parenthesised single-string print: same output as the old
                # multi-argument print statement, valid on Python 2 and 3.
                print('removing ' + member_set[0][0] + '(' + str(member_set[0][1]) + ') and adding ' + keyword + ' ' + str(keyword_increase))
                #remove min from set and add the new one
                server.srem(redis_progression_name_set, member_set[0][0])
                server.sadd(redis_progression_name_set, keyword)
                server.hdel(redis_progression_name, member_set[0][0])
                server.hset(redis_progression_name, keyword, keyword_increase)
|
if __name__ == '__main__':
    # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'WebStats'

    # Setup the I/O queues
    p = Process(config_section)

    # Sent to the logging a description of the module
    publisher.info("Makes statistics about valid URL")

    # REDIS #
    # Trending database: holds the per-day counters, monthly domain sets and
    # top-progression structures written by analyse()/compute_progression().
    r_serv_trend = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_Trending", "host"),
        port=p.config.get("Redis_Level_DB_Trending", "port"),
        db=p.config.get("Redis_Level_DB_Trending", "db"))

    # FILE CURVE SECTION #
    # CSV output paths and word-list inputs for the trending graphs,
    # all resolved relative to the AIL_HOME environment variable.
    csv_path_proto = os.path.join(os.environ['AIL_HOME'],
                                  p.config.get("Directories", "protocolstrending_csv"))
    protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                                      p.config.get("Directories", "protocolsfile"))

    csv_path_tld = os.path.join(os.environ['AIL_HOME'],
                                p.config.get("Directories", "tldstrending_csv"))
    tldsfile_path = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "tldsfile"))

    csv_path_domain = os.path.join(os.environ['AIL_HOME'],
                                   p.config.get("Directories", "domainstrending_csv"))

    faup = Faup()
    # Only rebuild the graphs after at least one new message was processed.
    generate_new_graph = False
    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()

        if message is None:
            # Queue drained: regenerate the CSV graphs once, then sleep.
            if generate_new_graph:
                generate_new_graph = False
                today = datetime.date.today()
                year = today.year
                month = today.month

                print 'Building protocol graph'
                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_proto,
                                                      protocolsfile_path, year,
                                                      month)

                print 'Building tld graph'
                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_tld,
                                                      tldsfile_path, year,
                                                      month)

                print 'Building domain graph'
                lib_words.create_curve_from_redis_set(r_serv_trend, csv_path_domain,
                                                      "domain", year,
                                                      month)
                print 'end building'

            publisher.debug("{} queue is empty, waiting".format(config_section))
            print 'sleeping'
            time.sleep(5*60)
            continue

        else:
            generate_new_graph = True
            # Do something with the message from the queue
            # Message format: "<url> <date YYYYMMDD> <paste path>".
            url, date, path = message.split()
            faup.decode(url)
            url_parsed = faup.get()

            analyse(r_serv_trend, 'scheme', date, url_parsed) #Scheme analysis
            analyse(r_serv_trend, 'tld', date, url_parsed) #Tld analysis
            analyse(r_serv_trend, 'domain', date, url_parsed) #Domain analysis
            compute_progression(r_serv_trend, 'scheme', num_day_to_look, url_parsed)
            compute_progression(r_serv_trend, 'tld', num_day_to_look, url_parsed)
            compute_progression(r_serv_trend, 'domain', num_day_to_look, url_parsed)