ail-framework/bin/WebStats.py
Mokaddem a80f6ca0c2 Bug fix related with redis:
Fixed typo key in redis for module creditcard and sqlinjection
Modified Curve redisLvlDb server
Modified Url.py so that it forwards name of protocol from saved protocolsfile
Added Cache control in Flask
Modified key-tab name into keys-tab
2016-08-09 11:59:36 +02:00

180 lines
7.4 KiB
Python
Executable file

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
Template for new modules
"""
import time
import datetime
import redis
import os
from packages import lib_words
from packages.Date import Date
from pubsublogger import publisher
from Helper import Process
from pyfaup.faup import Faup
# Config Var
threshold_total_sum = 200 # Above this value, a keyword is eligible for a progression
threshold_increase = 1.0 # The percentage representing the keyword occurence since num_day_to_look
max_set_cardinality = 10 # The cardinality of the progression set
num_day_to_look = 5 # the detection of the progression start num_day_to_look in the past
def analyse(server, field_name, date, url_parsed):
field = url_parsed[field_name]
if field is not None:
prev_score = server.hget(field, date)
if prev_score is not None:
server.hset(field, date, int(prev_score) + 1)
else:
server.hset(field, date, 1)
if field_name == "domain": #save domain in a set for the monthly plot
domain_set_name = "domain_set_" + date[0:6]
server.sadd(domain_set_name, field)
print "added in " + domain_set_name +": "+ field
def get_date_range(num_day):
curr_date = datetime.date.today()
date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
date_list = []
for i in range(0, num_day+1):
date_list.append(date.substract_day(i))
return date_list
def compute_progression(server, field_name, num_day, url_parsed):
redis_progression_name = 'top_progression_'+field_name
redis_progression_name_set = 'top_progression_'+field_name+'_set'
keyword = url_parsed[field_name]
if keyword is not None:
date_range = get_date_range(num_day)
# check if this keyword is eligible for progression
keyword_total_sum = 0
value_list = []
for date in date_range: # get value up to date_range
curr_value = server.hget(keyword, date)
value_list.append(int(curr_value if curr_value is not None else 0))
keyword_total_sum += int(curr_value) if curr_value is not None else 0
oldest_value = value_list[-1] if value_list[-1] != 0 else 1 #Avoid zero division
# The progression is based on the ratio: value[i] / value[i-1]
keyword_increase = 0
value_list_reversed = value_list[:]
value_list_reversed.reverse()
for i in range(1, len(value_list_reversed)):
divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1
keyword_increase += value_list_reversed[i] / divisor
# filter
if (keyword_total_sum > threshold_total_sum) and (keyword_increase > threshold_increase):
if server.sismember(redis_progression_name_set, keyword): #if keyword is in the set
server.hset(redis_progression_name, keyword, keyword_increase) #update its value
elif (server.scard(redis_progression_name_set) < max_set_cardinality):
server.sadd(redis_progression_name_set, keyword)
else: #not in the set
#Check value for all members
member_set = []
for keyw in server.smembers(redis_progression_name_set):
member_set += (keyw, int(server.hget(redis_progression_name, keyw)))
member_set.sort(key=lambda tup: tup[1])
if member_set[0] < keyword_increase:
#remove min from set and add the new one
server.srem(redis_progression_name_set, member_set[0])
server.sadd(redis_progression_name_set, keyword)
if __name__ == '__main__':
# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
# Port of the redis instance used by pubsublogger
publisher.port = 6380
# Script is the default channel used for the modules.
publisher.channel = 'Script'
# Section name in bin/packages/modules.cfg
config_section = 'WebStats'
# Setup the I/O queues
p = Process(config_section)
# Sent to the logging a description of the module
publisher.info("Makes statistics about valid URL")
# REDIS #
r_serv1 = redis.StrictRedis(
host=p.config.get("Redis_Level_DB", "host"),
port=p.config.get("Redis_Level_DB", "port"),
db=p.config.get("Redis_Level_DB", "db"))
r_serv_trend = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_Trending", "host"),
port=p.config.get("Redis_Level_DB_Trending", "port"),
db=p.config.get("Redis_Level_DB_Trending", "db"))
# FILE CURVE SECTION #
csv_path_proto = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "protocolstrending_csv"))
protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "protocolsfile"))
csv_path_tld = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "tldstrending_csv"))
tldsfile_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "tldsfile"))
csv_path_domain = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "domainstrending_csv"))
faup = Faup()
generate_new_graph = False
# Endless loop getting messages from the input queue
while True:
# Get one message from the input queue
message = p.get_from_set()
if message is None:
if generate_new_graph:
generate_new_graph = False
today = datetime.date.today()
year = today.year
month = today.month
print 'Building protocol graph'
lib_words.create_curve_with_word_file(r_serv_trend, csv_path_proto,
protocolsfile_path, year,
month)
print 'Building tld graph'
lib_words.create_curve_with_word_file(r_serv_trend, csv_path_tld,
tldsfile_path, year,
month)
print 'Building domain graph'
lib_words.create_curve_from_redis_set(r_serv_trend, csv_path_domain,
"domain", year,
month)
print 'end building'
publisher.debug("{} queue is empty, waiting".format(config_section))
print 'sleeping'
time.sleep(5*60)
continue
else:
generate_new_graph = True
# Do something with the message from the queue
url, date, path = message.split()
faup.decode(url)
url_parsed = faup.get()
analyse(r_serv_trend, 'scheme', date, url_parsed) #Scheme analysis
analyse(r_serv_trend, 'tld', date, url_parsed) #Tld analysis
analyse(r_serv_trend, 'domain', date, url_parsed) #Domain analysis
compute_progression(r_serv_trend, 'scheme', num_day_to_look, url_parsed)
compute_progression(r_serv_trend, 'tld', num_day_to_look, url_parsed)
compute_progression(r_serv_trend, 'domain', num_day_to_look, url_parsed)