2016-06-30 12:38:28 +00:00
|
|
|
#!/usr/bin/env python2
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
"""
|
|
|
|
Template for new modules
|
|
|
|
"""
|
|
|
|
|
|
|
|
import time
|
2016-07-01 14:59:08 +00:00
|
|
|
import datetime
|
2016-06-30 12:38:28 +00:00
|
|
|
import redis
|
|
|
|
import os
|
2016-07-01 14:59:08 +00:00
|
|
|
from packages import lib_words
|
2016-07-05 14:53:03 +00:00
|
|
|
from packages.Date import Date
|
2016-06-30 12:38:28 +00:00
|
|
|
from pubsublogger import publisher
|
|
|
|
from Helper import Process
|
2016-07-01 14:59:08 +00:00
|
|
|
from pyfaup.faup import Faup
|
2016-06-30 12:38:28 +00:00
|
|
|
|
2016-07-05 14:53:03 +00:00
|
|
|
# Config Var
# Tuning knobs for the "top progression" detection in compute_progression().
threshold_total_sum = 200 # Above this value, a keyword is eligible for a progression
threshold_increase = 1.0 # The percentage representing the keyword occurence since num_day_to_look
max_set_cardinality = 10 # The cardinality of the progression set
num_day_to_look = 5 # the detection of the progression start num_day_to_look in the past
|
|
|
def analyse(server, field_name, date, url_parsed):
    """Increment the per-day counter of one component of a parsed URL.

    :param server: redis connection holding the trending counters
    :param field_name: key of ``url_parsed`` to analyse ('scheme', 'tld', 'domain', ...)
    :param date: day string (YYYYMMDD) used as the hash field
    :param url_parsed: dict-like result of ``faup.get()``
    """
    field = url_parsed[field_name]
    if field is not None:
        # One redis hash per observed value, one field per day.
        server.hincrby(field, date, 1)
        if field_name == "domain": #save domain in a set for the monthly plot
            domain_set_name = "domain_set_" + date[0:6]
            server.sadd(domain_set_name, field)
            # Parenthesised single-expression print: identical output under
            # Python 2, and also valid syntax under Python 3.
            print("added in " + domain_set_name + ": " + field)
|
def get_date_range(num_day):
    """Return today and the ``num_day`` previous days as YYYYMMDD strings.

    The list starts with today and walks backwards, one entry per day
    (``num_day + 1`` entries in total).
    """
    today = datetime.date.today()
    start = Date('{}{:02d}{:02d}'.format(today.year, today.month, today.day))
    return [start.substract_day(offset) for offset in range(num_day + 1)]
|
def compute_progression(server, field_name, num_day, url_parsed):
    """Maintain the top-N fastest-growing values of one URL field.

    The top-N is stored as a redis set (membership) plus a redis hash
    (keyword -> progression score). A keyword becomes eligible when its
    total occurrences over the last ``num_day`` days exceed
    ``threshold_total_sum`` and its cumulated day-over-day ratio exceeds
    ``threshold_increase``.

    :param server: redis connection holding the per-day counters
    :param field_name: ``url_parsed`` key ('scheme', 'tld', 'domain', ...)
    :param num_day: how many past days to look at
    :param url_parsed: dict-like result of ``faup.get()``
    """
    redis_progression_name = 'top_progression_'+field_name
    redis_progression_name_set = 'top_progression_'+field_name+'_set'

    keyword = url_parsed[field_name]
    if keyword is None:
        return

    date_range = get_date_range(num_day)

    # check if this keyword is eligible for progression
    keyword_total_sum = 0
    value_list = []
    for date in date_range: # get value up to date_range
        curr_value = server.hget(keyword, date)
        value_list.append(int(curr_value if curr_value is not None else 0))
        keyword_total_sum += int(curr_value) if curr_value is not None else 0

    # The progression is based on the ratio: value[i] / value[i-1]
    # NOTE(review): the counters are ints, so under Python 2 this is floor
    # division -- confirm whether a float ratio was intended (threshold_increase
    # is a float).
    keyword_increase = 0
    value_list_reversed = value_list[:]
    value_list_reversed.reverse()
    for i in range(1, len(value_list_reversed)):
        divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1
        keyword_increase += value_list_reversed[i] / divisor

    # filter
    if (keyword_total_sum > threshold_total_sum) and (keyword_increase > threshold_increase):

        if server.sismember(redis_progression_name_set, keyword): #if keyword is in the set
            server.hset(redis_progression_name, keyword, keyword_increase) #update its value

        elif (server.scard(redis_progression_name_set) < max_set_cardinality):
            server.sadd(redis_progression_name_set, keyword)
            # BUGFIX: also record the score. Without this hset, the
            # int(server.hget(...)) below raises TypeError (int(None)) for
            # this member once the set reaches max_set_cardinality.
            server.hset(redis_progression_name, keyword, keyword_increase)

        else: #not in the set
            #Check value for all members
            member_set = []
            for keyw in server.smembers(redis_progression_name_set):
                member_set.append((keyw, int(server.hget(redis_progression_name, keyw))))
            member_set.sort(key=lambda tup: tup[1])
            if member_set[0][1] < keyword_increase:
                # Parenthesised single-string print: same output as the old
                # multi-argument print statement, valid on Python 2 and 3.
                print('removing ' + member_set[0][0] + '(' + str(member_set[0][1]) + ') and adding ' + keyword + ' ' + str(keyword_increase))
                #remove min from set and add the new one
                server.srem(redis_progression_name_set, member_set[0][0])
                server.sadd(redis_progression_name_set, keyword)
                server.hdel(redis_progression_name, member_set[0][0])
                server.hset(redis_progression_name, keyword, keyword_increase)
|
if __name__ == '__main__':
    # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'WebStats'

    # Setup the I/O queues
    p = Process(config_section)

    # Sent to the logging a description of the module
    publisher.info("Makes statistics about valid URL")

    # REDIS #
    # Trending database: holds the per-day counters, monthly domain sets and
    # top-progression structures written by analyse()/compute_progression().
    r_serv_trend = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_Trending", "host"),
        port=p.config.get("Redis_Level_DB_Trending", "port"),
        db=p.config.get("Redis_Level_DB_Trending", "db"))

    # FILE CURVE SECTION #
    # CSV output paths and word-list inputs for the trending graphs,
    # all resolved relative to the AIL_HOME environment variable.
    csv_path_proto = os.path.join(os.environ['AIL_HOME'],
                                  p.config.get("Directories", "protocolstrending_csv"))
    protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                                      p.config.get("Directories", "protocolsfile"))

    csv_path_tld = os.path.join(os.environ['AIL_HOME'],
                                p.config.get("Directories", "tldstrending_csv"))
    tldsfile_path = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "tldsfile"))

    csv_path_domain = os.path.join(os.environ['AIL_HOME'],
                                   p.config.get("Directories", "domainstrending_csv"))

    faup = Faup()
    # Only rebuild the graphs after at least one new message was processed.
    generate_new_graph = False
    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()

        if message is None:
            # Queue drained: regenerate the CSV graphs once, then sleep.
            if generate_new_graph:
                generate_new_graph = False
                today = datetime.date.today()
                year = today.year
                month = today.month

                print 'Building protocol graph'
                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_proto,
                                                      protocolsfile_path, year,
                                                      month)

                print 'Building tld graph'
                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_tld,
                                                      tldsfile_path, year,
                                                      month)

                print 'Building domain graph'
                lib_words.create_curve_from_redis_set(r_serv_trend, csv_path_domain,
                                                      "domain", year,
                                                      month)
                print 'end building'

            publisher.debug("{} queue is empty, waiting".format(config_section))
            print 'sleeping'
            time.sleep(5*60)
            continue

        else:
            generate_new_graph = True
            # Do something with the message from the queue
            # Message format: "<url> <date YYYYMMDD> <paste path>".
            url, date, path = message.split()
            faup.decode(url)
            url_parsed = faup.get()

            analyse(r_serv_trend, 'scheme', date, url_parsed) #Scheme analysis
            analyse(r_serv_trend, 'tld', date, url_parsed) #Tld analysis
            analyse(r_serv_trend, 'domain', date, url_parsed) #Domain analysis
            compute_progression(r_serv_trend, 'scheme', num_day_to_look, url_parsed)
            compute_progression(r_serv_trend, 'tld', num_day_to_look, url_parsed)
            compute_progression(r_serv_trend, 'domain', num_day_to_look, url_parsed)