maxi cleanup old code :'(

2024-11-10 08:38:28 +00:00 · 2014-08-14 11:48:46 +02:00 · 2014-08-14 11:48:46 +02:00 · 04a8f1bdf2
commit 04a8f1bdf2
parent 7a3c216787
22 changed files with 14 additions and 2078 deletions
--- a/bin/tests/indexer_lookup.py
+++ b/bin/tests/indexer_lookup.py
--- a/bin/packages/lib_gephi.py
+++ b/bin/packages/lib_gephi.py
@ -1,64 +0,0 @@
-import networkx as nx
-import xml.sax.saxutils as xlm
-import redis
-
-def Gephi_Graph(r_serv, graphpath, mincard, maxcard, insert_type):
-    """Create Gephi Graph by calling a "Sub function": Create_Graph
-
-    :param r_serv: -- connexion to redis database
-    :param graphpath: -- the absolute path of the .gephi graph created.
-    :param mincard: -- the minimum links between 2 nodes to be created
-    :param maxcard: -- the maximum links between 2 nodes to be created
-    :param insert_type: -- the type of datastructure used to create the graph.
-
-    In fact this function is juste here to be able to choose between two kind of
-    Redis database structure: One which is a Sorted set and the other a simple
-    set.
-
-    """
-    g = nx.Graph()
-
-    if (insert_type == 0):
-
-        for h in r_serv.smembers("hash"):
-            Create_Graph(r_serv, g, h, graphpath, mincard, maxcard)
-
-    elif (insert_type == 2):
-
-        for h in r_serv.zrange("hash", 0, -1):
-            Create_Graph(r_serv, g, h, graphpath, mincard, maxcard)
-
-    nx.write_gexf(g,graphpath)
-    print nx.info(g)
-
-
-
-
-def Create_Graph(r_serv, graph, h, graphpath, mincard, maxcard):
-    """Create Gephi Graph.
-
-    :param r_serv: -- connexion to redis database
-    :param graph: -- networkx graph object
-    :param h: -- (str) the hash which will be transform into a node.
-    :param graphpath: -- the absolute path of the .gephi graph created.
-    :param mincard: -- the minimum links between 2 nodes to be created
-    :param maxcard: -- the maximum links between 2 nodes to be created
-
-    This function link all the pastes with theirs own hashed lines.
-    Of course a paste can have multiple hashed lines and an hashed line can be
-    contained in multiple paste.
-    In this case it's a common hash.
-
-    """
-    if (r_serv.scard(h) >= mincard) and (r_serv.scard(h) <= maxcard):
-
-                for filename in r_serv.smembers(h):
-
-                    for line in r_serv.smembers(filename):
-
-                        line = line.decode('UTF-8', errors='ignore')
-                        line = xlm.quoteattr(line, {'"':'&quot;', "'":"&apos;"})
-
-                        graph.add_edge(h, line+" -- "+filename)
-
-#OK
--- a/bin/packages/lib_jobs.py
+++ b/bin/packages/lib_jobs.py
@ -1,151 +0,0 @@
-import redis, time, sys, os, inspect
-
-from datetime import timedelta, date, datetime
-
-from pubsublogger import publisher
-
-def set_listof_pid(r_serv, filename, name):
-    """Create the pid list and it's pid members
-
-    :param r_serv: -- Connexion to redis.
-    :param filename: -- the absolute pastes path name.
-    :param name: -- the traditionnal argv[0] (The name of the launched script)
-
-    This function create a hashes in redis as follows and a set of pid.
-
-    +------------+------------+---------------------+
-    |     Keys   | Fields     | Values              |
-    +============+============+=====================+
-    | 2045       | startime   | 2014-05-09_11:44:17 |
-    +------------+------------+---------------------+
-    | ...        | prog       | ./programme         |
-    +------------+------------+---------------------+
-    | ...        | pid        | 2045                |
-    +------------+------------+---------------------+
-    | ...        | paste      | /home/folder/aux.gz |
-    +------------+------------+---------------------+
-    | ...        | kb         | 54.12               |
-    +------------+------------+---------------------+
-
-    +------------+------------+
-    |     Keys   | Members    |
-    +============+============+
-    | pid        | 2045       |
-    +------------+------------+
-    | ...        | 2480       |
-    +------------+------------+
-
-    """
-    r_serv.sadd("pid", os.getpid())
-    r_serv.hmset(os.getpid(),
-    {
-    "startime":time.strftime("%Y-%m-%d_%H:%M:%S"),
-    "prog":name,
-    "pid":str(os.getpid()),
-    "paste":filename,
-    "Kb":round(os.path.getsize(filename)/1024.0,2)
-    })
-
-
-
-
-def update_listof_pid(r_serv):
-    """Remove pid from the pid list
-
-    :param r_serv: -- Connexion to redis.
-
-    Remove from the list and redis, pid which are terminated.
-
-    """
-    r_serv.srem("pid", os.getpid())
-    r_serv.delete(os.getpid())
-
-
-
-
-def flush_list_of_pid(r_serv):
-    """Flush the datas in redis
-
-    :param r_serv: -- Connexion to redis.
-
-    Clean the redis database from the previous pid and pidlist inserted
-
-    """
-    for x in r_serv.smembers("pid"):
-        r_serv.delete(x)
-
-    r_serv.delete("pid")
-
-
-
-
-def format_display_listof_pid(dico, arg):
-    """Formating data for shell and human
-
-    :param dico: (dict) dictionnary
-    :param arg: (str) Choosing argument
-
-    :returns: (str)
-
-    This function provide different displaying formats for the dictionnary's data.
-
-    """
-    if arg == 'pid':
-        var = "{0}".format(dico['pid'])
-    elif arg == 'up':
-        var = "{0}".format(dico['uptime'])
-    elif arg == 'kb':
-        var = "{0}".format(dico['Kb'])
-    elif arg == 'paste':
-        var = "{0}".format(dico['paste'])
-    elif arg == 'startime':
-        var = "{0}".format(dico['startime'])
-    elif arg == 'prg':
-        var = "{0}".format(dico['prog'])
-    else:
-        var = "PID:{0},uptime:{1},kb:{2},paste:{3},prog:{4},startime:{5}".format(dico['pid'],
-        dico['uptime'],
-        dico['Kb'],
-        dico['paste'],
-        dico['prog'],
-        dico['startime'])
-
-    return var
-
-
-
-
-def display_listof_pid(r_serv, arg):
-    """Display the pid list from redis
-
-    This function display infos in the shell about lauched process
-
-    """
-    jobs = {}
-    joblist = []
-    try:
-        for job in r_serv.smembers("pid"):
-            jobs = r_serv.hgetall(job)
-
-            if jobs != None:
-                start = datetime.strptime(r_serv.hget(job, "startime"), "%Y-%m-%d_%H:%M:%S")
-
-                end = datetime.strptime(time.strftime("%Y-%m-%d_%H:%M:%S"), "%Y-%m-%d_%H:%M:%S")
-                jobs['uptime'] = str(abs(start - end))
-                joblist.append(jobs)
-            else:
-                publisher.debug("display_list_of_pid Aborted due to lack of Information in Redis")
-
-        joblist = sorted(joblist, key=lambda k: k['uptime'], reverse=True)
-
-        for job in joblist:
-            print format_display_listof_pid(job, arg)
-
-        if arg == "remain":
-            print "Remaining: {0}".format(r_serv.llen("filelist"))
-
-        if arg == "processed":
-            print "processed: {0}".format(r_serv.llen("processed"))
-
-    except TypeError:
-        publisher.error("TypeError for display_listof_pid")
--- a/bin/packages/lib_redis_insert.py
+++ b/bin/packages/lib_redis_insert.py
@ -1,203 +0,0 @@
-import sys, hashlib, os, os.path, gzip, string, glob, itertools, copy, shutil
-import redis, crcmod, mmh3, time, fileinput
-import crcmod, mmh3
-
-from operator import itemgetter, attrgetter
-from pubsublogger import publisher
-
-
-
-
-def listdirectory(path):
-    """Path Traversing Function.
-
-    :param path: -- The absolute pathname to a directory.
-
-    This function is returning all the absolute path of the files contained in
-    the argument directory.
-
-    """
-    fichier=[]
-    for root, dirs, files in os.walk(path):
-
-        for i in files:
-
-            fichier.append(os.path.join(root, i))
-
-    return fichier
-
-
-
-
-clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
-"""It filters out non-printable characters from the string it receives."""
-
-
-
-def select_hash(hashkind, line):
-    """Select the kind of hashing for the line.
-
-    :param hashkind: -- (str) The name of the hash
-    :param line: -- (str) The string to hash.
-
-    This function is a kind of hash selector which will use the hash passed
-    in argument to hash the string also passed in argument.
-
-    """
-    if hashkind == "md5":
-        hashline = hashlib.md5(line).hexdigest()
-
-    elif hashkind == "sha1":
-        hashline = hashlib.sha1(line).hexdigest()
-
-    elif hashkind == "crc":
-        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
-        crc32.update(line)
-        hashline = crc32.hexdigest()
-
-    elif hashkind == "murmur":
-        hashline = mmh3.hash(line)
-
-    return str(hashline)
-
-
-
-
-def redis_populate(pipe, folder, minline, hashkind, jmp, insert_type):
-    """Call another function with different "mode"
-
-    :param pipe: -- Redis pipe
-    :param folder: -- the absolute path name to the folder where to process
-    :param minline: -- the minimum lenght of line to hash
-    :param hashkind: -- the hash to use
-    :param jmp: -- (bool) trigger the jumping line mode or not
-     :param insert_type: -- which kind of datastructure to create in redis.
-
-     This Function actually call the function "insert_redis" with differents
-     method to process it.
-     In one way, x lines are jumped before the Insertion.
-     In another, all the line are hashed and inserted in redis.
-
-    """
-    for filename in folder:
-
-        with gzip.open(filename, 'rb') as F:
-            start_line = 1
-
-            for num, line in enumerate(F, start_line):
-
-                if jmp != 1:
-
-                    if (num % jmp) == 1 :
-                        insert_redis(filename,
-                            line,
-                            pipe,
-                            minline,
-                            hashkind,
-                            num,
-                            insert_type)
-
-                else:
-                    insert_redis(filename,
-                        line,
-                        pipe,
-                        minline,
-                        hashkind,
-                        num,
-                        insert_type)
-
-            pipe.execute()
-
-
-
-
-def insert_redis(filename, line, pipe, minline, hashkind, num, insert_type):
-    """Insert hashed line in redis.
-
-    :param filename: -- the absolute path name to the folder where to process
-    :param line: -- the clear line which will be hashed.
-    :param pipe: -- Redis pipe
-    :param minline: -- the minimum lenght of line to hash
-    :param hashkind: -- the hash to use
-    :param num: -- (int) the first line of the file (better human read)
-    :param insert_type: -- (int) Choose the datastructure used in redis.
-
-    This function insert hashed lines in the selected redis datastructure
-    The datastructure is represented as follow:
-
-    case one: ALLIN
-    "hash"[hashedline][occurence] => to index all different hashs + scoring
-    "hashedline"[filename.gz] => to associate the file.gz to his hashedline
-    "L:hashedline"[clearline] => for the correspondance
-
-    case two: SORTED SET (for the ./top.py script)
-    "hash"[hashedline][occurence] => to index all different hashs + scoring
-    "hashedline"[filename.gz] => to associate the file.gz to his hashedline
-
-    case tree: BASIC SET (for ./Graph.py)
-    "hash"[hashedline] to index all different hashs (without scores)
-    "hashedline"[filename.gz] => to associate the file.gz to his hashedline
-    "filename.gz"[firstline] => for human reading
-
-    """
-    if (insert_type == 2): # ALLIN
-        if len(line) >= minline:
-
-            pipe.zincrby("hash", select_hash(hashkind, line), 1)
-            pipe.sadd(select_hash(hashkind,line), filename.split('/',20)[-1])
-            pipe.sadd("L:"+select_hash(hashkind, line), clean(line))
-
-            if (num == 1):
-
-                pipe.sadd(filename.split('/',20)[-1], clean(line[0:80]))
-
-
-    elif (insert_type == 1): # SORTED SET FOR TOP100.py
-
-        if len(line) >= minline:
-
-            pipe.zincrby("hash", select_hash(hashkind, line), 1)
-            pipe.sadd(select_hash(hashkind, line), clean(line))
-
-
-    elif (insert_type == 0): # SET FOR THE GRAPH
-
-        if len(line) >= minline:
-
-            pipe.sadd("hash", select_hash(hashkind, line))
-            pipe.sadd(select_hash(hashkind,line), filename.split('/',20)[-1])
-
-            if (num == 1):
-
-                pipe.sadd(filename.split('/',20)[-1], clean(line[0:80]))
-
-
-
-
-def remove_pure_doppelganger(r_serv, nb):
-    """Remove identic paste
-
-    :param r_serv: -- Redis connexion database
-    :param nb: -- (int) Number of execution wanted
-
-    Add to a temporary list the hash of wholes files and compare the new hash
-    to the element of this list. If the hash is already inside, the file
-    is deleted otherwise the hash is added in the list.
-
-    """
-    hashlist = []
-    for x in xrange(0,nb):
-        filename = r_serv.lpop("filelist")
-
-        with open(filename, 'rb') as L:
-            hashline = hashlib.md5(L.read()).hexdigest()
-
-            print len(hashlist)
-
-            if hashline in hashlist:
-
-                os.remove(filename)
-                publisher.debug("{0} removed".format(filename))
-                print "{0} removed".format(filename)
-            else:
-                hashlist.append(hashline)
--- a/bin/packages/lib_refine.py
+++ b/bin/packages/lib_refine.py
@ -15,32 +15,6 @@ from datetime import date, timedelta
 from dateutil.rrule import rrule, DAILY


-
-def create_graph_by_day_datastruct(r_serv, r_key, year, month):
-    """Creating a datastructure in redis.
-
-    :param r_serv: -- Redis connexion database
-    :param r_key: -- (str) The name of the key read in redis (often the name of
-    the keywords category list)
-    :param year: -- (integer) The year to process
-    :param month: -- (integer) The month to process
-
-
-    """
-    a = date(year, month, 01)
-    b = date(year, month, cal.monthrange(year, month)[1])
-
-    for dt in rrule(DAILY, dtstart = a, until = b):
-        r_serv.zadd(r_key+'_by_day',0,dt.strftime("%Y%m%d"))
-
-    for Tfilename in r_serv.zrange(r_key+'_occur', 0, -1, withscores = True):
-        r_serv.zincrby(r_key+'_by_day',
-        Tfilename[0][-22:-12].replace('/',''),
-        Tfilename[1])
-
-
-
-
 def is_luhn_valid(card_number):
    """Apply the Luhn algorithm to validate credit card.

@ -156,155 +130,3 @@ def checking_A_record(r_serv, domains_set):

    publisher.debug("URLs before: {0} after: {1} (valid)".format(num, score))
    return (num, WalidA)
-
-
-
-
-def refining_regex_dataset(r_serv, r_key, regex, min_match, year, month, luhn = True, dnscheck = True):
-    """Refine the "raw dataset" of paste with regulars expressions
-
-    :param r_serv: -- Redis connexion database
-    :param r_key: -- (str) The name of the key read in redis (often the name of
-        the keywords category list)
-    :param min_match: -- (int) Below this number file are deleted
-    :param regex: -- Regular expression which will be match.
-
-    This function Refine database created with classify_token_paste function.
-    It opening again the files which matchs the keywords category list, found
-    regular expression inside it and count how many time is found.
-
-    If there is not too much match about the regular expression the file is
-    deleted from the list.
-
-    Than it finally merge the result by day to be able to create a bar graph
-    which will represent how many occurence by day the regex match.
-
-    """
-    for filename in r_serv.zrange(r_key, 0, -1):
-
-        with gzip.open(filename, 'rb') as F:
-            var = 0
-            matchs = set([])
-
-            for num, kword in enumerate(F):
-
-                match = re.findall(regex, kword)
-                var += len(match)
-
-                for y in match:
-                    if y != '' and len(y) < 100:
-                        matchs.add(y)
-            # If there is less match than min_match delete it (False pos)
-            if len(matchs) <= min_match :
-                r_serv.zrem(r_key, filename)
-                publisher.debug("{0} deleted".format(filename))
-            else:
-            # else changing the score.
-                if r_key == "creditcard_categ" and luhn:
-                    for card_number in matchs:
-                        if is_luhn_valid(card_number):
-
-                            r_serv.zincrby(r_key+'_occur', filename, 1)
-
-                            publisher.info("{1} is valid in the file {0}".format(filename, card_number))
-                        else:
-                            publisher.debug("{0} card is invalid".format(card_number))
-
-                if r_key == "mails_categ" and dnscheck:
-                    r_serv.zadd(r_key+'_occur', checking_MX_record(r_serv, matchs), filename)
-
-                else:
-                    # LUHN NOT TRIGGERED (Other Categs)
-                    r_serv.zadd(r_key+'_occur',
-                        len(matchs),
-                        filename)
-
-    create_graph_by_day_datastruct(r_serv, r_key, year, month)
-
-
-
-
-def graph_categ_by_day(r_serv, filename, year, month, r_key):
-    """Create a bargraph representing regex matching by day
-
-    :param r_serv: -- Redis connexion database
-    :param filename: -- (str) The absolute path where to save the figure.png
-    :param r_key: -- (str) The name of the key read in redis (often the name of
-        the keywords category list)
-    :param year: -- (integer) The year to process
-    :param month: -- (integer) The month to process
-
-    This function display the amount of the category per day.
-
-    """
-    adate = []
-    categ_num = []
-    rcParams['figure.figsize'] = 15, 10
-
-    a = date(year, month, 01)
-    b = date(year, month, cal.monthrange(year, month)[1])
-
-    for dt in rrule(DAILY, dtstart = a, until = b):
-        adate.append(dt.strftime("%d"))
-        categ_num.append(r_serv.zscore(r_key+'_by_day',dt.strftime("%Y%m%d")))
-
-    n_groups = len(categ_num)
-    adress_scores = tuple(categ_num)
-
-    index = np.arange(n_groups)
-    bar_width = 0.5
-    opacity = 0.6
-
-    ladress = plt.bar(index, adress_scores, bar_width,
-                 alpha = opacity,
-                 color = 'b',
-                 label = r_key)
-
-
-    plt.plot(tuple(categ_num), 'r--')
-    #plt.yscale('log')
-    plt.xlabel('Days')
-    plt.ylabel('Amount')
-    plt.title('Occurence of '+r_key+' by day')
-    plt.xticks(index + bar_width/2 , tuple(adate))
-
-    plt.legend()
-    plt.grid()
-
-    plt.tight_layout()
-
-    plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b',
-        orientation='portrait', papertype=None, format="png",
-        transparent=False, bbox_inches=None, pad_inches=0.1,
-        frameon=True)
-
-    publisher.info(filename+".png"+" saved!")
-
-
-
-
-def create_tld_list(url = "https://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"):
-    """Recover a tld list from url.
-
-    :param url: -- The url of the tld list.
-    :return: -- list
-
-    This function recover from mozilla.org the list of the effective tld names,
-    Save it as a file, and return a list of all the tld.
-
-
-    """
-    domains = []
-    htmlSource = urllib.urlopen(url).read()
-    with open("ICCANdomain", 'wb') as F:
-        F.write(htmlSource)
-
-    with open("ICCANdomain", 'rb') as F:
-
-        for num, line in enumerate(F):
-            if re.match(r"^\/\/|\n", line) == None:
-                domains.append(re.sub(r'\*', '', line[:-1]))
-            else:
-                publisher.info("Comment line ignored.")
-
-    return domains
--- a/bin/packages/lib_search.py
+++ b/bin/packages/lib_search.py
@ -1,103 +0,0 @@
-import redis
-import string
-
-
-def create_common_hash_file(r_serv, zmin, zmax, filename):
-    """ Create a "top100".txt file.
-
-    :param r_serv: -- connexion to redis database
-    :param zmin: -- (int) Offset of the top list
-    :param zmax: -- (int) Number of element wanted to be in the top list.
-    :param filename: -- the pathname to the created file.
-
-    This Function create a ranking list between zmin and zman of the most common
-    hashs.
-    Line are written as follow in the file:
-    hash:[md5hash]:[cardinality]:[line]
-    All hashes represent a full line which mean it can be one char or more...
-
-    """
-    with open(filename, 'wb') as F:
-
-        for h, num in r_serv.zrevrangebyscore("hash", "+inf", "-inf", zmin, zmax, True):
-
-            F.write("hash:{0}:{1}:{2}\n".format(h, num, list(r_serv.smembers('L:'+h))))
-
-
-
-
-def paste_searching(r_serv, filename, pastename, mincard, maxcard):
-    """Search similar hashs from a given file.
-
-    :param r_serv: -- connexion to redis database
-    :param filename: -- the pathname to the created file.
-    :param pastename: -- the name of the paste used to search in redis database.
-    :param mincard: -- the minimum occurence needed of an hash to be taken in count.
-    :param maxcard: -- the maximum occurence needed of an hash to be taken in count.
-
-    This function return a text file which is a kind of synthesis about
-    where (in the others pastes) the hash of the given pastename have been found.
-
-    """
-    P = set([pastename])
-    tmp_h = str()
-    tmp_set = set([])
-
-    with open(filename, 'wb') as F:
-
-        F.write("Paste: {0}\nOptions used:\nMincard: {1}\nMaxcard: {2}\n\nContaining Following Hash:\n".format(pastename,mincard,maxcard))
-
-        for h in r_serv.smembers("hash"):
-
-            if (r_serv.smembers(h).intersection(P) and r_serv.scard(h) >= mincard and r_serv.scard(h) <= maxcard):
-
-                F.write(h+'\n')
-                tmp_set = tmp_set.union(r_serv.smembers(h).union(r_serv.smembers(tmp_h)))
-
-            tmp_h = h
-
-        F.write("\nSimilar Files:\n")
-
-        for n, s in enumerate(tmp_set):
-
-            F.write(str(n) + ': ' + s + '\n')
-
-
-
-
-def paste_searching2(r_serv, filename, pastename, mincard, maxcard):
-    """Search similar hashs from a given file.
-    (On another kind of redis data structure)
-
-    :param r_serv: -- connexion to redis database
-    :param filename: -- the pathname to the created file.
-    :param pastename: -- the name of the paste used to search in redis database.
-    :param mincard: -- the minimum occurence needed of an hash to be taken in count.
-    :param maxcard: -- the maximum occurence needed of an hash to be taken in count.
-
-    This function return a text file which is a kind of synthesis about
-    where (in the others pastes) the hash of the given pastename have been found.
-
-    """
-    P = set([pastename])
-    tmp_h = str()
-    tmp_set = set([])
-
-    with open(filename, 'wb') as F:
-
-        F.write("Paste: {0}\nOptions used:\nMincard: {1}\nMaxcard: {2}\n\n###Containing Following Hash:### ###Occur### ###### Corresponding Line ######\n".format(pastename,mincard,maxcard))
-
-        for h in r_serv.zrange("hash", 0, -1):
-
-            if (r_serv.smembers(h).intersection(P) and r_serv.scard(h) >= mincard and r_serv.scard(h) <= maxcard):
-
-                F.write(h + ' -- ' + str(r_serv.zscore("hash",h)) + ' -- ' + str(list(r_serv.smembers('L:' + h))) + '\n')
-                tmp_set = tmp_set.union(r_serv.smembers(h).union(r_serv.smembers(tmp_h)))
-
-            tmp_h = h
-
-        F.write("\nSimilar Files:\n")
-
-        for n, s in enumerate(tmp_set):
-
-            F.write(str(n) + ': ' + s + '\n')
--- a/bin/packages/lib_words.py
+++ b/bin/packages/lib_words.py
@ -19,316 +19,30 @@ from dateutil.rrule import rrule, DAILY

 from packages import *

-def redis_words_ranking(pipe, r_serv, nb, minlength, maxlength):
-    """Looping function

-    :param pipe: -- Redis pipe.
-    :param nb: -- (int) Number of pastes proceeded by function
-    :param minlength: -- (int) passed to the next function
-    :param maxlength: -- (int) passed to the next function
+def listdirectory(path):
+    """Path Traversing Function.
+
+    :param path: -- The absolute pathname to a directory.
+
+    This function is returning all the absolute path of the files contained in
+    the argument directory.

    """
-    try:
-        for n in xrange(0,nb):
+    fichier=[]
+    for root, dirs, files in os.walk(path):

-                path = r_serv.lpop("filelist")
+        for i in files:

-                if path != None:
-                    set_listof_pid(r_serv, path, sys.argv[0])
+            fichier.append(os.path.join(root, i))

-                    redis_zincr_words(pipe, path, minlength, maxlength)
+    return fichier

-                    update_listof_pid(r_serv)

-                    r_serv.lpush("processed",path)

-                    publisher.debug(path)
-                else:
-                    publisher.debug("Empty list")
-                    break
-    except (KeyboardInterrupt, SystemExit) as e:
-        flush_list_of_pid(r_serv)
-        publisher.debug("Pid list flushed")

-
-
-
-
-def redis_zincr_words(pipe, filename, minlength, maxlength):
-    """Create news sorted set in redis.
-
-    :param minlength: -- (int) Minimum words length inserted
-    :param maxlength: -- (int) Maximum words length inserted
-    :param filename: -- The absolute path to the file.gz to process.
-
-    Representation of the set in redis:
-
-    +------------+------------+-----------+
-    |     Keys   | Members    | Scores    |
-    +============+============+===========+
-    | 20131001   | word1      | 142       |
-    +------------+------------+-----------+
-    | ...        | word2      | 120       |
-    +------------+------------+-----------+
-    | 20131002   | ...        | ...       |
-    +------------+------------+-----------+
-
-    This function store all words between minlength and maxlength in redis.
-    Redis will count as well how much time each word will appear by day:
-    The cardinality.
-
-    """
-    tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', gaps = True, discard_empty = True)
-
-    with gzip.open(filename, 'rb') as F:
-
-        blob = TextBlob(clean(F.read()), tokenizer = tokenizer)
-
-        for word in blob.tokens:
-
-            if (len(word) >= minlength) and (len(word) <= maxlength):
-                pipe.zincrby(filename[-22:-12].replace('/',''), word, 1)
-
-            if (len(word) >= maxlength):
-                publisher.info("word bigger than {0} detected at {1}".format(maxlength, filename))
-                publisher.info(word)
-
-        pipe.execute()
-
-
-
-
-def classify_token_paste(r_serv, listname, choicedatastruct, nb, r_set):
-    """Tokenizing on word category
-
-    :param r_serv: -- Redis database connexion
-    :param listname: -- (str) path to the file containing the list of path of category files
-    :param choicedatastruct: -- (bool) Changing the index of datastructure
-    :param nb: -- (int) Number of pastes proceeded by function
-
-    Redis data structures cas be choose as follow:
-
-    +---------------+------------+-----------+
-    |     Keys      | Members    | Scores    |
-    +===============+============+===========+
-    | mails_categ   | filename   | 25000     |
-    +---------------+------------+-----------+
-    | ...           | filename2  | 2400      |
-    +---------------+------------+-----------+
-    | web_categ     | ...        | ...       |
-    +---------------+------------+-----------+
-
-    Or
-
-    +--------------+-------------+-----------+
-    |     Keys     | Members     | Scores    |
-    +==============+=============+===========+
-    | filename     | mails_categ | 100000    |
-    +--------------+-------------+-----------+
-    | ...          | web_categ   | 24050     |
-    +--------------+-------------+-----------+
-    | filename2    | ...         | ...       |
-    +--------------+-------------+-----------+
-
-    This function tokenise on all special characters like: @^\|[{#~}]!:;$^=
-    And insert data in redis if the token match the keywords in a list previously
-    created.
-    These lists of keywords can be list of everything you want but it's better
-    to create "category" of keywords.
-
-    """
-
-    try:
-        for n in xrange(0,nb):
-            filename = r_serv.lpop(r_set)
-
-            if filename != None:
-
-                tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', gaps = True, discard_empty = True)
-                set_listof_pid(r_serv, filename, sys.argv[0])
-
-                with open(listname, 'rb') as L:
-                    # for each "categ" listed in the file
-                    for num, fname in enumerate(L):
-                        # contain keywords by categ
-                        tmp_list = []
-                        #for each keywords
-                        with open(fname[:-1], 'rb') as LS:
-
-                            for num, kword in enumerate(LS):
-                                tmp_list.append(kword[:-1])
-
-                            # for each paste
-                            with gzip.open(filename, 'rb') as F:
-
-                                blob = TextBlob(clean(F.read()),
-                                tokenizer = tokenizer)
-
-                                # for each paste token
-                                for word in blob.tokens.lower():
-
-                                    if word in tmp_list:
-                                        # choosing between two data structures.
-                                        if choicedatastruct:
-                                            r_serv.zincrby(filename,
-                                                fname.split('/')[-1][:-1],
-                                                1)
-                                        else:
-                                            r_serv.zincrby(fname.split('/')[-1][:-1],
-                                            filename,
-                                            1)
-
-                update_listof_pid(r_serv)
-
-            else:
-                publisher.debug("Empty list")
-                #r_serv.save()
-                break
-
-    except (KeyboardInterrupt, SystemExit) as e:
-        flush_list_of_pid(r_serv)
-        publisher.debug("Pid list flushed")
-
-
-
-
-def dectect_longlines(r_serv, r_key, store = False, maxlength = 500):
-    """Store longlines's linenumbers in redis
-
-    :param r_serv: -- The redis connexion database
-    :param r_key: -- (str) The key name in redis
-    :param store: -- (bool) Store the line numbers or not.
-    :param maxlength: -- The limit between "short lines" and "long lines"
-
-    This function connect to a redis list of filename (pastes filename);
-    Open the paste and check inside if there is some line with their
-    length >= to maxlength.
-    If yes, the paste is "tagged" as containing a longlines in another
-    redis structures, and the linenumber (of the long lines) can be stored
-    in addition if the argument store is at True.
-
-    """
-    try:
-        while True:
-            #r_key_list (categ)
-            filename = r_serv.lpop(r_key)
-
-            if filename != None:
-
-                set_listof_pid(r_serv, filename, sys.argv[0])
-
-                # for each pastes
-                with gzip.open(filename, 'rb') as F:
-                    var = True
-                    for num, line in enumerate(F):
-
-                        if  len(line) >= maxlength:
-                            #publisher.debug("Longline:{0}".format(line))
-                            if var:
-                                r_serv.rpush("longlines", filename)
-                                var = False
-
-                            if store:
-                                r_serv.sadd(filename, num)
-                            else:
-                                publisher.debug("Line numbers of longlines not stored")
-
-                update_listof_pid(r_serv)
-            else:
-                publisher.debug("Empty list")
-                return False
-                break
-
-    except (KeyboardInterrupt, SystemExit) as e:
-        flush_list_of_pid(r_serv)
-        publisher.debug("Pid list flushed")
-
-
-
-
-# NOT USED RIGHT NOW #
-def recovering_longlines(r_serv):
-    """Get longlines with linenumbers
-
-    """
-    try:
-        for n in xrange(0,nb):
-            filename = r_serv.lpop("longlines")
-
-            if filename != None:
-                # For each values in redis (longline's line number)
-                for numline in r_serv.smembers(filename):
-
-                    with gzip.open(filename,'rb') as F:
-
-                        for num, line in enumerate(F):
-                            #When corresponding.
-                            if int(num) == int(numline):
-                                pass
-                                # TREATMENT
-            else:
-                publisher.debug("Empty list")
-                r_serv.save()
-                break
-
-    except (KeyboardInterrupt, SystemExit) as e:
-        flush_list_of_pid(r_serv)
-        publisher.debug("Pid list flushed")
-
-
-
-
-def remove_longline_from_categ(r_serv, r_key, delete, store, maxlength):
-    """Remove from a set, file with long lines.
-
-    :param r_serv: -- The redis connexion database
-    :param r_key: -- (str) The key name in redis
-    :param store: -- (bool) Store the line numbers or not.
-    :param delete: -- (bool) If true, delete the used key from redis.
-    :param maxlength: -- The limit between "short lines" and "long lines"
-
-    """
-    publisher.info("Number of file before:{0}".format(r_serv.zcard(r_key)))
-
-    #Create a list of file to proceed (1)
-    for filename in r_serv.zrange(r_key, 0, -1):
-        r_serv.rpush(r_key+"_list", filename)
-
-    #detecting longlines in pastes
-    dectect_longlines(r_serv, r_key+"_list", store, maxlength)
-
-    #remove false positive members
-    while True:
-        fp_filename = r_serv.lpop("longlines")
-
-        if fp_filename == None:
-            break
-
-        else:
-            # if wanted, delete in addition the set with linenumbers (created with store)
-            if delete:
-                r_serv.zrem(r_key, fp_filename)
-                r_serv.delete(fp_filename)
-
-            else:
-                #remove the file with longline from the r_key zset.
-                r_serv.zrem(r_key, fp_filename)
-
-    publisher.info("Longline file removed from {0}, {1} Files remaining".format(r_key, r_serv.zcard(r_key)))
-
-
-
-
-def detect_longline_from_list(r_serv, nb):
-    try:
-        for n in xrange(0,nb):
-
-                if not dectect_longlines(r_serv, "filelist", True):
-                    break
-
-    except (KeyboardInterrupt, SystemExit) as e:
-        flush_list_of_pid(r_serv)
-        publisher.debug("Pid list flushed")
+clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
+"""It filters out non-printable characters from the string it receives."""



@ -369,182 +83,6 @@ def create_dirfile(r_serv, directory, overwrite):



-def redis_interbargraph_set(r_serv, year, month, overwrite):
-    """Create a Redis sorted set.
-
-    :param r_serv: -- connexion to redis database
-    :param year: -- (integer) The year to process
-    :param month: -- (integer) The month to process
-    :param overwrite: -- (bool) trigger the overwrite mode
-
-    This function create inside redis the intersection of all days in
-    a month two by two.
-    Example:
-    For a month of 31days it will create 30 sorted set between day and
-    day+1 until the last day.
-    The overwrite mode delete the intersets and re-create them.
-
-    """
-    a = date(year, month, 01)
-    b = date(year, month, cal.monthrange(year, month)[1])
-
-    if overwrite:
-        r_serv.delete("InterSet")
-
-        for dt in rrule(DAILY, dtstart = a, until = b - timedelta(1)):
-            dayafter = dt+timedelta(1)
-
-            r_serv.delete(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")))
-
-            r_serv.zinterstore(
-                str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")),
-                {str(dt.strftime("%Y%m%d")):1,
-                str(dayafter.strftime("%Y%m%d")):-1})
-
-            r_serv.zadd(
-                "InterSet",
-                1,
-                str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")))
-    else:
-        for dt in rrule(DAILY, dtstart = a, until = b - timedelta(1)):
-            dayafter = dt+timedelta(1)
-
-            if r_serv.zcard(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))) == 0:
-
-                r_serv.zinterstore(
-                    str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")),
-                    {str(dt.strftime("%Y%m%d")):1,
-                    str(dayafter.strftime("%Y%m%d")):-1})
-
-                r_serv.zadd(
-                    "InterSet",
-                    1,
-                    str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")))
-
-                publisher.info(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))+" Intersection Created")
-
-            else:
-                publisher.warning("Data already exist, operation aborted.")
-
-
-
-
-
-def word_bar_graph(r_serv, year, month, filename):
-    """Create an histogram.
-
-    :param r_serv: -- connexion to redis database
-    :param year: -- (integer) The year to process
-    :param month: -- (integer) The month to process
-    :param filename: -- The absolute path where to save the figure.png
-
-    This function use matplotlib to create an histogram.
-    The redis database need obviously to be populated first
-    with functions: redis_words_ranking and redis_interbargraph_set.
-
-    """
-    lw = []
-    adate = []
-    inter = [0]
-    rcParams['figure.figsize'] = 15, 10
-
-    a = date(year, month, 01)
-    b = date(year, month, cal.monthrange(year,month)[1])
-
-    for dt in rrule(DAILY, dtstart = a, until = b):
-        lw.append(r_serv.zcard(dt.strftime("%Y%m%d")))
-        adate.append(dt.strftime("%d"))
-
-    for x in r_serv.zrange("InterSet", 0, 31):
-        inter.append(r_serv.zcard(x))
-
-    n_groups = len(lw)
-    card_words = tuple(lw)
-    card_interword = tuple(inter)
-
-    index = np.arange(n_groups)
-    bar_width = 0.5
-    opacity = 0.6
-
-    words = plt.bar(index, card_words, bar_width,
-                 alpha=opacity,
-                 color='g',
-                 label='Words/day')
-
-    lwords = plt.bar(index - 0.5, card_interword, bar_width,
-                 alpha=opacity,
-                 color='r',
-                 label='Intersection')
-
-
-    plt.plot(tuple(inter), 'b--')
-    plt.xlabel(str(year)+'/'+str(month)+' Days')
-    plt.ylabel('Words')
-    plt.title('Words Cardinality & Intersection Histogram')
-    plt.xticks(index + bar_width/2 , tuple(adate))
-
-    plt.legend()
-    plt.grid()
-
-    plt.tight_layout()
-
-    plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b',
-        orientation='portrait', papertype=None, format="png",
-        transparent=False, bbox_inches=None, pad_inches=0.1,
-        frameon=True)
-
-    publisher.info(filename+".png"+" saved!")
-
-
-
-
-def create_data_words_curve(r_serv, r_serv2, year, month, filename):
-    """Create a Redis hashes.
-
-    :param r_serv: -- connexion to redis database (read)
-    :param r_serv2: -- connexion to redis database (write)
-    :param year: -- (integer) The year to process
-    :param month: -- (integer) The month to process
-    :param filename: -- the path to the file which contain a list of words.
-
-
-    The hashes of redis is created as follow:
-
-    +------------+------------+-----------+
-    |   Keys     | Field      | Values    |
-    +============+============+===========+
-    | word1      | 20131001   | 150       |
-    +------------+------------+-----------+
-    | ...        | 20131002   | 145       |
-    +------------+------------+-----------+
-    | word2      | ...        | ...       |
-    +------------+------------+-----------+
-
-    The filename need to be a list of words separated by a carriage return
-    with an empty line at the end.
-    This function create datas which is used by the function
-    create_curve_with_word_file which create a csv file.
-
-    """
-    stop = stopwords.words('english')
-    a = date(year, month, 01)
-    b = date(year, month, cal.monthrange(year,month)[1])
-
-    with open(filename, 'rb') as F:
-
-        for line in F:
-
-            for dt in rrule(DAILY, dtstart = a, until = b):
-
-                if r_serv.zscore(dt.strftime("%Y%m%d"), line[:-1]) is not None:
-                    #tester si ca existe deja "en option" et ajouter un WARNING log
-                    r_serv2.hmset(line[:-1], {str(dt.strftime("%Y%m%d")):r_serv.zscore(dt.strftime("%Y%m%d"), line[:-1])})
-                else:
-                    pass
-
-
-
-
 def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
    """Create a csv file used with dygraph.

--- a/bin/tests/Bargraph.py
+++ b/bin/tests/Bargraph.py
@ -1,56 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_words import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information Leak
-    framework. It create an histogram which display the occurency
-    of the words per day but also the intersection of day and day-1 of these
-    occurencies''',
-    epilog = '''The Redis database need to be populated by the script
-    Wordsranking_Populate.py before using this one.''')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('y',
-    type = int,
-    metavar = "year",
-    help = 'The year processed.',
-    action = 'store')
-
-    parser.add_argument('m',
-    type = int,
-    metavar = "month",
-    help = 'The month processed.',
-    action = 'store')
-
-    parser.add_argument('-f',
-    type = str,
-    metavar = "filename",
-    default = "figure",
-    help = 'The absolute path name of the "figure.png"',
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    word_bar_graph(r,args.y,args.m, args.f)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Bargraph_categ_by_day.py
+++ b/bin/tests/Bargraph_categ_by_day.py
@ -1,64 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_refine import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information Leak
-    framework. It create an histogram which display the occurency
-    of the word category per days.''',
-    epilog = '''The Redis database need to be populated by the script
-    Classify_Paste_Token.py before.
-    It's also usefull to launch Remove_longline_fp.py and Refine_with_regex.py
-    to create a more accurate histogram.
-    example: ./Bargraph_categ_by_day.py 2013 12 mails_categ''')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-f',
-    type = str,
-    metavar = "filename",
-    default = "figure",
-    help = 'The absolute path name of the "figure.png"',
-    action = 'store')
-
-    parser.add_argument('y',
-    type = int,
-    metavar = "year",
-    help = 'The year processed',
-    action = 'store')
-
-    parser.add_argument('m',
-    type = int,
-    metavar = "month",
-    help = 'The month processed',
-    action = 'store')
-
-    parser.add_argument('key',
-    type = str,
-    help ='name of the key to process in redis (the word_categ concerned)',
-    action = 'store')
-
-    args = parser.parse_args()
-
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    graph_categ_by_day(r, args.f, args.y, args.m, args.key)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Classify_Paste_Token.py
+++ b/bin/tests/Classify_Paste_Token.py
@ -1,61 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_words import *
-from packages.imported import *
-from pubsublogger import publisher
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information Leak
-    framework. It create sets in redis as much as category
-    defined in the file given by the argument -l ''',
-    epilog = '''Example : seq 5000 | parallel -n0 -j 10
-    ./classify_Paste_Token.py -nbp 200''')
-
-    parser.add_argument('-l',
-    type = str,
-    default = "../files/list_categ_files",
-    help = 'Path to the list_categ_files (../files/list_categ_files)',
-    action = 'store')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-s',
-    help = 'Datastruct type, swapping between keys & members',
-    action = 'store_true')
-
-    parser.add_argument('-nbp',
-    type = int,
-    default = 200,
-    help = 'Nbpaste',
-    action = 'store')
-
-    parser.add_argument('-set',
-    type = str,
-    default = 'filelist',
-    help = 'The name of the list in redis which contain the filename to tokenise',
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    publisher.channel = "youpi"
-
-    classify_token_paste(r, args.l, args.s, args.nbp, args.set)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Display_pid.py
+++ b/bin/tests/Display_pid.py
@ -1,46 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_words import *
-from packages.imported import *
-from pubsublogger import publisher
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information Leak
-    framework. It's here to monitor some script which take time
-    and lauched in parallel, You can display which process is running on which
-    paste and how much time it spent processing it''',
-    epilog = 'example : ./Display_pid -p pid -db 1 -d remain')
-
-    parser.add_argument('-d',
-    type = str,
-    default = 'all',
-    choices=['paste', 'up', 'start', 'kb', 'all', 'pid', 'prg', 'remain', 'processed'],
-    help = 'Which info to display ?',
-    action = 'store')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    publisher.channel = "youpi"
-
-    display_listof_pid(r, args.d)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Graph.py
+++ b/bin/tests/Graph.py
@ -1,65 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_gephi import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information
-    Leak framework. It create a gephi graph to have a global
-    view of the pastes but also which one are similar.''',
-    epilog = '''The Redis database need to be populated by the script
-    Populate.py before using this one.''')
-
-    parser.add_argument('-t',
-    type = int,
-    default = 0,
-    help = 'Type of the Redis population (Same arg than in Populate.py)',
-    choices=[0, 2],
-    action = 'store')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-min',
-    type = int,
-    default = 3,
-    help = 'minimum linked nodes (default 3)',
-    action = 'store')
-
-    parser.add_argument('-max',
-    type = int,
-    default = 50,
-    help = 'maximum linked nodes created (execute top.py before for more info)',
-    action = 'store')
-
-    parser.add_argument('-p',
-    type = str,
-    default = '../graph/out',
-    metavar = 'path',
-    help = "pathname of the graph file created. ex: /home/graph",
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db,
-        unix_socket_path='/tmp/redis.sock')
-
-
-    Gephi_Graph(r, args.p+".gexf", args.min, args.max, args.t)
-    cprint("GRAPH CREATED AT:{0}.gexf".format(args.p),"green")
-
-if __name__ == "__main__":
-    main()
-
-#OK
--- a/bin/tests/Interset.py
+++ b/bin/tests/Interset.py
@ -1,52 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_words import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information
-    Leak framework. It create in redis the intersection
-    between all the days two by two of the date given in argument.''',
-    epilog = '''The Redis database need to be populated by the script
-    Wordsranking_Populate.py before using this one.''')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('y',
-    type = int,
-    metavar = "year",
-    help = 'The year',
-    action = 'store')
-
-    parser.add_argument('m',
-    type = int,
-    metavar = "month",
-    help = 'The month',
-    action = 'store')
-
-    parser.add_argument('-ow',
-    help = 'trigger the overwritting mode',
-    action = 'store_true')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    redis_interbargraph_set(r, args.y, args.m, args.ow)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Populate.py
+++ b/bin/tests/Populate.py
@ -1,75 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_redis_insert import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information
-    Leak framework. Is Populate the Redis database with
-    the pastes names and theirs hashed line''',
-    epilog = '''This script need to be run first in order to use the others:
-    Graph.py, Search.py, Top.py ...''')
-
-    parser.add_argument('input',
-    type = str,
-    metavar = 'pathfolder',
-    help = 'Input folder',
-    action = 'store')
-
-    parser.add_argument('-t',
-    type = int,
-    default = 0,
-    help = 'type of population wanted 0 = set 1 = zset 2 = mix',
-    choices=[0, 1, 2],
-    action = 'store')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-H',
-    type = str,
-    default = 'md5',
-    metavar='hash',
-    help = 'The hash method (default md5)',
-    choices=["md5", "sha1", "crc", "murmur"],
-    action = 'store')
-
-    parser.add_argument('-jmp',
-    type = int,
-    default = 10,
-    metavar = 'jump',
-    help = '''Jumping line factor. 1 = All the line are taken. X = jump X line
-    (default 10)''',
-    action = 'store')
-
-    parser.add_argument('-ml',
-    type = int,
-    default = 1,
-    metavar = 'minlnline',
-    help = '''Length line factor. 1 = All the line are taken.
-    X = each line >= X char (default 1)''',
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline()
-
-    redis_populate(p, listdirectory(args.input), args.ml, args.H, args.jmp, args.t)
-
-if __name__ == "__main__":
-    main()
-
-#OK
--- a/bin/tests/Refine_with_regex.py
+++ b/bin/tests/Refine_with_regex.py
@ -1,78 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_refine import *
-from packages.imported import *
-from pubsublogger import publisher
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information
-    Leak framework. Is refining a redis set by
-    re analysing set with regex and changing the score by the number of
-    regex matching''',
-    epilog = '''example of use: ./Refine_with_regex.py 2013 12 -regex mail
-    -key mails_categ''')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-nbm',
-    type = int,
-    default = 1,
-    help = 'Minimum matching regex occurence per file to keep in redis (1)',
-    action = 'store')
-
-    parser.add_argument('-regex',
-    type = str,
-    default = 'mail',
-    choices=['mail', 'card', 'url', 'bitcoin'],
-    help = 'Which regex wanted to be use to match',
-    action = 'store')
-
-    parser.add_argument('-key',
-    type = str,
-    default = "mails_categ",
-    help = 'Name of the key to process in redis (same name than the wordlist concerned)',
-    action = 'store')
-
-    parser.add_argument('y',
-    type = int,
-    metavar = "year",
-    help = 'The year processed',
-    action = 'store')
-
-    parser.add_argument('m',
-    type = int,
-    metavar = "month",
-    help = 'The month processed',
-    action = 'store')
-
-    args = parser.parse_args()
-
-    if args.regex == 'mail':
-        regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
-    elif args.regex == 'card':
-        regex = "4[0-9]{12}(?:[0-9]{3})?"
-    elif args.regex == 'bitcoin':
-        regex = "[13][1-9A-HJ-NP-Za-km-z]{26,33}"
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    publisher.channel = "youpi"
-
-    refining_regex_dataset(r, args.key, regex, args.nbm, args.y, args.m)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Remove_Doppelganger.py
+++ b/bin/tests/Remove_Doppelganger.py
@ -1,44 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_redis_insert import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information
-    Leak framework. It Add to a temporary list the hash
-    of wholes files and compare the new hash to the element of this
-    list. If the hash is already inside, the file is deleted
-    otherwise the hash is added in the list.''',
-    epilog = '''This script need Redis to be populated before by
-    ./Dir.py''')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-nbp',
-    type = int,
-    default = 200,
-    help = 'nbpaste',
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    remove_pure_doppelganger(r, args.nbp)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Remove_longline_fp.py
+++ b/bin/tests/Remove_longline_fp.py
@ -1,57 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_words import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information Leak
-    framework. It removes the line which are in redis under
-    the "key" name argument''',
-    epilog = '''This script is usually usefull launched after using
-    ./Classify_Paste_Token.py example: ./Remove_longline_fp.py mails_categ''')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('key',
-    type = str,
-    help = 'Name of the key to process in redis ("")',
-    action = 'store')
-
-    parser.add_argument('-d',
-    help = 'Delete the set of longline created?',
-    action = 'store_true')
-
-    parser.add_argument('-s',
-    help = 'Store the longline numbers inside a set?',
-    action = 'store_true')
-
-    parser.add_argument('-max',
-    type = int,
-    default = 500,
-    help = 'The limit between "short lines" and "long lines" (500)',
-    action = 'store')
-
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    #remove_longline_from_categ(r, args.key, args.d, args.s, args.max)
-    detect_longline_from_list(r,args.max)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Search.py
+++ b/bin/tests/Search.py
@ -1,72 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_search import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = 'Analysis Information Leak framework',
-    epilog = 'MSc Student Internship')
-
-    parser.add_argument('-db',
-    default = 0,
-    type = int,
-    help = 'The name of the Redis DB',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('name',
-    type = str,
-    metavar = 'pastename',
-    help = 'The name of the paste',
-    action = 'store')
-
-    parser.add_argument('-min',
-    type = int,
-    default = 3,
-    help = 'minimum linked hashs (default 3)',
-    action = 'store')
-
-    parser.add_argument('-max',
-    type = int,
-    default = 50,
-    help = 'maximum linked hash (execute top.py to be more aware)',
-    action = 'store')
-
-    parser.add_argument('-p',
-    type = str,
-    default = '../graph/Search_',
-    metavar = 'path',
-    help = "pathname of the file created.",
-    action = 'store')
-
-    parser.add_argument('-t',
-    type = int,
-    default = 0,
-    help = 'Type of the Redis population (Same arg than in Populate.py)',
-    choices=[0, 2],
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db,
-        unix_socket_path='/tmp/redis.sock')
-
-
-    if args.t == 2:
-        paste_searching2(r, args.p+args.name+".txt", args.name, args.min, args.max)
-        cprint("GRAPH CREATED AT:{0}{1}.txt".format(args.p,args.name),"green")
-    elif args.t == 0:
-        paste_searching(r, args.p+args.name+".txt", args.name, args.min, args.max)
-        cprint("GRAPH CREATED AT:{0}{1}.txt".format(args.p,args.name),"green")
-        pass
-
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Top.py
+++ b/bin/tests/Top.py
@ -1,58 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_search import Create_Common_Hash_File
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = '''This script is a part of the Analysis Information Leak
-    framework. It create a text file with the top common hash
-    which are in the redis database''',
-    epilog = '''The Redis database need to be populated by the script
-    Populate.py before using this one.''')
-
-    parser.add_argument('-db',
-    default = 0,
-    type = int,
-    help = 'The name of the Redis DB',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-off',
-    default = 1,
-    type = int,
-    metavar = 'offset',
-    help = 'Starting point of the toplist',
-    action = 'store')
-
-    parser.add_argument('-top',
-    default = 100,
-    type = int,
-    metavar = '100',
-    help = 'How many occurence? top 10-50-100 ?',
-    action = 'store')
-
-    parser.add_argument('-p',
-    type = str,
-    default = '../graph/top',
-    metavar = 'path',
-    help = "pathname of the file created ex: /home/top",
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    create_common_cash_file(r, args.off, args.top, args.p+str(args.top)+".top")
-    cprint("LIST CREATED","green")
-
-if __name__ == "__main__":
-    main()
-
-#OK
--- a/bin/tests/WordsCurve_Populate.py
+++ b/bin/tests/WordsCurve_Populate.py
@ -1,64 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_words import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = 'Analysis Information Leak framework',
-    epilog = 'Thats drawing')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB To get the info (0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-db1',
-    type = int,
-    default = 1,
-    help = 'The name of the Redis DB To store (1)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('f',
-    type = str,
-    metavar= "file",
-    help = 'Words filename',
-    action = 'store')
-
-    parser.add_argument('y',
-    type = int,
-    metavar = "year",
-    help = 'The year',
-    action = 'store')
-
-    parser.add_argument('m',
-    type = int,
-    metavar = "month",
-    help = 'The month',
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    r2 = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db1)
-
-
-    p = r.pipeline(False)
-
-    create_data_words_curve(r, r2, args.y, args.m, args.f)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/WordsCurves.py
+++ b/bin/tests/WordsCurves.py
@ -1,57 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_words import *
-from packages.imported import *
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = 'Analysis Information Leak framework',
-    epilog = 'Thats drawing')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-cvs',
-    type = str,
-    metavar = "filename",
-    default = "wordstrendingdata",
-    help = 'The name of the cvs file wanted to be created',
-    action = 'store')
-
-    parser.add_argument('f',
-    type = str,
-    help = 'The file with the list of words',
-    action = 'store')
-
-    parser.add_argument('y',
-    type = int,
-    metavar = "year",
-    help = 'The year',
-    action = 'store')
-
-    parser.add_argument('m',
-    type = int,
-    metavar = "month",
-    help = 'The month',
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    create_curve_with_word_file(r, args.cvs, args.f, args.y, args.m)
-
-if __name__ == "__main__":
-    main()
--- a/bin/tests/Wordsranking_Populate.py
+++ b/bin/tests/Wordsranking_Populate.py
@ -1,54 +0,0 @@
-#!/usr/bin/python2.7
-# -*-coding:UTF-8 -*
-
-from packages.lib_words import *
-from packages.imported import *
-from pubsublogger import publisher
-
-def main():
-    """Main Function"""
-
-    parser = argparse.ArgumentParser(
-    description = 'Analysis Information Leak framework',
-    epilog = 'example : seq 2 | parallel ./Wordsranking_Populate.py -nbp 20')
-
-    parser.add_argument('-nbp',
-    type = int,
-    default = 200,
-    help = 'nbpaste',
-    action = 'store')
-
-    parser.add_argument('-db',
-    type = int,
-    default = 0,
-    help = 'The name of the Redis DB (default 0)',
-    choices=[0, 1, 2, 3, 4],
-    action = 'store')
-
-    parser.add_argument('-min',
-    type = int,
-    default = 4,
-    help = 'Minimum length of the inserted words (default 4)',
-    action = 'store')
-
-    parser.add_argument('-max',
-    type = int,
-    default = 200,
-    help = 'Maximum length of the inserted words (default 200)',
-    action = 'store')
-
-    args = parser.parse_args()
-
-    r = redis.StrictRedis(
-        host='localhost',
-        port=6379,
-        db=args.db)
-
-    p = r.pipeline(False)
-
-    publisher.channel = "youpi"
-
-    redis_words_ranking(p, r, args.nbp, args.min, args.max)
-
-if __name__ == "__main__":
-    main()