# ail-framework/bin/packages/lib_words.py

import os
import string

from pubsublogger import publisher

import calendar
from datetime import date
from dateutil.rrule import rrule, DAILY


def listdirectory(path):
    """Recursively list the files found below a directory.

    :param path: -- The pathname of the directory to walk.

    Every file found under ``path`` (at any depth) is returned as
    ``os.path.join(<containing directory>, <file name>)``, in the order in
    which ``os.walk`` visits them. Directories themselves are not listed.

    """
    return [
        os.path.join(parent, filename)
        for parent, _subdirs, filenames in os.walk(path)
        for filename in filenames
    ]

def clean(dirty):
    """Filter out the non-printable characters of a string.

    :param dirty: -- The string to sanitise.

    Returns a new string made of the characters of ``dirty`` that belong to
    ``string.printable`` (ASCII letters, digits, punctuation and whitespace),
    in their original order. Everything else is dropped.

    """
    return ''.join(filter(string.printable.__contains__, dirty))


def create_dirfile(r_serv, directory, overwrite):
    """Fill the redis list "filelist" with the files found in a directory.

    :param r_serv: -- connexion to redis database
    :param directory: -- The folder whose files are listed (recursively)
    :param overwrite: -- When true, the existing "filelist" key is deleted
        before the paths are pushed; otherwise the paths are pushed on top of
        whatever the list already contains.

    Every path returned by listdirectory(directory) is pushed with LPUSH on
    the "filelist" key, and a message telling whether the list was
    overwritten, newly created or extended is published.

    """
    # Work out the log message first: the emptiness check has to look at the
    # list as it was *before* anything gets pushed.
    if overwrite:
        r_serv.delete("filelist")
        outcome = "The list was overwritten"
    elif r_serv.llen("filelist") == 0:
        outcome = "New list created"
    else:
        outcome = "The list was updated with new elements"

    for filepath in listdirectory(directory):
        r_serv.lpush("filelist", filepath)

    publisher.info(outcome)


def _word_count_as_text(r_serv, word, day_key):
    """Return the value stored for ``word`` on ``day_key`` as text, "0" if absent."""
    # A single HGET is enough: it yields None when the field (or the whole
    # hash) is missing, so no separate HEXISTS round trip is needed.
    value = r_serv.hget(word, day_key)
    if value is None:
        return "0"
    # On Python 3 a redis client may hand back bytes; decode them so the CSV
    # gets "12" rather than "b'12'". (On Python 2 bytes is str: no-op.)
    if isinstance(value, bytes) and not isinstance(value, str):
        value = value.decode("utf-8")
    return str(value)


def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
    """Create a csv file used with dygraph.

    :param r_serv: -- connexion to redis database
    :param csvfilename: -- the path to the .csv file created, WITHOUT its
        extension (".csv" is appended by this function)
    :param feederfilename: -- the path to the file which contains the list of
        words, one word per line.
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    This function creates a .csv file using the data stored in redis.
    The header line is "Date," followed by the words of feederfilename in
    sorted order. It is followed by one line per day of the requested month,
    in chronological order, made of the day ("YYYYMMDD") and, for each word,
    the value of the field "YYYYMMDD" in the redis hash named after the word.
    If a value is missing (word not present during that day) a 0 is written
    instead, to keep the timeline of the curve correct.

    """
    with open(feederfilename, 'r') as feeder:
        # Strip only the line terminator: slicing off the last character
        # would truncate the final word of a file lacking a trailing newline.
        words = sorted(line.rstrip('\r\n') for line in feeder)

    last_day = calendar.monthrange(year, month)[1]

    # Build every row before touching the output file, so that a redis
    # failure half-way through does not leave a truncated csv behind.
    rows = []
    for day in range(1, last_day + 1):
        day_key = date(year, month, day).strftime("%Y%m%d")
        counts = [_word_count_as_text(r_serv, word, day_key) for word in words]
        # Values are separated by ", " (comma + space): this is the layout
        # the file has always had, kept for existing consumers.
        rows.append(day_key + ',' + ", ".join(counts) + '\n')

    with open(csvfilename + ".csv", 'w') as csvfile:
        csvfile.write("Date," + ",".join(words) + '\n')
        csvfile.writelines(rows)