Added DomainTrending; it seems to be working.

Started the search features and their related HTML pages; not finished yet.
This commit is contained in:
Mokaddem 2016-07-05 16:53:03 +02:00
parent 8c1eeea6e6
commit 7ff9b9a583
8 changed files with 269 additions and 18 deletions

View file

@ -10,19 +10,72 @@ import re
import redis import redis
import os import os
from packages import lib_words from packages import lib_words
from packages.Date import Date
from pubsublogger import publisher from pubsublogger import publisher
from packages import Paste from packages import Paste
from Helper import Process from Helper import Process
from pyfaup.faup import Faup from pyfaup.faup import Faup
def analyse(field_name): # Config Var
threshold_need_to_look = 50
range_to_look = 10
threshold_to_plot = 1 #500%
to_plot = set()
clean_frequency = 10 #minutes
def analyse(server, field_name):
field = url_parsed[field_name] field = url_parsed[field_name]
if field is not None: if field is not None:
prev_score = r_serv1.hget(field, date) prev_score = server.hget(field, date)
if prev_score is not None: if prev_score is not None:
r_serv1.hset(field, date, int(prev_score) + 1) server.hset(field, date, int(prev_score) + 1)
else: else:
r_serv1.hset(field, date, 1) server.hset(field, date, 1)
def analyse_and_progression(server, field_name):
    """Count one more occurrence of a parsed URL field and flag it if it trends.

    Reads the module-level ``url_parsed`` mapping and ``date`` key, bumps the
    per-date counter stored in the ``server`` hash named after the field
    value, and adds that value to the module-level ``to_plot`` set once the
    new count exceeds ``threshold_need_to_look`` and
    ``check_for_progression`` reports a progression.

    :param server: -- redis connection holding the per-field hashes
    :param field_name: -- key to read from ``url_parsed`` (e.g. 'domain')
    """
    key = url_parsed[field_name]
    if key is None:
        return

    stored = server.hget(key, date)
    if stored is None:
        # No counter for this date yet: start it at one.
        server.hset(key, date, 1)
        return

    print(key + ' prev_score:' + stored)
    updated = int(stored) + 1
    server.hset(key, date, updated)
    # Only look for a progression above the threshold, to limit false positives.
    if updated > threshold_need_to_look and check_for_progression(server, key, date):
        to_plot.add(key)
def check_for_progression(server, field, date):
    """Tell whether ``field`` shows a progression over the recent days.

    Walks back day by day from ``date`` (at most ``range_to_look`` days),
    summing the counters stored in the ``server`` hash named ``field``, and
    stops at the first day that has no stored value. The sum is then divided
    by the value of the first day (``date`` itself) and compared with
    ``threshold_to_plot``.

    :param server: -- redis connection holding the per-field hashes
    :param field: -- name of the hash to inspect
    :param date: -- reference day, passed to ``Date(...)``
    :return: True if the ratio reaches ``threshold_to_plot``, False otherwise.
        False is also returned when the reference day has no value or a
        value of 0, since the ratio cannot be computed in that case.
    """
    today_val = None
    tot_sum = 0
    for i in range(range_to_look):
        curr_value = server.hget(field, Date(date).substract_day(i))
        if curr_value is None:  # no further data
            break
        curr_value = int(curr_value)
        if i == 0:
            today_val = curr_value
        tot_sum += curr_value

    # No data for the reference day (or a zero count): nothing to divide by.
    if not today_val:
        return False

    # Floor division, as the Python 2 ``/`` operator does on integers.
    ratio = tot_sum // today_val
    print('totsum=' + str(tot_sum))
    print('div=' + str(ratio))
    return ratio >= threshold_to_plot
def clean_to_plot(server=None):
    """Drop from ``to_plot`` the fields that no longer show a progression.

    Every element of the module-level ``to_plot`` set is re-checked with
    ``check_for_progression`` against today's date; elements that fail the
    check are removed. The set is updated in place, so it stays the same
    object for any code already holding a reference to it.

    :param server: -- redis connection to query; when omitted, the
        module-level ``r_serv2`` connection created in the main section is used
    """
    if server is None:
        server = r_serv2

    # check_for_progression wraps its date argument in Date(...) itself, so a
    # plain string is passed here.
    # NOTE(review): assumes Date accepts a zero-padded YYYYMMDD string, the
    # same format substract_day returns -- confirm against the Date class.
    today = datetime.date.today().strftime("%Y%m%d")

    stale = [elem for elem in to_plot
             if not check_for_progression(server, elem, today)]
    to_plot.difference_update(stale)
if __name__ == '__main__': if __name__ == '__main__':
# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
@ -45,6 +98,11 @@ if __name__ == '__main__':
host=p.config.get("Redis_Level_DB", "host"), host=p.config.get("Redis_Level_DB", "host"),
port=p.config.get("Redis_Level_DB", "port"), port=p.config.get("Redis_Level_DB", "port"),
db=p.config.get("Redis_Level_DB", "db")) db=p.config.get("Redis_Level_DB", "db"))
r_serv2 = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_Domain", "host"),
port=p.config.get("Redis_Level_DB_Domain", "port"),
db=p.config.get("Redis_Level_DB_Domain", "db"))
# FILE CURVE SECTION # # FILE CURVE SECTION #
csv_path_proto = os.path.join(os.environ['AIL_HOME'], csv_path_proto = os.path.join(os.environ['AIL_HOME'],
@ -57,6 +115,10 @@ if __name__ == '__main__':
tldsfile_path = os.path.join(os.environ['AIL_HOME'], tldsfile_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "tldsfile")) p.config.get("Directories", "tldsfile"))
csv_path_domain = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "domainstrending_csv"))
faup = Faup() faup = Faup()
generate_new_graph = False generate_new_graph = False
# Endless loop getting messages from the input queue # Endless loop getting messages from the input queue
@ -71,17 +133,22 @@ if __name__ == '__main__':
today = datetime.date.today() today = datetime.date.today()
year = today.year year = today.year
month = today.month month = today.month
print 'b1'
lib_words.create_curve_with_word_file(r_serv1, csv_path_proto, lib_words.create_curve_with_word_file(r_serv1, csv_path_proto,
protocolsfile_path, year, protocolsfile_path, year,
month) month)
print 'b2'
lib_words.create_curve_with_word_file(r_serv1, csv_path_tld, lib_words.create_curve_with_word_file(r_serv1, csv_path_tld,
tldsfile_path, year, tldsfile_path, year,
month) month)
print 'b3'
lib_words.create_curve_with_list(r_serv2, csv_path_domain,
to_plot, year,
month)
print 'end building'
publisher.debug("{} queue is empty, waiting".format(config_section)) publisher.debug("{} queue is empty, waiting".format(config_section))
time.sleep(1) print 'sleeping'
time.sleep(5)
continue continue
else: else:
@ -91,5 +158,8 @@ if __name__ == '__main__':
faup.decode(url) faup.decode(url)
url_parsed = faup.get() url_parsed = faup.get()
analyse('scheme') #Scheme analysis analyse(r_serv1, 'scheme') #Scheme analysis
analyse('tld') #Tld analysis analyse(r_serv1, 'tld') #Tld analysis
analyse_and_progression(r_serv2, 'domain') #Domain analysis
print "to_plot:"
print to_plot

View file

@ -30,3 +30,12 @@ class Date(object):
def _set_day(self, day): def _set_day(self, day):
self.day = day self.day = day
def substract_day(self, numDay):
    """Return the day ``numDay`` days before this date, as a string.

    :param numDay: -- (integer) number of days to go back
    :return: the year followed by the zero-padded month and day
        (e.g. '20160630')
    """
    import datetime
    start = datetime.date(int(self.year), int(self.month), int(self.day))
    shifted = start - datetime.timedelta(days=numDay)
    return "%d%02d%02d" % (shifted.year, shifted.month, shifted.day)

View file

@ -186,7 +186,9 @@ class Paste(object):
if the paste doesn't contain any human dictionnary words if the paste doesn't contain any human dictionnary words
..seealso: git@github.com:saffsd/langid.py.git ..seealso: git@github.com:saffsd/langid.py.git
""" FIXME: This procedure is using more than 20% of CPU
"""
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
return identifier.classify(self.get_p_content()) return identifier.classify(self.get_p_content())

View file

@ -81,13 +81,14 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month
to keep the timeline of the curve correct. to keep the timeline of the curve correct.
""" """
threshold = 50
first_day = date(year, month, 01) first_day = date(year, month, 01)
last_day = date(year, month, calendar.monthrange(year, month)[1]) last_day = date(year, month, calendar.monthrange(year, month)[1])
words = [] words = []
with open(feederfilename, 'rb') as f: with open(feederfilename, 'rb') as f:
# words of the files # words of the files
words = sorted([word.strip() for word in f]) words = sorted([word.strip() for word in f if word.strip()[0:2]!='//' ])
headers = ['Date'] + words headers = ['Date'] + words
with open(csvfilename+'.csv', 'wb') as f: with open(csvfilename+'.csv', 'wb') as f:
@ -102,6 +103,47 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month
# from the 1srt day to the last of the list # from the 1srt day to the last of the list
for word in words: for word in words:
value = r_serv.hget(word, curdate) value = r_serv.hget(word, curdate)
if value is None:
row.append(0)
else:
# if the word have a value for the day
# FIXME Due to performance issues (too many tlds, leads to more than 7s to perform this procedure), I added a threshold
if value >= threshold:
row.append(value)
writer.writerow(row)
def create_curve_with_list(server, csvfilename, to_plot, year, month):
"""Create a csv file used with dygraph.
:param r_serv: -- connexion to redis database
:param csvfilename: -- the path to the .csv file created
:param to_plot: -- the list which contain a words to plot.
:param year: -- (integer) The year to process
:param month: -- (integer) The month to process
This function create a .csv file using datas in redis.
It's checking if the words contained in to_plot and
their respectives values by days exists.
"""
first_day = date(year, month, 01)
last_day = date(year, month, calendar.monthrange(year, month)[1])
words = sorted(to_plot)
headers = ['Date'] + words
with open(csvfilename+'.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerow(headers)
# for each days
for dt in rrule(DAILY, dtstart=first_day, until=last_day):
row = []
curdate = dt.strftime("%Y%m%d")
row.append(curdate)
# from the 1srt day to the last of the list
for word in words:
value = server.hget(word, curdate)
if value is None: if value is None:
row.append(0) row.append(0)
else: else:

View file

@ -7,7 +7,9 @@ import json
from flask import Flask, render_template, jsonify, request from flask import Flask, render_template, jsonify, request
import flask import flask
import os import os
import sys
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
import Paste
# CONFIG # # CONFIG #
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@ -18,6 +20,7 @@ if not os.path.exists(configfile):
cfg = ConfigParser.ConfigParser() cfg = ConfigParser.ConfigParser()
cfg.read(configfile) cfg.read(configfile)
max_preview_char = 500
# REDIS # # REDIS #
r_serv = redis.StrictRedis( r_serv = redis.StrictRedis(
@ -49,6 +52,10 @@ def get_queues(r):
r.hgetall("queues").iteritems()] r.hgetall("queues").iteritems()]
def list_len(s):
    """Return the number of items in ``s`` (registered as a Jinja filter)."""
    size = len(s)
    return size
app.jinja_env.filters['list_len'] = list_len
@app.route("/_logs") @app.route("/_logs")
def logs(): def logs():
return flask.Response(event_stream(), mimetype="text/event-stream") return flask.Response(event_stream(), mimetype="text/event-stream")
@ -65,6 +72,7 @@ def search():
q = [] q = []
q.append(query) q.append(query)
r = [] r = []
c = []
# Search # Search
from whoosh import index from whoosh import index
from whoosh.fields import Schema, TEXT, ID from whoosh.fields import Schema, TEXT, ID
@ -78,7 +86,10 @@ def search():
results = searcher.search(query, limit=None) results = searcher.search(query, limit=None)
for x in results: for x in results:
r.append(x.items()[0][1]) r.append(x.items()[0][1])
return render_template("search.html", r=r) content = Paste.Paste(x.items()[0][1]).get_p_content()
content_range = max_preview_char if len(content)>max_preview_char else len(content)-1
c.append(content[0:content_range])
return render_template("search.html", r=r, c=c)
@app.route("/") @app.route("/")
def index(): def index():
@ -104,6 +115,10 @@ def protocolstrending():
def tldstrending(): def tldstrending():
return render_template("Tldstrending.html") return render_template("Tldstrending.html")
@app.route("/showsavedpaste/")
def showsavedpaste():
    """Render the saved-paste page from the show_saved_paste.html template."""
    page = render_template("show_saved_paste.html")
    return page
if __name__ == "__main__": if __name__ == "__main__":
app.run(host='0.0.0.0', port=7000, threaded=True) app.run(host='0.0.0.0', port=7000, threaded=True)

View file

@ -130,7 +130,7 @@
<!-- instanciate and plot graphs --> <!-- instanciate and plot graphs -->
<script type="text/javascript"> <script type="text/javascript">
var graph_tld = new Graph("TldsTrending", "../static//csv/tldstrendingdata.csv"); var graph_tld = new Graph("TldsTrending", "../static//csv/tldstrendingdata.csv");
var graph_domain = new Graph("DomainTrending", "../static//csv/tldstrendingdata.csv"); var graph_domain = new Graph("DomainTrending", "../static//csv/domainstrendingdata.csv");
</script> </script>
</div> </div>
<script src="{{ url_for('static', filename='js/bootstrap.min.js') }}"></script> <script src="{{ url_for('static', filename='js/bootstrap.min.js') }}"></script>

View file

@ -16,6 +16,16 @@
<script type="text/javascript" src="{{ url_for('static', filename='js/dygraph-combined.js') }}"></script> <script type="text/javascript" src="{{ url_for('static', filename='js/dygraph-combined.js') }}"></script>
<script src="{{ url_for('static', filename='js/jquery-1.4.2.js') }}"></script> <script src="{{ url_for('static', filename='js/jquery-1.4.2.js') }}"></script>
<script language="javascript" src="{{ url_for('static', filename='js/jquery.js')}}"></script> <script language="javascript" src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<style>
.tooltip-inner {
text-align: left;
height: 200%;
width: 200%;
max-width: 500px;
max-height: 500px;
font-size: 13px;
}
</style>
</head> </head>
<body> <body>
@ -39,6 +49,26 @@
</div> </div>
<!-- /.navbar-static-side --> <!-- /.navbar-static-side -->
</nav> </nav>
<!-- Modal -->
<div id="mymodal" class="modal fade" role="dialog">
<div class="modal-dialog modal-lg">
<!-- Modal content-->
<div id="mymodalcontent" class="modal-content">
<div id="mymodalbody" class="modal-body">
<p>Some text in the modal.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
<div id="page-wrapper"> <div id="page-wrapper">
<!-- /.row --> <!-- /.row -->
<div class="row"> </div> <div class="row"> </div>
@ -53,10 +83,26 @@
</div> </div>
<!-- /.panel-heading --> <!-- /.panel-heading -->
<div class="panel-body"> <div class="panel-body">
<table class="table"> <table class="table table-hover">
{% for result in r %} <thead>
<tr><td>{{ result }}</td></tr> <tr>
<th>#</th>
<th>Path</th>
<th>Action</th>
</tr>
</thead>
<tbody>
{# Use the loop counters: a variable set inside the for body is not carried over to the next iteration. #}
{% for path in r %}
{% set prev_content = c[loop.index0] %}
<tr>
<td>{{ loop.index }}</td>
<td><a target="_blank" href="{{ url_for('showsavedpaste', paste=path) }}"> {{ path }}</a></td>
<td><p><span class="glyphicon glyphicon-info-sign" data-toggle="tooltip" data-placement="left" title="{{ prev_content }}"></span> <button type="button" class="btn-link" data-toggle="modal" data-target="#mymodal" data-url="{{ url_for('showsavedpaste', paste=path) }}"><span class="fa fa-search-plus"></span></button></p></td>
</tr>
{% endfor %} {% endfor %}
</tbody>
</table> </table>
</div> </div>
<!-- /.panel-body --> <!-- /.panel-body -->
@ -69,4 +115,23 @@
<script src="{{ url_for('static', filename='js/bootstrap.min.js') }}"></script> <script src="{{ url_for('static', filename='js/bootstrap.min.js') }}"></script>
</body> </body>
<!-- enable tooltip -->
<script>
$(document).ready(function(){
$('[data-toggle="tooltip"]').tooltip();
});
</script>
<!-- Dynamically update the modal -->
<script type="text/javascript">
// On click, get html content from url and update the corresponding modal
$("[data-toggle='modal']").on("click", function (event) {
event.preventDefault();
var url = $(this).attr('data-url');
var modal_id = $(this).attr('data-target');
$.get(url, function (data) {
$("#mymodalbody").html(data);
});
});
</script>
</html> </html>

View file

@ -0,0 +1,48 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>Paste information</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<h2> Paste: </h2>
<h3> {{ request.args.get('paste') }} </h3>
<hr></br>
<table class="table table-condensed">
<thead>
<tr>
<th>Date</th>
<th>Source</th>
<th>Encoding</th>
<th>Language</th>
<th>Size</th>
<th>Mime</th>
<th>Number of line</th>
</tr>
</thead>
<tbody>
<tr>
<td>John</td>
<td>Doe</td>
<td>john@example.com</td>
</tr>
<tr>
<td>Mary</td>
<td>Moe</td>
<td>mary@example.com</td>
</tr>
<tr>
<td>July</td>
<td>Dooley</td>
<td>july@example.com</td>
</tr>
</tbody>
</table>
</body>
</html>