Merge pull request #114 from mokaddem/newIndexer

New indexer behavior
This commit is contained in:
Raphaël Vinot 2017-04-21 11:59:28 +02:00 committed by GitHub
commit 3eeaefa38a
5 changed files with 198 additions and 28 deletions

View file

@ -15,10 +15,28 @@ from pubsublogger import publisher
from whoosh.index import create_in, exists_in, open_dir from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID from whoosh.fields import Schema, TEXT, ID
import shutil
import os import os
from os.path import join, getsize
from Helper import Process from Helper import Process
# Config variable
TIME_WAIT = 60*15 #sec
# return in bytes
def check_index_size(baseindexpath, indexname):
    """Return the total on-disk size, in bytes, of one index directory.

    The directory ``join(baseindexpath, indexname)`` is walked recursively and
    the size of every file found is added up. A directory that does not exist
    yields 0.

    Files whose size cannot be read (for instance because they were removed
    between the directory listing and the size lookup) are skipped instead of
    aborting the whole computation.
    """
    index_dir = join(baseindexpath, indexname)
    total_size = 0
    for root, _dirs, files in os.walk(index_dir):
        for name in files:
            try:
                total_size += getsize(join(root, name))
            except OSError:
                # File vanished or is unreadable: ignore it, keep counting.
                continue
    return total_size
def move_index_into_old_index_folder(baseindexpath):
    """Move every entry of *baseindexpath* into its ``old_index`` sub-folder.

    The ``old_index`` folder is created first when it is missing and there is
    something to move: ``shutil.move`` does not create the parent directory of
    a file destination, so moving a plain file would otherwise fail.
    An existing ``old_index`` entry is left where it is, and nothing is
    created when *baseindexpath* has no other entry.
    """
    old_index_dir = join(baseindexpath, "old_index")
    entries = [name for name in os.listdir(baseindexpath)
               if name != "old_index"]
    if entries and not os.path.isdir(old_index_dir):
        os.mkdir(old_index_dir)
    for name in entries:
        shutil.move(join(baseindexpath, name), join(old_index_dir, name))
if __name__ == "__main__": if __name__ == "__main__":
publisher.port = 6380 publisher.port = 6380
@ -29,20 +47,51 @@ if __name__ == "__main__":
p = Process(config_section) p = Process(config_section)
# Indexer configuration - index dir and schema setup # Indexer configuration - index dir and schema setup
indexpath = os.path.join(os.environ['AIL_HOME'], baseindexpath = join(os.environ['AIL_HOME'],
p.config.get("Indexer", "path")) p.config.get("Indexer", "path"))
indexRegister_path = join(os.environ['AIL_HOME'],
p.config.get("Indexer", "register"))
indexertype = p.config.get("Indexer", "type") indexertype = p.config.get("Indexer", "type")
INDEX_SIZE_THRESHOLD = int(p.config.get("Indexer", "index_max_size"))
if indexertype == "whoosh": if indexertype == "whoosh":
schema = Schema(title=TEXT(stored=True), path=ID(stored=True, schema = Schema(title=TEXT(stored=True), path=ID(stored=True,
unique=True), unique=True),
content=TEXT) content=TEXT)
if not os.path.exists(indexpath): if not os.path.exists(baseindexpath):
os.mkdir(indexpath) os.mkdir(baseindexpath)
# Create the index register if it is not present yet, then pick the most
# recent index listed in it as the one to write to.
time_now = int(time.time())
if not os.path.isfile(indexRegister_path):  # indexes are not organised
    print("Indexes are not organized")
    print("moving all files in folder 'old_index' ")
    # move all files to old_index folder
    move_index_into_old_index_folder(baseindexpath)
    print("Creating new index")
    # create the register, holding the name of the first index
    with open(indexRegister_path, 'w') as f:
        f.write(str(time_now))
    # create dir
    os.mkdir(join(baseindexpath, str(time_now)))

with open(indexRegister_path, "r") as f:
    allIndex = f.read()
allIndex = allIndex.split()  # format [time1\ntime2]
allIndex.sort()
try:
    indexname = allIndex[-1].strip('\n\r')
except IndexError:
    # Empty register: fall back to the current timestamp. Keep it a string,
    # because indexname is concatenated into the "Indexing - ..." message.
    indexname = str(time_now)
indexpath = join(baseindexpath, str(indexname))
if not exists_in(indexpath): if not exists_in(indexpath):
ix = create_in(indexpath, schema) ix = create_in(indexpath, schema)
else: else:
ix = open_dir(indexpath) ix = open_dir(indexpath)
last_refresh = time_now
# LOGGING # # LOGGING #
publisher.info("ZMQ Indexer is Running") publisher.info("ZMQ Indexer is Running")
@ -58,7 +107,24 @@ if __name__ == "__main__":
continue continue
docpath = message.split(" ", -1)[-1] docpath = message.split(" ", -1)[-1]
paste = PST.get_p_content() paste = PST.get_p_content()
print "Indexing :", docpath print "Indexing - "+indexname+" :", docpath
# Avoid calculating the index's size at each message: only check it once
# every TIME_WAIT seconds.
if time.time() - last_refresh > TIME_WAIT:
    last_refresh = time.time()
    if check_index_size(baseindexpath, indexname) >= INDEX_SIZE_THRESHOLD*(1000*1000):
        # Threshold reached: switch to a new index named after the timestamp.
        timestamp = int(time.time())
        print("Creating new index", timestamp)
        indexpath = join(baseindexpath, str(timestamp))
        indexname = str(timestamp)
        # Update the register. Entries are whitespace-separated when read
        # back, so the new name must start on its own line; without the
        # separator it would fuse with the previous timestamp.
        with open(indexRegister_path, "a") as f:
            f.write("\n" + str(timestamp))
        # create new dir
        os.mkdir(indexpath)
        ix = create_in(indexpath, schema)
if indexertype == "whoosh": if indexertype == "whoosh":
indexwriter = ix.writer() indexwriter = ix.writer()
indexwriter.update_document( indexwriter.update_document(

View file

@ -123,6 +123,8 @@ cc_tld = r'\.de$'
[Indexer] [Indexer]
type = whoosh type = whoosh
path = indexdir path = indexdir
# maximum size of one index, in MB (1 MB = 1000*1000 bytes)
index_max_size = 2000
############################################################################### ###############################################################################

View file

@ -7,10 +7,14 @@
import redis import redis
import json import json
import os import os
import datetime
import flask import flask
from flask import Flask, render_template, jsonify, request from flask import Flask, render_template, jsonify, request
import Paste import Paste
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
# ============ VARIABLES ============ # ============ VARIABLES ============
import Flask_config import Flask_config
@ -20,7 +24,62 @@ cfg = Flask_config.cfg
r_serv_pasteName = Flask_config.r_serv_pasteName r_serv_pasteName = Flask_config.r_serv_pasteName
max_preview_char = Flask_config.max_preview_char max_preview_char = Flask_config.max_preview_char
max_preview_modal = Flask_config.max_preview_modal max_preview_modal = Flask_config.max_preview_modal
baseindexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path"))
indexRegister_path = os.path.join(os.environ['AIL_HOME'],
cfg.get("Indexer", "register"))
# ============ FUNCTIONS ============ # ============ FUNCTIONS ============
def get_current_index():
    """Return the path of the latest index listed in the register file.

    The register is read as whitespace-separated index names; the greatest
    name in sort order is joined to ``baseindexpath``. When the register
    holds no name at all, ``"no-index"`` is used instead.
    """
    with open(indexRegister_path, "r") as register:
        names = sorted(register.read().split())  # format [time1\ntime2]
    latest = names[-1].strip('\n\r') if names else "no-index"
    return os.path.join(baseindexpath, latest)
def get_index_list(selected_index=""):
    """Describe every index folder found directly under ``baseindexpath``.

    Each entry is a ``[value, name, flag]`` list: *value* is the folder name,
    *name* a display label (date, size and item count) and *flag* tells
    whether the folder name equals the last ``/``-separated component of
    *selected_index*.

    Entries are sorted by folder name in descending order; the ``old_index``
    folder, when present, is always appended last.
    """
    listing = []
    legacy_entry = None
    for dirname in os.listdir(baseindexpath):
        if not os.path.isdir(os.path.join(baseindexpath, dirname)):
            continue
        label = to_iso_date(dirname) + " - "
        label += str(get_dir_size(dirname) / (1000*1000)) + " Mb "
        label += "(" + str(get_item_count(dirname)) + " Items" + ")"
        entry = [dirname, label, dirname == selected_index.split('/')[-1]]
        if dirname == "old_index":
            legacy_entry = entry
        else:
            listing.append(entry)
    listing.sort(key=lambda entry: entry[0], reverse=True)
    if legacy_entry is not None:
        listing.append(legacy_entry)
    return listing
def get_dir_size(directory):
    """Return the total size, in bytes, of the files under an index folder.

    *directory* is resolved against ``baseindexpath`` and walked recursively.
    A file whose size cannot be read (e.g. it disappeared while walking) is
    skipped on its own, so the other files of the same folder still count.
    """
    total_size = 0
    for root, _subdirs, files in os.walk(os.path.join(baseindexpath, directory)):
        for name in files:
            try:
                total_size += os.path.getsize(os.path.join(root, name))
            except OSError:  # File disappeared
                pass
    return total_size
def get_item_count(dirs):
    """Return ``doc_count_all()`` of the Whoosh index stored in *dirs*.

    *dirs* is a folder name resolved against ``baseindexpath``.
    """
    index_dir = os.path.join(baseindexpath, dirs)
    return index.open_dir(index_dir).doc_count_all()
def to_iso_date(timestamp):
    """Convert an index folder name into a ``YYYY-MM-DD`` date string.

    The special name ``"old_index"`` is returned unchanged. Any other value
    is converted with ``int()`` and interpreted as a Unix timestamp, rendered
    in local time.
    """
    if timestamp != "old_index":
        moment = datetime.datetime.fromtimestamp(int(timestamp))
        return moment.date().isoformat()
    return "old_index"
# ============ ROUTES ============ # ============ ROUTES ============
@ -34,8 +93,15 @@ def search():
c = [] #preview of the paste content c = [] #preview of the paste content
paste_date = [] paste_date = []
paste_size = [] paste_size = []
index_name = request.form['index_name']
num_elem_to_get = 50 num_elem_to_get = 50
# select correct index
if index_name is None or index_name == "0":
selected_index = get_current_index()
else:
selected_index = os.path.join(baseindexpath, index_name)
# Search filename # Search filename
for path in r_serv_pasteName.smembers(q[0]): for path in r_serv_pasteName.smembers(q[0]):
r.append(path) r.append(path)
@ -49,13 +115,9 @@ def search():
paste_size.append(paste._get_p_size()) paste_size.append(paste._get_p_size())
# Search full line # Search full line
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT) schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
indexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path")) ix = index.open_dir(selected_index)
ix = index.open_dir(indexpath)
from whoosh.qparser import QueryParser
with ix.searcher() as searcher: with ix.searcher() as searcher:
query = QueryParser("content", ix.schema).parse(" ".join(q)) query = QueryParser("content", ix.schema).parse(" ".join(q))
results = searcher.search_page(query, 1, pagelen=num_elem_to_get) results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
@ -72,7 +134,14 @@ def search():
results = searcher.search(query) results = searcher.search(query)
num_res = len(results) num_res = len(results)
return render_template("search.html", r=r, c=c, query=request.form['query'], paste_date=paste_date, paste_size=paste_size, char_to_display=max_preview_modal, num_res=num_res) index_min = 1
index_max = len(get_index_list())
return render_template("search.html", r=r, c=c,
query=request.form['query'], paste_date=paste_date,
paste_size=paste_size, char_to_display=max_preview_modal,
num_res=num_res, index_min=index_min, index_max=index_max,
index_list=get_index_list(selected_index)
)
@app.route("/get_more_search_result", methods=['POST']) @app.route("/get_more_search_result", methods=['POST'])
@ -81,20 +150,23 @@ def get_more_search_result():
q = [] q = []
q.append(query) q.append(query)
page_offset = int(request.form['page_offset']) page_offset = int(request.form['page_offset'])
index_name = request.form['index_name']
num_elem_to_get = 50 num_elem_to_get = 50
# select correct index
if index_name is None or index_name == "0":
selected_index = get_current_index()
else:
selected_index = os.path.join(baseindexpath, index_name)
path_array = [] path_array = []
preview_array = [] preview_array = []
date_array = [] date_array = []
size_array = [] size_array = []
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT) schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
indexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path")) ix = index.open_dir(selected_index)
ix = index.open_dir(indexpath)
from whoosh.qparser import QueryParser
with ix.searcher() as searcher: with ix.searcher() as searcher:
query = QueryParser("content", ix.schema).parse(" ".join(q)) query = QueryParser("content", ix.schema).parse(" ".join(q))
results = searcher.search_page(query, page_offset, num_elem_to_get) results = searcher.search_page(query, page_offset, num_elem_to_get)
@ -113,7 +185,6 @@ def get_more_search_result():
to_return["preview_array"] = preview_array to_return["preview_array"] = preview_array
to_return["date_array"] = date_array to_return["date_array"] = date_array
to_return["size_array"] = size_array to_return["size_array"] = size_array
print "len(path_array)="+str(len(path_array))
if len(path_array) < num_elem_to_get: #pagelength if len(path_array) < num_elem_to_get: #pagelength
to_return["moreData"] = False to_return["moreData"] = False
else: else:

View file

@ -89,6 +89,16 @@
</div> </div>
<!-- /.panel-heading --> <!-- /.panel-heading -->
<div class="panel-body"> <div class="panel-body">
<div class="row">
<div class="col-md-12">
<strong style="">Index: </strong>
<select class="form-control" id="index_name" style="display: inline-block; margin-bottom: 5px; width: 30%">
{% for indexElem in index_list %}
<option {% if indexElem[2] %} selected="selected" {% endif %} value="{{ indexElem[0] }}" >{{ indexElem[1] }}</option>
{% endfor %}
</select>
</div>
</div>
<table class="table table-striped table-bordered table-hover" id="myTable"> <table class="table table-striped table-bordered table-hover" id="myTable">
<thead> <thead>
<tr> <tr>
@ -100,16 +110,14 @@
</tr> </tr>
</thead> </thead>
<tbody id="table_body"> <tbody id="table_body">
{% set i = 0 %}
{% for path in r %} {% for path in r %}
<tr> <tr>
<td>{{ i + 1 }}</td> <td>{{ loop.index0 + 1 }}</td>
<td><a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ path }}&num={{ i+1 }}"> {{ path }}</a></td> <td><a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ path }}&num={{ loop.index0+1 }}"> {{ path }}</a></td>
<td>{{ paste_date[i] }}</td> <td>{{ paste_date[loop.index0] }}</td>
<td>{{ paste_size[i] }}</td> <td>{{ paste_size[loop.index0] }}</td>
<td><p><span class="glyphicon glyphicon-info-sign" data-toggle="tooltip" data-placement="left" title="{{ c[i] }}"></span> <button type="button" class="btn-link" data-num="{{ i + 1 }}" data-toggle="modal" data-target="#mymodal" data-url="{{ url_for('showsavedpaste') }}?paste={{ path }}&num={{ i+1 }}" data-path="{{ path }}"><span class="fa fa-search-plus"></span></button></p></td> <td><p><span class="glyphicon glyphicon-info-sign" data-toggle="tooltip" data-placement="left" title="{{ c[loop.index0] }}"></span> <button type="button" class="btn-link" data-num="{{ loop.index0 + 1 }}" data-toggle="modal" data-target="#mymodal" data-url="{{ url_for('showsavedpaste') }}?paste={{ path }}&num={{ loop.index0+1 }}" data-path="{{ path }}"><span class="fa fa-search-plus"></span></button></p></td>
</tr> </tr>
{% set i = i + 1 %}
{% endfor %} {% endfor %}
</tbody> </tbody>
</table> </table>
@ -157,6 +165,28 @@
if (init_num_of_elements_in_table == pagelen) { if (init_num_of_elements_in_table == pagelen) {
$("#load_more_json_button1").show(); $("#load_more_json_button1").show();
} }
$('#index_name').on('change', function() {
var form = document.createElement('form');
form.setAttribute("method", 'post');
form.setAttribute("action", "{{ url_for('search') }}");
var input1 = document.createElement('input');
input1.setAttribute("type", "hidden");
input1.setAttribute("name", "index_name");
input1.setAttribute("value", this.value);
form.appendChild(input1);
var input2 = document.createElement('input');
input2.setAttribute("type", "hidden");
input2.setAttribute("name", "query");
input2.setAttribute("value", "{{ query }}");
form.appendChild(input2);
document.body.appendChild(form);
form.submit();
})
}); });
</script> </script>
@ -171,7 +201,7 @@
} }
function load_search_50_data() { function load_search_50_data() {
var options = { query: query, page_offset: page_offset }; var options = { query: query, page_offset: page_offset, index_name: $("#index_name").val() };
$.post( "{{ url_for('get_more_search_result') }}", options).done(function( data ) { $.post( "{{ url_for('get_more_search_result') }}", options).done(function( data ) {
for(i=0; i<data.path_array.length; i++) { for(i=0; i<data.path_array.length; i++) {

View file

@ -1,6 +1,7 @@
<div class="input-group custom-search-form"> <div class="input-group custom-search-form">
<form action="/search" id="form-search" method=POST> <form action="/search" id="form-search" method=POST>
<input type="text" name="query" class="form-control" placeholder="Search Paste"> <input type="text" name="query" class="form-control" placeholder="Search Paste">
<input type="hidden" name="index_name" class="form-control" value="0" placeholder="Index Name">
<span class="input-group-btn"> <span class="input-group-btn">
<button class="btn btn-default" type="submit"> <button class="btn btn-default" type="submit">
<i class="fa fa-search"></i> <i class="fa fa-search"></i>