Indexer now creates a new index_dir when the current one becomes too large (search in index not updated yet)

Mokaddem authored 2017-03-14 10:37:31 +01:00
parent 3c9cdac39a
commit 248469d61e

@@ -16,9 +16,24 @@ from pubsublogger import publisher
 from whoosh.index import create_in, exists_in, open_dir
 from whoosh.fields import Schema, TEXT, ID
 import os
+from os.path import join, getsize
 
 from Helper import Process
 
 
+# Config variable
+INDEX_SIZE_THRESHOLD = 500 #Mb
+TIME_WAIT = 1.0 #sec
+
+# return in bytes
+def check_index_size(indexnum):
+    global baseindexpath
+    the_index_name = "index_"+str(indexnum) if indexnum != 0 else "old_index"
+    the_index_name = os.path.join(baseindexpath, the_index_name)
+    cur_sum = 0
+    for root, dirs, files in os.walk(the_index_name):
+        cur_sum += sum(getsize(join(root, name)) for name in files)
+    return cur_sum
+
 if __name__ == "__main__":
     publisher.port = 6380
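
The size check above walks the whole index directory and sums the size of every file it contains, so the value it returns is in bytes. A minimal standalone sketch of the same pattern, assuming a hypothetical directory "indexdir/index_1":

    import os
    from os.path import join, getsize

    def dir_size_bytes(path):
        # visit every sub-directory and add up the size of each file
        total = 0
        for root, dirs, files in os.walk(path):
            total += sum(getsize(join(root, name)) for name in files)
        return total

    print(dir_size_bytes("indexdir/index_1"))  # hypothetical path, prints a byte count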
@@ -29,20 +44,37 @@ if __name__ == "__main__":
     p = Process(config_section)
 
     # Indexer configuration - index dir and schema setup
-    indexpath = os.path.join(os.environ['AIL_HOME'],
+    baseindexpath = os.path.join(os.environ['AIL_HOME'],
                              p.config.get("Indexer", "path"))
+    indexRegister_path = os.path.join(os.environ['AIL_HOME'],
+                             p.config.get("Indexer", "register"))
     indexertype = p.config.get("Indexer", "type")
     if indexertype == "whoosh":
         schema = Schema(title=TEXT(stored=True), path=ID(stored=True,
                                                          unique=True),
                         content=TEXT)
-        if not os.path.exists(indexpath):
-            os.mkdir(indexpath)
+        if not os.path.exists(baseindexpath):
+            os.mkdir(baseindexpath)
+
+        # create the index register if not present
+        if not os.path.isfile(indexRegister_path):
+            with open(indexRegister_path, 'w') as f:
+                f.write("1")
+
+        with open(indexRegister_path, "r") as f:
+            allIndex = f.read()
+        allIndex = allIndex.split(',')
+        allIndex.sort()
+        indexnum = int(allIndex[-1])
+        indexpath = os.path.join(baseindexpath, "index_"+str(indexnum))
+
         if not exists_in(indexpath):
             ix = create_in(indexpath, schema)
         else:
             ix = open_dir(indexpath)
 
+    last_refresh = time.time()
+
     # LOGGING #
     publisher.info("ZMQ Indexer is Running")
@@ -59,6 +91,18 @@ if __name__ == "__main__":
             docpath = message.split(" ", -1)[-1]
             paste = PST.get_p_content()
             print "Indexing :", docpath
+
+            if time.time() - last_refresh > TIME_WAIT: #avoid calculating the index's size at each message
+                last_refresh = time.time()
+                if check_index_size(indexnum) > INDEX_SIZE_THRESHOLD*(1000*1000):
+                    indexpath = os.path.join(baseindexpath, "index_"+str(indexnum+1))
+                    ix = create_in(indexpath, schema, indexname=str(indexnum+1))
+
+                    ## Correctly handle the file
+                    with open(indexRegister_path, "a") as f:
+                        f.write(","+str(indexnum))
+
             if indexertype == "whoosh":
                 indexwriter = ix.writer()
                 indexwriter.update_document(
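
The rotation itself is both time- and size-gated: the directory walk runs at most once per TIME_WAIT seconds, and the 500 Mb threshold is compared in bytes (500 * 1000 * 1000 = 500,000,000). A minimal sketch of that decision, assuming the patch's check_index_size, INDEX_SIZE_THRESHOLD and TIME_WAIT names are in scope, and returning the index number that the next index directory would use:

    import time

    def next_index_number(last_refresh, indexnum):
        # skip the (potentially slow) directory walk if we checked recently
        if time.time() - last_refresh < TIME_WAIT:
            return last_refresh, indexnum
        last_refresh = time.time()
        # the threshold is configured in Mb, the directory walk returns bytes
        if check_index_size(indexnum) > INDEX_SIZE_THRESHOLD * (1000 * 1000):
            indexnum += 1  # a fresh index_<n+1> directory would be created and opened here
        return last_refresh, indexnum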