ail-framework/bin/ZMQ_Sub_Indexer.py

79 lines
2.4 KiB
Python
Raw Normal View History

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
The ZMQ_Sub_Indexer Module
============================
The ZMQ_Sub_Indexer module fetches the list of files to be processed
and indexes each file with a full-text indexer (only Whoosh so far).
"""
2014-08-14 15:55:18 +00:00
import time
from packages import Paste
from pubsublogger import publisher
from whoosh.index import create_in, exists_in, open_dir
2014-08-14 15:55:18 +00:00
from whoosh.fields import Schema, TEXT, ID
import os
2014-08-19 17:07:07 +00:00
import Helper
2014-08-14 15:55:18 +00:00
2014-08-19 17:07:07 +00:00
if __name__ == "__main__":
publisher.channel = "Script"
# Subscriber
sub_config_section = 'PubSub_Global'
sub_name = 'indexer'
2014-08-19 17:07:07 +00:00
config_section = 'PubSub_Global'
config_channel = 'channel'
subscriber_name = 'indexer'
2014-08-19 17:07:07 +00:00
h = Helper.Redis_Queues(config_section, config_channel, subscriber_name)
# Indexer configuration - index dir and schema setup
2014-08-19 17:07:07 +00:00
indexpath = h.config.get("Indexer", "path")
indexertype = h.config.get("Indexer", "type")
if indexertype == "whoosh":
2014-08-19 17:07:07 +00:00
schema = Schema(title=TEXT(stored=True), path=ID(stored=True,
unique=True),
content=TEXT)
if not os.path.exists(indexpath):
os.mkdir(indexpath)
if not exists_in(indexpath):
ix = create_in(indexpath, schema)
else:
ix = open_dir(indexpath)
# LOGGING #
publisher.info("""ZMQ Indexer is Running""")
while True:
2014-08-14 15:55:18 +00:00
try:
2014-08-20 13:14:57 +00:00
message = h.redis_rpop()
2014-08-14 15:55:18 +00:00
if message is not None:
PST = Paste.Paste(message.split(" ", -1)[-1])
else:
2014-08-19 17:07:07 +00:00
if h.redis_queue_shutdown():
break
publisher.debug("Script Indexer is idling 10s")
time.sleep(1)
continue
2014-08-14 15:55:18 +00:00
docpath = message.split(" ", -1)[-1]
paste = PST.get_p_content()
print "Indexing :", docpath
if indexertype == "whoosh":
indexwriter = ix.writer()
2014-08-14 15:55:18 +00:00
indexwriter.update_document(
title=unicode(docpath, errors='ignore'),
path=unicode(docpath, errors='ignore'),
content=unicode(paste, errors='ignore'))
indexwriter.commit()
except IOError:
2014-08-14 15:55:18 +00:00
print "CRC Checksum Failed on :", PST.p_path
2014-08-19 17:07:07 +00:00
publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
PST.p_source, PST.p_date, PST.p_name))