Merge pull request #26 from adulau/master

DomainClassifier module added
This commit is contained in:
Alexandre Dulaunoy 2014-09-08 16:49:58 +02:00
commit f229e2fbee
3 changed files with 96 additions and 0 deletions

90
bin/ZMQ_Sub_DomainClassifier.py Executable file
View file

@ -0,0 +1,90 @@
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
The ZMQ_Sub_DomainClassifier Module
===================================
The ZMQ_Sub_DomainClassifier module fetches the pastes to be processed,
extracts the domain names found in each plain-text paste and prints those
matching the configured TLD expression or country code.
"""
import redis
import ConfigParser
import time
from packages import Paste
from packages import ZMQ_PubSub
from pubsublogger import publisher
import DomainClassifier.domainclassifier
import os
# Path of the shared configuration file, relative to the working directory.
configfile = './packages/config.cfg'


def main():
    """Run the DomainClassifier subscriber loop.

    Reads the Redis queue and PubSub settings from ``configfile``, then
    polls the subscriber queue forever. Every message is expected to end
    with the path of a paste (last space-separated token). For each paste
    reported as ``text/plain`` the text is handed to the DomainClassifier
    extractor and the resulting domain lists are printed:

    * domains matching the ``cc_tld`` expression, and
    * domains localized to the ``cc`` country code.

    When the queue is empty the loop sleeps for one second, and it stops
    once this module's name is present in the ``SHUTDOWN_FLAGS`` Redis set.
    """
    # CONFIG #
    cfg = ConfigParser.ConfigParser()
    cfg.read(configfile)

    # Redis
    r_serv1 = redis.StrictRedis(
        host=cfg.get("Redis_Queues", "host"),
        port=cfg.getint("Redis_Queues", "port"),
        db=cfg.getint("Redis_Queues", "db"))

    # LOGGING #
    publisher.channel = "Script"

    # ZMQ #
    # Subscriber
    channel = cfg.get("PubSub_Global", "channel")
    subscriber_name = "DomainClassifier"
    subscriber_config_section = "PubSub_Global"

    # NOTE(review): ConfigParser returns option values verbatim, so any
    # quotes or r'' prefix written in the config file become part of the
    # expression itself -- check the cc_tld value in config.cfg.
    cc = cfg.get("PubSub_DomainClassifier", "cc")
    cc_tld = cfg.get("PubSub_DomainClassifier", "cc_tld")

    sub = ZMQ_PubSub.ZMQSub(configfile, subscriber_config_section,
                            channel, subscriber_name)

    # FUNCTIONS #
    publisher.info("""ZMQ DomainClassifier is Running""")

    # A single extractor is reused; its text is replaced for every paste.
    c = DomainClassifier.domainclassifier.Extract(rawtext="")

    while True:
        # Reset on every iteration so the IOError handler below can tell
        # whether a paste was actually loaded for the current message.
        PST = None
        try:
            message = sub.get_msg_from_queue(r_serv1)
            if message is None:
                # Use this module's own flag: checking another module's
                # name would consume that module's shutdown request.
                if r_serv1.sismember("SHUTDOWN_FLAGS", subscriber_name):
                    r_serv1.srem("SHUTDOWN_FLAGS", subscriber_name)
                    publisher.warning("Shutdown Flag Up: Terminating.")
                    break
                publisher.debug("Script DomainClassifier is idling 1s")
                time.sleep(1)
                continue

            # The paste path is the last space-separated token.
            PST = Paste.Paste(message.split(" ", -1)[-1])
            paste = PST.get_p_content()
            mimetype = PST._get_p_encoding()
            if mimetype == "text/plain":
                c.text(rawtext=paste)
                # presumably extracts candidate domains, then keeps those
                # resolving to an A record -- confirm against the
                # DomainClassifier documentation
                c.potentialdomain()
                c.validdomain(rtype=['A'], extended=True)

                localizeddomains = c.include(expression=cc_tld)
                if localizeddomains:
                    print(localizeddomains)

                localizeddomains = c.localizedomain(cc=cc)
                if localizeddomains:
                    print(localizeddomains)
        except IOError as err:
            if PST is None:
                # The failure happened before a paste object existed, so
                # there are no paste attributes to report.
                publisher.error(
                    'DomainClassifier;IOError before the paste was loaded: '
                    '{}'.format(err))
                continue
            print("CRC Checksum Failed on : {}".format(PST.p_path))
            # NOTE(review): the 'Duplicate' prefix looks inherited from
            # another module; kept as-is for log consumers -- confirm.
            publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))


if __name__ == "__main__":
    main()

View file

@ -67,6 +67,10 @@ channel = urls
# country code logged as critical # country code logged as critical
cc_critical = DE cc_critical = DE
[PubSub_DomainClassifier]
cc = DE
cc_tld = r'\.de$'
# Indexer configuration # Indexer configuration
[Indexer] [Indexer]
type = whoosh type = whoosh

View file

@ -26,6 +26,8 @@ ipython
flask flask
texttable texttable
#DomainClassifier
DomainClassifier
#Indexer requirements #Indexer requirements
whoosh whoosh