diff --git a/bin/DomClassifier.py b/bin/DomClassifier.py
new file mode 100755
index 00000000..1cbe4ed9
--- /dev/null
+++ b/bin/DomClassifier.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python2
+# -*-coding:UTF-8 -*
+
+"""
+The DomClassifier Module
+============================
+
+The DomClassifier module fetches messages with Process.get_from_set() and,
+for every paste reported as "text/plain", runs DomainClassifier's Extract on
+the paste content. The domains matching ``\.lu$`` and the domains returned by
+localizedomain(cc='LU') are printed.
+
+"""
+import time
+from packages import Paste
+from pubsublogger import publisher
+
+import DomainClassifier.domainclassifier
+from Helper import Process
+
+
+def main():
+    """Process queued pastes forever and print the LU-related domains found."""
+    publisher.port = 6380
+    publisher.channel = "Script"
+
+    config_section = 'DomClassifier'
+
+    p = Process(config_section)
+
+    publisher.info("""ZMQ DomainClassifier is Running""")
+
+    while True:
+        # Reset on every iteration so the IOError handler never reads an
+        # undefined name or reports the paste of a previous iteration.
+        message = None
+        PST = None
+        try:
+            message = p.get_from_set()
+
+            if message is not None:
+                PST = Paste.Paste(message)
+            else:
+                # The logged delay matches the actual sleep below.
+                publisher.debug("Script DomClassifier is idling 1s")
+                time.sleep(1)
+                continue
+            paste = PST.get_p_content()
+            mimetype = PST._get_p_encoding()
+            if mimetype == "text/plain":
+                c = DomainClassifier.domainclassifier.Extract(rawtext=paste)
+                c.potentialdomain()
+                c.validdomain(rtype=['A'], extended=True)
+                localizeddomains = c.include(expression=r'\.lu$')
+                if localizeddomains:
+                    print(localizeddomains)
+                localizeddomains = c.localizedomain(cc='LU')
+                if localizeddomains:
+                    print(localizeddomains)
+        except IOError:
+            if PST is None:
+                # The error was raised before a Paste object existed, so only
+                # the raw queue message can be reported.
+                print("CRC Checksum Failed on : {}".format(message))
+                publisher.error(
+                    'DomClassifier;{};CRC Checksum Failed'.format(message))
+                continue
+            print("CRC Checksum Failed on : {}".format(PST.p_path))
+            publisher.error(
+                'DomClassifier;{};{};{};CRC Checksum Failed'.format(
+                    PST.p_source, PST.p_date, PST.p_name))
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index 981d32e6..6e8b6941 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -108,6 +108,8 @@ function launching_scripts {
     sleep 0.1
     screen -S "Script" -X screen -t "Line" bash -c './Line.py; read x'
     sleep 0.1
+    screen -S "Script" -X screen -t "DomainClassifier" bash -c './DomClassifier.py; read x'
+    sleep 0.1
     screen -S "Script" -X screen -t "Categ" bash
-c './Categ.py; read x' sleep 0.1 screen -S "Script" -X screen -t "Tokenize" bash -c './Tokenize.py; read x' diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 8c1bf8fd..6ddd5f20 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -10,16 +10,10 @@ host = localhost port = 6379 db = 0 -[Redis_Log] -host = localhost -port = 6380 -db = 0 - [Redis_Queues] host = localhost port = 6381 -db_sub = 0 -db_pub = 1 +db = 0 [Redis_Data_Merging] host = localhost @@ -37,35 +31,7 @@ host = localhost port = 2013 db = 1 -# PUB / SUB : ZMQ -[Feed] -address = tcp://crf.circl.lu:5556 -topicfilter = 102 - -[PubSub_Global] -address = tcp://127.0.0.1:5000 -channel = filelist - -[PubSub_Longlines] -address = tcp://127.0.0.1:5001 -channel_0 = Longlines -channel_1 = Shortlines - -[PubSub_Words] -address = tcp://127.0.0.1:5002 -channel_0 = words - -[PubSub_Categ] -address = tcp://127.0.0.1:5003 -channel_0 = creditcard_categ -channel_1 = mails_categ -channel_2 = onion_categ -channel_3 = web_categ - -[PubSub_Url] -address = tcp://127.0.0.1:5004 -channel = urls -# country code logged as critical +[Url] cc_critical = DE # Indexer configuration diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 97ed166b..677a77d6 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -15,6 +15,9 @@ subscribe = Redis_Global subscribe = Redis_Global publish = Redis_LinesShort,Redis_LinesLong +[DomClassifier] +subscribe = Redis_Global + [Tokenize] subscribe = Redis_LinesShort publish = Redis_Words diff --git a/pip_packages_requirement.txt b/pip_packages_requirement.txt index 0fb95845..d175a4c3 100644 --- a/pip_packages_requirement.txt +++ b/pip_packages_requirement.txt @@ -35,6 +35,8 @@ pycountry # To fetch Onion urls PySocks +DomainClassifier + #ASN lookup requirements http://adns-python.googlecode.com/files/adns-python-1.2.1.tar.gz https://github.com/trolldbois/python-cymru-services/archive/master.zip