mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-13 01:58:22 +00:00
Add Domain Classifier module.
Cleanup in the config files.
This commit is contained in:
parent
b7c9e489c9
commit
fca00beed9
5 changed files with 67 additions and 36 deletions
58
bin/DomClassifier.py
Executable file
58
bin/DomClassifier.py
Executable file
|
@ -0,0 +1,58 @@
|
||||||
|
#!/usr/bin/env python2
|
||||||
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
|
"""
|
||||||
|
The DomClassifier Module
|
||||||
|
============================
|
||||||
|
|
||||||
|
The DomClassifier module fetches pastes from its queue and uses the
DomainClassifier library to extract, resolve and localize the domain names found in each plain-text paste.
|
||||||
|
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
from packages import Paste
|
||||||
|
from pubsublogger import publisher
|
||||||
|
|
||||||
|
import DomainClassifier.domainclassifier
|
||||||
|
from Helper import Process
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
publisher.port = 6380
|
||||||
|
publisher.channel = "Script"
|
||||||
|
|
||||||
|
config_section = 'DomClassifier'
|
||||||
|
|
||||||
|
p = Process(config_section)
|
||||||
|
|
||||||
|
publisher.info("""ZMQ DomainClassifier is Running""")
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
message = p.get_from_set()
|
||||||
|
|
||||||
|
if message is not None:
|
||||||
|
PST = Paste.Paste(message)
|
||||||
|
else:
|
||||||
|
publisher.debug("Script DomClassifier is idling 10s")
|
||||||
|
time.sleep(1)
|
||||||
|
continue
|
||||||
|
paste = PST.get_p_content()
|
||||||
|
mimetype = PST._get_p_encoding()
|
||||||
|
if mimetype == "text/plain":
|
||||||
|
c = DomainClassifier.domainclassifier.Extract(rawtext=paste)
|
||||||
|
c.potentialdomain()
|
||||||
|
c.validdomain(rtype=['A'], extended=True)
|
||||||
|
localizeddomains = c.include(expression=r'\.lu$')
|
||||||
|
if localizeddomains:
|
||||||
|
print (localizeddomains)
|
||||||
|
localizeddomains = c.localizedomain(cc='LU')
|
||||||
|
if localizeddomains:
|
||||||
|
print (localizeddomains)
|
||||||
|
except IOError:
|
||||||
|
print "CRC Checksum Failed on :", PST.p_path
|
||||||
|
publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
|
||||||
|
PST.p_source, PST.p_date, PST.p_name))
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -108,6 +108,8 @@ function launching_scripts {
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
screen -S "Script" -X screen -t "Line" bash -c './Line.py; read x'
|
screen -S "Script" -X screen -t "Line" bash -c './Line.py; read x'
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
|
screen -S "Script" -X screen -t "DomainClassifier" bash -c './DomClassifier.py; read x'
|
||||||
|
sleep 0.1
|
||||||
screen -S "Script" -X screen -t "Categ" bash -c './Categ.py; read x'
|
screen -S "Script" -X screen -t "Categ" bash -c './Categ.py; read x'
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
screen -S "Script" -X screen -t "Tokenize" bash -c './Tokenize.py; read x'
|
screen -S "Script" -X screen -t "Tokenize" bash -c './Tokenize.py; read x'
|
||||||
|
|
|
@ -10,16 +10,10 @@ host = localhost
|
||||||
port = 6379
|
port = 6379
|
||||||
db = 0
|
db = 0
|
||||||
|
|
||||||
[Redis_Log]
|
|
||||||
host = localhost
|
|
||||||
port = 6380
|
|
||||||
db = 0
|
|
||||||
|
|
||||||
[Redis_Queues]
|
[Redis_Queues]
|
||||||
host = localhost
|
host = localhost
|
||||||
port = 6381
|
port = 6381
|
||||||
db_sub = 0
|
db = 0
|
||||||
db_pub = 1
|
|
||||||
|
|
||||||
[Redis_Data_Merging]
|
[Redis_Data_Merging]
|
||||||
host = localhost
|
host = localhost
|
||||||
|
@ -37,35 +31,7 @@ host = localhost
|
||||||
port = 2013
|
port = 2013
|
||||||
db = 1
|
db = 1
|
||||||
|
|
||||||
# PUB / SUB : ZMQ
|
[Url]
|
||||||
[Feed]
|
|
||||||
address = tcp://crf.circl.lu:5556
|
|
||||||
topicfilter = 102
|
|
||||||
|
|
||||||
[PubSub_Global]
|
|
||||||
address = tcp://127.0.0.1:5000
|
|
||||||
channel = filelist
|
|
||||||
|
|
||||||
[PubSub_Longlines]
|
|
||||||
address = tcp://127.0.0.1:5001
|
|
||||||
channel_0 = Longlines
|
|
||||||
channel_1 = Shortlines
|
|
||||||
|
|
||||||
[PubSub_Words]
|
|
||||||
address = tcp://127.0.0.1:5002
|
|
||||||
channel_0 = words
|
|
||||||
|
|
||||||
[PubSub_Categ]
|
|
||||||
address = tcp://127.0.0.1:5003
|
|
||||||
channel_0 = creditcard_categ
|
|
||||||
channel_1 = mails_categ
|
|
||||||
channel_2 = onion_categ
|
|
||||||
channel_3 = web_categ
|
|
||||||
|
|
||||||
[PubSub_Url]
|
|
||||||
address = tcp://127.0.0.1:5004
|
|
||||||
channel = urls
|
|
||||||
# country code logged as critical
|
|
||||||
cc_critical = DE
|
cc_critical = DE
|
||||||
|
|
||||||
# Indexer configuration
|
# Indexer configuration
|
||||||
|
|
|
@ -15,6 +15,9 @@ subscribe = Redis_Global
|
||||||
subscribe = Redis_Global
|
subscribe = Redis_Global
|
||||||
publish = Redis_LinesShort,Redis_LinesLong
|
publish = Redis_LinesShort,Redis_LinesLong
|
||||||
|
|
||||||
|
[DomClassifier]
|
||||||
|
subscribe = Redis_Global
|
||||||
|
|
||||||
[Tokenize]
|
[Tokenize]
|
||||||
subscribe = Redis_LinesShort
|
subscribe = Redis_LinesShort
|
||||||
publish = Redis_Words
|
publish = Redis_Words
|
||||||
|
|
|
@ -35,6 +35,8 @@ pycountry
|
||||||
# To fetch Onion urls
|
# To fetch Onion urls
|
||||||
PySocks
|
PySocks
|
||||||
|
|
||||||
|
DomainClassifier
|
||||||
|
|
||||||
#ASN lookup requirements
|
#ASN lookup requirements
|
||||||
http://adns-python.googlecode.com/files/adns-python-1.2.1.tar.gz
|
http://adns-python.googlecode.com/files/adns-python-1.2.1.tar.gz
|
||||||
https://github.com/trolldbois/python-cymru-services/archive/master.zip
|
https://github.com/trolldbois/python-cymru-services/archive/master.zip
|
||||||
|
|
Loading…
Reference in a new issue