From f01768036564538bdb1d06790695312e64cf4ce0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Mon, 8 Sep 2014 16:51:43 +0200
Subject: [PATCH] fix onions, cc and domain classifier modules

---
 bin/CreditCard.py    | 12 ++++++------
 bin/DomClassifier.py |  3 ++-
 bin/Onion.py         | 12 ++++++++----
 bin/tor_fetcher.py   |  5 ++++-
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/bin/CreditCard.py b/bin/CreditCard.py
index ba264c9d..b1bdc26e 100755
--- a/bin/CreditCard.py
+++ b/bin/CreditCard.py
@@ -29,12 +29,12 @@ if __name__ == "__main__":
 
     # Source: http://www.richardsramblings.com/regex/credit-card-numbers/
     cards = [
-        r'4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}',  # 16-digit VISA, with separators
-        r'5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}',  # 16 digits MasterCard
-        r'6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}',  # Discover Card
-        r'35(?:2[89]|[3-8]\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}',  # Japan Credit Bureau (JCB)
-        r'3[47]\d\d(?:[\ \-]?)\d{6}(?:[\ \-]?)\d{5}',  # American Express
-        r'(?:5[0678]\d\d|6304|6390|67\d\d)\d{8,15}',  # Maestro
+        r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit VISA, with separators
+        r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16 digits MasterCard
+        r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Discover Card
+        r'\b35(?:2[89]|[3-8]\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Japan Credit Bureau (JCB)
+        r'\b3[47]\d\d(?:[\ \-]?)\d{6}(?:[\ \-]?)\d{5}\b',  # American Express
+        r'\b(?:5[0678]\d\d|6304|6390|67\d\d)\d{8,15}\b',  # Maestro
         ]
 
     regex = re.compile('|'.join(cards))
diff --git a/bin/DomClassifier.py b/bin/DomClassifier.py
index 1cbe4ed9..c2b9f4ef 100755
--- a/bin/DomClassifier.py
+++ b/bin/DomClassifier.py
@@ -27,6 +27,7 @@ def main():
 
     publisher.info("""ZMQ DomainClassifier is Running""")
 
+    c = DomainClassifier.domainclassifier.Extract(rawtext="")
     while True:
         try:
             message = p.get_from_set()
@@ -40,7 +41,7 @@
             paste = PST.get_p_content()
             mimetype = PST._get_p_encoding()
             if mimetype == "text/plain":
-                c = DomainClassifier.domainclassifier.Extract(rawtext=paste)
+                c.text(rawtext=paste)
                 c.potentialdomain()
                 c.validdomain(rtype=['A'], extended=True)
                 localizeddomains = c.include(expression=r'\.lu$')
diff --git a/bin/Onion.py b/bin/Onion.py
index 7d04d028..45a8a6aa 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -36,6 +36,8 @@ from Helper import Process
 
 def fetch(p, r_cache, urls, domains, path):
     failed = []
+    downloaded = []
+    print len(urls), 'Urls to fetch.'
     for url, domain in zip(urls, domains):
         if r_cache.exists(url) or url in failed:
             continue
@@ -47,10 +49,11 @@
 
         if process.returncode == 0:
             r_cache.setbit(url, 0, 1)
-            r_cache.expire(url, 3600)
+            r_cache.expire(url, 360000)
+            downloaded.append(url)
             tempfile = process.stdout.read().strip()
             with open(tempfile, 'r') as f:
-                filename = path + domain
+                filename = path + domain + '.gz'
                 content = base64.standard_b64decode(f.read())
                 save_path = os.path.join(os.environ['AIL_HOME'],
                                          p.config.get("Directories", "pastes"),
@@ -65,9 +68,12 @@ def fetch(p, r_cache, urls, domains, path):
                 yield url
             os.unlink(tempfile)
         else:
+            r_cache.setbit(url, 0, 0)
+            r_cache.expire(url, 3600)
             failed.append(url)
             print 'Failed at downloading', url
             print process.stdout.read()
+    print 'Failed:', len(failed), 'Downloaded:', len(downloaded)
 
 
 if __name__ == "__main__":
@@ -121,8 +127,6 @@ if __name__ == "__main__":
             # Saving the list of extracted onion domains.
             PST.__setattr__(channel, domains_list)
             PST.save_attribute_redis(channel, domains_list)
-            pprint.pprint(domains_list)
-            print PST.p_path
             to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,
                                                 PST.p_name)
             if len(domains_list) > 0:
diff --git a/bin/tor_fetcher.py b/bin/tor_fetcher.py
index 305cd3f5..bd3d72d3 100644
--- a/bin/tor_fetcher.py
+++ b/bin/tor_fetcher.py
@@ -10,6 +10,9 @@ import base64
 import sys
 import tempfile
 
+# Max size in Mb
+max_size = 5
+
 def create_connection(address, timeout=None, source_address=None):
     sock = socks.socksocket()
     sock.connect(address)
@@ -21,7 +24,7 @@ def get_page(url, torclient_host='127.0.0.1', torclient_port=9050):
     request = urllib2.Request(url)
     # UA of the Tor browser bundle
     request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0')
-    return urllib2.urlopen(request).read()
+    return urllib2.urlopen(request, timeout=5).read(max_size * 100000)
 
 
 def makegzip64(s):