From f01768036564538bdb1d06790695312e64cf4ce0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Mon, 8 Sep 2014 16:51:43 +0200
Subject: [PATCH] fix onions, cc and domain classifier modules

---
 bin/CreditCard.py    | 12 ++++++------
 bin/DomClassifier.py |  3 ++-
 bin/Onion.py         | 12 ++++++++----
 bin/tor_fetcher.py   |  5 ++++-
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/bin/CreditCard.py b/bin/CreditCard.py
index ba264c9d..b1bdc26e 100755
--- a/bin/CreditCard.py
+++ b/bin/CreditCard.py
@@ -29,12 +29,12 @@ if __name__ == "__main__":
 
     # Source: http://www.richardsramblings.com/regex/credit-card-numbers/
     cards = [
-        r'4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}',  # 16-digit VISA, with separators
-        r'5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}',  # 16 digits MasterCard
-        r'6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}',  # Discover Card
-        r'35(?:2[89]|[3-8]\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}',  # Japan Credit Bureau (JCB)
-        r'3[47]\d\d(?:[\ \-]?)\d{6}(?:[\ \-]?)\d{5}',  # American Express
-        r'(?:5[0678]\d\d|6304|6390|67\d\d)\d{8,15}',  # Maestro
+        r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit VISA, with separators
+        r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16 digits MasterCard
+        r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Discover Card
+        r'\b35(?:2[89]|[3-8]\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Japan Credit Bureau (JCB)
+        r'\b3[47]\d\d(?:[\ \-]?)\d{6}(?:[\ \-]?)\d{5}\b',  # American Express
+        r'\b(?:5[0678]\d\d|6304|6390|67\d\d)\d{8,15}\b',  # Maestro
         ]
 
     regex = re.compile('|'.join(cards))
diff --git a/bin/DomClassifier.py b/bin/DomClassifier.py
index 1cbe4ed9..c2b9f4ef 100755
--- a/bin/DomClassifier.py
+++ b/bin/DomClassifier.py
@@ -27,6 +27,7 @@ def main():
 
     publisher.info("""ZMQ DomainClassifier is Running""")
 
+    c = DomainClassifier.domainclassifier.Extract(rawtext="")
     while True:
         try:
             message = p.get_from_set()
@@ -40,7 +41,7 @@
             paste = PST.get_p_content()
             mimetype = PST._get_p_encoding()
             if mimetype == "text/plain":
-                c = DomainClassifier.domainclassifier.Extract(rawtext=paste)
+                c.text(rawtext=paste)
                 c.potentialdomain()
                 c.validdomain(rtype=['A'], extended=True)
                 localizeddomains = c.include(expression=r'\.lu$')
diff --git a/bin/Onion.py b/bin/Onion.py
index 7d04d028..45a8a6aa 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -36,6 +36,8 @@ from Helper import Process
 
 def fetch(p, r_cache, urls, domains, path):
     failed = []
+    downloaded = []
+    print len(urls), 'Urls to fetch.'
     for url, domain in zip(urls, domains):
         if r_cache.exists(url) or url in failed:
             continue
@@ -47,10 +49,11 @@
 
         if process.returncode == 0:
             r_cache.setbit(url, 0, 1)
-            r_cache.expire(url, 3600)
+            r_cache.expire(url, 360000)
+            downloaded.append(url)
             tempfile = process.stdout.read().strip()
             with open(tempfile, 'r') as f:
-                filename = path + domain
+                filename = path + domain + '.gz'
                 content = base64.standard_b64decode(f.read())
                 save_path = os.path.join(os.environ['AIL_HOME'],
                                          p.config.get("Directories", "pastes"),
@@ -65,9 +68,12 @@ def fetch(p, r_cache, urls, domains, path):
                 yield url
             os.unlink(tempfile)
         else:
+            r_cache.setbit(url, 0, 0)
+            r_cache.expire(url, 3600)
             failed.append(url)
             print 'Failed at downloading', url
             print process.stdout.read()
+    print 'Failed:', len(failed), 'Downloaded:', len(downloaded)
 
 
 if __name__ == "__main__":
@@ -121,8 +127,6 @@ if __name__ == "__main__":
             # Saving the list of extracted onion domains.
             PST.__setattr__(channel, domains_list)
             PST.save_attribute_redis(channel, domains_list)
-            pprint.pprint(domains_list)
-            print PST.p_path
             to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,
                                                 PST.p_name)
             if len(domains_list) > 0:
diff --git a/bin/tor_fetcher.py b/bin/tor_fetcher.py
index 305cd3f5..bd3d72d3 100644
--- a/bin/tor_fetcher.py
+++ b/bin/tor_fetcher.py
@@ -10,6 +10,9 @@ import base64
 import sys
 import tempfile
 
+# Max size in Mb
+max_size = 5
+
 def create_connection(address, timeout=None, source_address=None):
     sock = socks.socksocket()
     sock.connect(address)
@@ -21,7 +24,7 @@ def get_page(url, torclient_host='127.0.0.1', torclient_port=9050):
     request = urllib2.Request(url)
     # UA of the Tor browser bundle
     request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0')
-    return urllib2.urlopen(request).read()
+    return urllib2.urlopen(request, timeout=5).read(max_size * 100000)
 
 
 def makegzip64(s):