chg: [module] add CEDetector

2024-11-22 22:27:17 +00:00 · 2024-10-11 14:36:02 +02:00 · 2024-10-11 14:36:02 +02:00 · 3fb281f6c3
commit 3fb281f6c3
parent 2ead8c21aa
4 changed files with 140 additions and 1 deletions
--- a/bin/lib/objects/Titles.py
+++ b/bin/lib/objects/Titles.py
@ -96,6 +96,7 @@ def create_title(content):
        title.create(content)
    return title
 class Titles(AbstractDaterangeObjects):
    """
        Titles Objects
--- a/bin/lib/objects/abstract_daterange_object.py
+++ b/bin/lib/objects/abstract_daterange_object.py
@ -193,9 +193,19 @@ class AbstractDaterangeObjects(ABC):
        self.type = obj_type
        self.obj_class = obj_class
    ################################################
    ################################################
    def get_ids(self):
        return r_object.smembers(f'{self.type}:all')
    def get_iterator(self):
        for obj_id in self.get_ids():
            yield self.obj_class(obj_id)
    ################################################
    ################################################
    # def get_ids_iterator(self):
    #     return r_object.sscan_iter(r_object, f'{self.type}:all')
--- a/bin/modules/CEDetector.py
+++ b/bin/modules/CEDetector.py
@ -0,0 +1,122 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 """
 The Onion Module
 ============================
 This module extract url from item and returning only ones which are tor
 related (.onion). All These urls are send to the crawler discovery queue.
 Requirements
 ------------
 *Need running Redis instances. (Redis)
 """
 import os
 import sys
 from textblob import TextBlob
 from nltk.tokenize import RegexpTokenizer
 sys.path.append(os.environ['AIL_BIN'])
 ##################################
 # Import Project packages
 ##################################
 from modules.abstract_module import AbstractModule
 from lib.ConfigLoader import ConfigLoader
 class CEDetector(AbstractModule):
    """docstring for Onion module."""
    def __init__(self, queue=True):
        super(CEDetector, self).__init__(queue=queue)
        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
        self.csam_words = self.load_world_file('csam_words')
        self.child_worlds = self.load_world_file('child_words')
        self.porn_worlds = self.load_world_file('porn_words')
        self.ce_tag = 'dark-web:topic="pornography-child-exploitation"'
        self.tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\//\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                         gaps=True, discard_empty=True)
    def load_world_file(self, path):
        words = set()
        try:
            with open(os.path.join(os.environ['AIL_HOME'], f'files/{path}')) as f:
                content = f.read()
        except FileNotFoundError:
            content = ''
        content = content.splitlines()
        for line in content:
            if line.startswith('#') or not line:
                continue
            word = line.split()
            if word:
                words.add(word[0])
        return words
    def compute(self, message):  # TODO LIMIT TO DARKWEB ???
        to_tag = False
        content = self.obj.get_content().lower()
        # print(content)
        is_csam = False
        is_child_word = False
        is_porn_world = False
        words = TextBlob(content, tokenizer=self.tokenizer).tokens
        words = set(words)
        for word in words:
            print(word)
            if word in self.csam_words:
                is_csam = True
            if word in self.child_worlds:
                is_child_word = True
            if word in self.porn_worlds:
                is_porn_world = True
            # PERF ???
            # if is_child_word and is_porn_world:
            #     break
        if is_csam:
            to_tag = True
        if is_child_word and is_porn_world:
            to_tag = True
        if to_tag:
            # print(f'{content}                         DETECTED')
            # print()
            self.add_message_to_queue(message=self.ce_tag, queue='Tags')
        return to_tag
 def test_detection():
    from lib import Tag
    from lib.objects.Domains import Domain
    from lib.objects.Titles import Title
    not_detected = set()
    tag = 'dark-web:topic="pornography-child-exploitation"'
    tag_key = f'domain::{tag}'
    for domain in Tag.get_obj_by_tag(tag_key):
        dom = Domain(domain)
        is_detected = False
        for h in dom.get_correlation('title').get('title', []):
            t = Title(h[1:])
            title = t.get_content()
            module.obj = title
            if module.compute(''):
                is_detected = True
        if not is_detected:
            not_detected.add(domain)
    print(not_detected)
 if __name__ == "__main__":
    module = CEDetector()
    module.run()
    # test_detection()
--- a/configs/modules.cfg
+++ b/configs/modules.cfg
@ -1,7 +1,7 @@
 ######## IMPORTERS ########
 [Crawler]
-publish = Importers,Tags,Images
+publish = Importers,Tags,Images,Titles
 [ZMQModuleImporter]
 publish = Importers
@ -172,6 +172,12 @@ publish = Item
 subscribe = Images
 publish = Item,Tags
 ######## TITLES ########
 [CEDetector]
 subscribe = Titles
 publish = Tags
 ######## CORE ########
 [Tags]