chg: [Crawler] add docs

This commit is contained in:
Terrtia 2018-09-27 11:14:29 +02:00
parent c49e871ba8
commit 04b9d9fc1d
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
5 changed files with 107 additions and 34 deletions

View file

@ -96,3 +96,31 @@ In AIL, you can track terms, set of terms and even regexes without creating a de
- You can track a term by simply putting it in the box.
- You can track a set of terms by simply putting terms in an array surrounded by the '\' character. You can also set a custom threshold regarding the number of terms that must match to trigger the detection. For example, if you want to track the terms _term1_ and _term2_ at the same time, you can use the following rule: `\[term1, term2, [100]]\`
- You can track regexes as easily as tracking a term. You just have to put your regex in the box surrounded by the '/' character. For example, if you want to track the regex matching all email addresses having the domain _domain.net_, you can use the following aggressive rule: `/*.domain.net/`.
Crawler
---------------------
In AIL, you can crawl hidden services.
Two types of configuration [explanation for what]:
1) use local Splash dockers (use the same host for Splash servers and AIL)
2) use remote Splash servers
- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost Splash server is used)
- (Splash host) Setup your tor proxy[is already installed]:
- Add the following line in ``/etc/tor/torrc``: ``SOCKSPolicy accept 172.17.0.0/16``
(for a Linux docker, the localhost IP is 172.17.0.1; this should be adapted for other platforms)
- Restart the tor proxy: ``sudo service tor restart``
- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config_absolute_path>] [-p <port_start>] [-n <number_of_splash>]``
All the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check the status of all Splash servers.
- (AIL host) Edit the ``/bin/packages/config.cfg`` file:
- In the crawler section, set ``activate_crawler`` to ``True``
- Change the IP address of Splash servers if needed (remote only)
- Set ``splash_onion_port`` according to the port numbers of your Splash servers that are using the tor proxy. These port numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052).
- (AIL host) Launch all AIL crawler scripts using: ``./bin/LAUNCH.sh -c``

View file

@ -203,11 +203,16 @@ function launching_scripts {
function launching_crawler {
CONFIG=$AIL_BIN/packages/config.cfg
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
echo $lport
IFS='-' read -ra PORTS <<< "$lport"
if [ ${#PORTS[@]} -eq 1 ]
then
first_port=${PORTS[0]}
last_port=${PORTS[0]}
else
first_port=${PORTS[0]}
last_port=${PORTS[1]}
fi
screen -dmS "Crawler_AIL"
sleep 0.1
@ -465,7 +470,7 @@ function launch_all {
while [ "$1" != "" ]; do
case $1 in
-l | --launchAuto ) launch_all "automatic"; launching_crawler
-l | --launchAuto ) launch_all "automatic";
;;
-k | --killAll ) killall;
;;

View file

@ -240,4 +240,4 @@ db = 0
activate_crawler = True
crawler_depth_limit = 1
splash_url_onion = http://127.0.0.1
splash_onion_port = 8050-8050
splash_onion_port = 8050-8052

View file

@ -1,12 +1,12 @@
#!/bin/bash
# Print the command-line help of this script and abort.
# Arguments: none (the script name is taken from $0)
# Outputs:   the complete help text, once, on stderr
# Exits:     always terminates the calling shell with status 1
usage() {
  # One redirection for the whole group so every help line (not only the
  # first one) lands on stderr.
  {
    echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n <number_of_splash_servers>]"
    echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"
    echo " -p: number of the first splash server port number. This number is incremented for the others splash server"
    echo " -n: number of splash servers to start"
    echo ""
    echo "example:"
    echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"
  } 1>&2
  exit 1
}
@ -29,8 +29,7 @@ done
shift $((OPTIND-1))
if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
#usage
echo "usage"
usage;
fi
screen -dmS "Docker_Splash"

View file

@ -1,15 +1,56 @@
#!/bin/bash
read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r
echo # (optional) move to a new line
if [[ $REPLY =~ ^[Yy]$ ]]
then
# Install Docker through apt and fetch the Splash image.
# Arguments: none
# Outputs:   whatever apt and docker print
# Returns:   the status of the failing step, or the status of the image
#            pull when the package installation succeeded
install_docker() {
  # install docker (once); stop here when it fails, since the pull below
  # needs a working docker binary
  sudo apt install docker.io || return
  # pull the Splash docker image
  sudo docker pull scrapinghub/splash
}
# Source ./AILENV/bin/activate (resolved against the current working
# directory) and then let pip3 install/upgrade everything listed in
# crawler_requirements.txt.
# Arguments: none
# Returns:   the exit status of the pip3 invocation
install_python_requirement() {
  source ./AILENV/bin/activate

  local -r requirements=crawler_requirements.txt
  pip3 install -U -r "${requirements}"
}
# Interactive installation: ask (single keypress) whether docker should be
# installed, then install the Python requirements in every case.
# Arguments: none (the answer is read from stdin into REPLY)
# Returns:   the exit status of install_python_requirement
install_all() {
  read -r -n 1 -p "Do you want to install docker? (use local splash server) [y/n] "
  # finish the prompt line, the single-key read does not echo a newline
  echo

  # only an exact "y" or "Y" opts in to the docker installation
  case "${REPLY}" in
    [Yy])
      install_docker
      ;;
  esac

  install_python_requirement
}
# Print the command-line help of crawler_hidden_services_install.sh and abort.
# Arguments: none
# Outputs:   the complete help text on stderr
# Exits:     always terminates the calling shell with status 1
usage() {
  # Redirect the whole group so every help line (not only the first one)
  # goes to stderr.
  {
    echo "Usage: crawler_hidden_services_install.sh [-y | -n]"
    echo " -y: install docker"
    echo " -n: don't install docker"
    echo ""
    echo "example:"
    echo "crawler_hidden_services_install.sh -y"
  } 1>&2
  exit 1
}
# Entry point: pick the installation mode from the first command-line argument.
#   (no argument)  install_all, then leave the script immediately
#   -y | --yes     install docker, then the Python requirements
#   -n | --no      install the Python requirements only
#   anything else  print the help text (usage exits with status 1)
# A single case replaces the former if/else + case pair, whose empty-string
# arm could never be reached.
case "$1" in
  "")
    install_all
    # bare exit: the script ends here with install_all's status
    exit
    ;;
  -y|--yes)
    install_docker
    install_python_requirement
    ;;
  -n|--no)
    install_python_requirement
    ;;
  *) # unknown option
    usage
    ;;
esac
# pull splah docker
sudo docker pull scrapinghub/splash
. ./AILENV/bin/activate
pip3 install -U -r pip3_packages_requirement.txt