mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-23 06:37:15 +00:00
chg: [Crawler] add docs
This commit is contained in:
parent
c49e871ba8
commit
04b9d9fc1d
5 changed files with 107 additions and 34 deletions
28
HOWTO.md
28
HOWTO.md
|
@ -96,3 +96,31 @@ In AIL, you can track terms, set of terms and even regexes without creating a de
|
||||||
- You can track a term by simply putting it in the box.
|
- You can track a term by simply putting it in the box.
|
||||||
- You can track a set of terms by simply putting terms in an array surrounded by the '\' character. You can also set a custom threshold regarding the number of terms that must match to trigger the detection. For example, if you want to track the terms _term1_ and _term2_ at the same time, you can use the following rule: `\[term1, term2, [100]]\`
|
- You can track a set of terms by simply putting terms in an array surrounded by the '\' character. You can also set a custom threshold regarding the number of terms that must match to trigger the detection. For example, if you want to track the terms _term1_ and _term2_ at the same time, you can use the following rule: `\[term1, term2, [100]]\`
|
||||||
- You can track regexes as easily as tracking a term. You just have to put your regex in the box surrounded by the '/' character. For example, if you want to track the regex matching all email addresses having the domain _domain.net_, you can use the following aggressive rule: `/*.domain.net/`.
|
- You can track regexes as easily as tracking a term. You just have to put your regex in the box surrounded by the '/' character. For example, if you want to track the regex matching all email addresses having the domain _domain.net_, you can use the following aggressive rule: `/*.domain.net/`.
|
||||||
|
|
||||||
|
|
||||||
|
Crawler
|
||||||
|
---------------------
|
||||||
|
In AIL, you can crawl hidden services.
|
||||||
|
|
||||||
|
Two types of configuration [explanation for what]:
|
||||||
|
1) use local Splash dockers (use the same host for Splash servers and AIL)
|
||||||
|
2) use remote Splash servers
|
||||||
|
|
||||||
|
- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost Splash server is used)
|
||||||
|
- (Splash host) Set up your tor proxy [is already installed]:
|
||||||
|
- Add the following line in ``/etc/tor/torrc``: ``SOCKSPolicy accept 172.17.0.0/16``
|
||||||
|
(for a Linux docker, the localhost IP is 172.17.0.1; this should be adapted for other platforms)
|
||||||
|
- Restart the tor proxy: ``sudo service tor restart``
|
||||||
|
|
||||||
|
- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config_absolute_path>] [-p <port_start>] [-n <number_of_splash_servers>]``
|
||||||
|
All the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check the status of all Splash servers.
|
||||||
|
|
||||||
|
- (AIL host) Edit the ``/bin/packages/config.cfg`` file:
|
||||||
|
- In the crawler section, set ``activate_crawler`` to ``True``
|
||||||
|
- Change the IP address of Splash servers if needed (remote only)
|
||||||
|
- Set ``splash_onion_port`` according to the port numbers of your Splash servers that use the tor proxy. These port numbers should be given as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052).
|
||||||
|
|
||||||
|
- (AIL host) Launch all AIL crawler scripts using: ``./bin/LAUNCH.sh -c``
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -201,23 +201,28 @@ function launching_scripts {
|
||||||
}
|
}
|
||||||
|
|
||||||
function launching_crawler {
|
function launching_crawler {
|
||||||
CONFIG=$AIL_BIN/packages/config.cfg
|
CONFIG=$AIL_BIN/packages/config.cfg
|
||||||
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
|
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
|
||||||
echo $lport
|
|
||||||
|
|
||||||
IFS='-' read -ra PORTS <<< "$lport"
|
IFS='-' read -ra PORTS <<< "$lport"
|
||||||
first_port=${PORTS[0]}
|
if [ ${#PORTS[@]} -eq 1 ]
|
||||||
last_port=${PORTS[1]}
|
then
|
||||||
|
first_port=${PORTS[0]}
|
||||||
|
last_port=${PORTS[0]}
|
||||||
|
else
|
||||||
|
first_port=${PORTS[0]}
|
||||||
|
last_port=${PORTS[1]}
|
||||||
|
fi
|
||||||
|
|
||||||
screen -dmS "Crawler_AIL"
|
screen -dmS "Crawler_AIL"
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
|
|
||||||
for ((i=first_port;i<=last_port;i++)); do
|
for ((i=first_port;i<=last_port;i++)); do
|
||||||
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x'
|
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x'
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
done
|
done
|
||||||
|
|
||||||
echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
|
echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
|
||||||
}
|
}
|
||||||
|
|
||||||
function shutting_down_redis {
|
function shutting_down_redis {
|
||||||
|
@ -465,7 +470,7 @@ function launch_all {
|
||||||
|
|
||||||
while [ "$1" != "" ]; do
|
while [ "$1" != "" ]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
-l | --launchAuto ) launch_all "automatic"; launching_crawler
|
-l | --launchAuto ) launch_all "automatic";
|
||||||
;;
|
;;
|
||||||
-k | --killAll ) killall;
|
-k | --killAll ) killall;
|
||||||
;;
|
;;
|
||||||
|
|
|
@ -240,4 +240,4 @@ db = 0
|
||||||
activate_crawler = True
|
activate_crawler = True
|
||||||
crawler_depth_limit = 1
|
crawler_depth_limit = 1
|
||||||
splash_url_onion = http://127.0.0.1
|
splash_url_onion = http://127.0.0.1
|
||||||
splash_onion_port = 8050-8050
|
splash_onion_port = 8050-8052
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n <number_of_splash_servers>]" 1>&2;
|
usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n <number_of_splash_servers>]" 1>&2;
|
||||||
echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"
|
echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)";
|
||||||
echo " -p: number of the first splash server port number. This number is incremented for the others splash server"
|
echo " -p: number of the first splash server port number. This number is incremented for the others splash server";
|
||||||
echo " -n: number of splash servers to start"
|
echo " -n: number of splash servers to start";
|
||||||
echo ""
|
echo "";
|
||||||
echo "example:"
|
echo "example:";
|
||||||
echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"
|
echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3";
|
||||||
exit 1;
|
exit 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,8 +29,7 @@ done
|
||||||
shift $((OPTIND-1))
|
shift $((OPTIND-1))
|
||||||
|
|
||||||
if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
|
if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
|
||||||
#usage
|
usage;
|
||||||
echo "usage"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
screen -dmS "Docker_Splash"
|
screen -dmS "Docker_Splash"
|
||||||
|
|
|
@ -1,15 +1,56 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r
|
install_docker() {
|
||||||
echo # (optional) move to a new line
|
|
||||||
if [[ $REPLY =~ ^[Yy]$ ]]
|
|
||||||
then
|
|
||||||
# install docker
|
# install docker
|
||||||
sudo apt install docker.io
|
sudo apt install docker.io;
|
||||||
|
|
||||||
|
# pull splah docker
|
||||||
|
sudo docker pull scrapinghub/splash;
|
||||||
|
}
|
||||||
|
|
||||||
|
install_python_requirement() {
|
||||||
|
. ./AILENV/bin/activate;
|
||||||
|
pip3 install -U -r crawler_requirements.txt;
|
||||||
|
}
|
||||||
|
|
||||||
|
install_all() {
|
||||||
|
read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r
|
||||||
|
echo # (optional) move to a new line
|
||||||
|
if [[ $REPLY =~ ^[Yy]$ ]]
|
||||||
|
then
|
||||||
|
install_docker;
|
||||||
|
fi
|
||||||
|
install_python_requirement;
|
||||||
|
}
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo "Usage: crawler_hidden_services_install.sh [-y | -n]" 1>&2;
|
||||||
|
echo " -y: install docker"
|
||||||
|
echo " -n: don't install docker"
|
||||||
|
echo ""
|
||||||
|
echo "example:"
|
||||||
|
echo "crawler_hidden_services_install.sh -y"
|
||||||
|
exit 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if [[ $1 == "" ]]; then
|
||||||
|
install_all;
|
||||||
|
exit;
|
||||||
|
else
|
||||||
|
key="$1"
|
||||||
|
case $key in
|
||||||
|
"")
|
||||||
|
install_all;
|
||||||
|
;;
|
||||||
|
-y|--yes)
|
||||||
|
install_docker;
|
||||||
|
install_python_requirement;
|
||||||
|
;;
|
||||||
|
-n|--no)
|
||||||
|
install_python_requirement;
|
||||||
|
;;
|
||||||
|
*) # unknown option
|
||||||
|
usage;
|
||||||
|
;;
|
||||||
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# pull splah docker
|
|
||||||
sudo docker pull scrapinghub/splash
|
|
||||||
|
|
||||||
. ./AILENV/bin/activate
|
|
||||||
pip3 install -U -r pip3_packages_requirement.txt
|
|
||||||
|
|
Loading…
Reference in a new issue