mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-22 22:27:17 +00:00
chg: [Crawler] add docs
This commit is contained in:
parent
c49e871ba8
commit
04b9d9fc1d
5 changed files with 107 additions and 34 deletions
28
HOWTO.md
28
HOWTO.md
|
@ -96,3 +96,31 @@ In AIL, you can track terms, set of terms and even regexes without creating a de
|
|||
- You can track a term by simply putting it in the box.
|
||||
- You can track a set of terms by simply putting terms in an array surrounded by the '\' character. You can also set a custom threshold regarding the number of terms that must match to trigger the detection. For example, if you want to track the terms _term1_ and _term2_ at the same time, you can use the following rule: `\[term1, term2, [100]]\`
|
||||
- You can track regexes as easily as tracking a term. You just have to put your regex in the box surrounded by the '/' character. For example, if you want to track the regex matching all email address having the domain _domain.net_, you can use the following aggressive rule: `/*.domain.net/`.
|
||||
|
||||
|
||||
Crawler
|
||||
---------------------
|
||||
In AIL, you can crawl hidden services.
|
||||
|
||||
There are two types of configuration [explanation for what]:
|
||||
1) use local Splash dockers (use the same host for Splash servers and AIL)
|
||||
2) use remote Splash servers
|
||||
|
||||
- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost splash server is used)
|
||||
- (Splash host) Set up your tor proxy [skip if it is already installed]:
|
||||
- Add the following line in ``/etc/tor/torrc``: ``SOCKSPolicy accept 172.17.0.0/16``
|
||||
(for a Linux docker, the localhost IP is 172.17.0.1; this should be adapted for other platforms)
|
||||
- Restart the tor proxy: ``sudo service tor restart``
|
||||
|
||||
- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]``
|
||||
All the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check the status of all Splash servers.
|
||||
|
||||
- (AIL host) Edit the ``/bin/packages/config.cfg`` file:
|
||||
- In the crawler section, set ``activate_crawler`` to ``True``
|
||||
- Change the IP address of Splash servers if needed (remote only)
|
||||
- Set ``splash_onion_port`` according to your Splash servers' port numbers that use the tor proxy. These port numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052).
|
||||
|
||||
- (AIL host) launch all AIL crawler scripts using: ``./bin/LAUNCH.sh -c``
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -201,23 +201,28 @@ function launching_scripts {
|
|||
}
|
||||
|
||||
function launching_crawler {
|
||||
CONFIG=$AIL_BIN/packages/config.cfg
|
||||
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
|
||||
echo $lport
|
||||
CONFIG=$AIL_BIN/packages/config.cfg
|
||||
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
|
||||
|
||||
IFS='-' read -ra PORTS <<< "$lport"
|
||||
first_port=${PORTS[0]}
|
||||
last_port=${PORTS[1]}
|
||||
IFS='-' read -ra PORTS <<< "$lport"
|
||||
if [ ${#PORTS[@]} -eq 1 ]
|
||||
then
|
||||
first_port=${PORTS[0]}
|
||||
last_port=${PORTS[0]}
|
||||
else
|
||||
first_port=${PORTS[0]}
|
||||
last_port=${PORTS[1]}
|
||||
fi
|
||||
|
||||
screen -dmS "Crawler_AIL"
|
||||
sleep 0.1
|
||||
screen -dmS "Crawler_AIL"
|
||||
sleep 0.1
|
||||
|
||||
for ((i=first_port;i<=last_port;i++)); do
|
||||
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x'
|
||||
sleep 0.1
|
||||
done
|
||||
for ((i=first_port;i<=last_port;i++)); do
|
||||
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x'
|
||||
sleep 0.1
|
||||
done
|
||||
|
||||
echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
|
||||
echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
|
||||
}
|
||||
|
||||
function shutting_down_redis {
|
||||
|
@ -465,7 +470,7 @@ function launch_all {
|
|||
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
-l | --launchAuto ) launch_all "automatic"; launching_crawler
|
||||
-l | --launchAuto ) launch_all "automatic";
|
||||
;;
|
||||
-k | --killAll ) killall;
|
||||
;;
|
||||
|
|
|
@ -240,4 +240,4 @@ db = 0
|
|||
activate_crawler = True
|
||||
crawler_depth_limit = 1
|
||||
splash_url_onion = http://127.0.0.1
|
||||
splash_onion_port = 8050-8050
|
||||
splash_onion_port = 8050-8052
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
#!/bin/bash
|
||||
|
||||
usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n <number_of_splash_servers>]" 1>&2;
|
||||
echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"
|
||||
echo " -p: number of the first splash server port number. This number is incremented for the others splash server"
|
||||
echo " -n: number of splash servers to start"
|
||||
echo ""
|
||||
echo "example:"
|
||||
echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"
|
||||
echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)";
|
||||
echo " -p: number of the first splash server port number. This number is incremented for the others splash server";
|
||||
echo " -n: number of splash servers to start";
|
||||
echo "";
|
||||
echo "example:";
|
||||
echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3";
|
||||
exit 1;
|
||||
}
|
||||
|
||||
|
@ -29,8 +29,7 @@ done
|
|||
shift $((OPTIND-1))
|
||||
|
||||
if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
|
||||
#usage
|
||||
echo "usage"
|
||||
usage;
|
||||
fi
|
||||
|
||||
screen -dmS "Docker_Splash"
|
||||
|
|
|
@ -1,15 +1,56 @@
|
|||
#!/bin/bash
|
||||
|
||||
read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r
|
||||
echo # (optional) move to a new line
|
||||
if [[ $REPLY =~ ^[Yy]$ ]]
|
||||
then
|
||||
install_docker() {
|
||||
# install docker
|
||||
sudo apt install docker.io
|
||||
sudo apt install docker.io;
|
||||
|
||||
# pull splash docker
|
||||
sudo docker pull scrapinghub/splash;
|
||||
}
|
||||
|
||||
install_python_requirement() {
|
||||
. ./AILENV/bin/activate;
|
||||
pip3 install -U -r crawler_requirements.txt;
|
||||
}
|
||||
|
||||
install_all() {
|
||||
read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r
|
||||
echo # (optional) move to a new line
|
||||
if [[ $REPLY =~ ^[Yy]$ ]]
|
||||
then
|
||||
install_docker;
|
||||
fi
|
||||
install_python_requirement;
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "Usage: crawler_hidden_services_install.sh [-y | -n]" 1>&2;
|
||||
echo " -y: install docker"
|
||||
echo " -n: don't install docker"
|
||||
echo ""
|
||||
echo "example:"
|
||||
echo "crawler_hidden_services_install.sh -y"
|
||||
exit 1;
|
||||
}
|
||||
|
||||
if [[ $1 == "" ]]; then
|
||||
install_all;
|
||||
exit;
|
||||
else
|
||||
key="$1"
|
||||
case $key in
|
||||
"")
|
||||
install_all;
|
||||
;;
|
||||
-y|--yes)
|
||||
install_docker;
|
||||
install_python_requirement;
|
||||
;;
|
||||
-n|--no)
|
||||
install_python_requirement;
|
||||
;;
|
||||
*) # unknown option
|
||||
usage;
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# pull splash docker
|
||||
sudo docker pull scrapinghub/splash
|
||||
|
||||
. ./AILENV/bin/activate
|
||||
pip3 install -U -r pip3_packages_requirement.txt
|
||||
|
|
Loading…
Reference in a new issue