#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This file is part of AIL framework - Analysis Information Leak framework
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Copyright (c) 2014 Alexandre Dulaunoy - a@foo.be
import argparse
import gzip
import os
import sys
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader


def readdoc(path=None):
    # Return the decompressed content of an indexed document, or False if no
    # path was given. 'rt' mode so callers get text rather than raw gzip bytes.
    if path is None:
        return False
    with gzip.open(path, 'rt') as f:
        return f.read()

config_loader = ConfigLoader.ConfigLoader()
# Indexer configuration - index dir and schema setup
indexpath = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Indexer", "path"))
indexertype = config_loader.get_config_str("Indexer", "type")
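# Note: the "Indexer" config section is expected to provide at least a "path"
# (index directory relative to AIL_HOME) and a "type" key. Illustrative
# snippet only; actual values depend on your installation:
#
#   [Indexer]
#   type = whoosh
#   path = indexdir
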
argParser = argparse.ArgumentParser(description='Fulltext search for AIL')
argParser.add_argument('-q', action='append', help='query to lookup (one or more)')
argParser.add_argument('-n', action='store_true', default=False, help='return the number of indexed documents')
argParser.add_argument('-t', action='store_true', default=False, help='dump top 500 terms')
argParser.add_argument('-l', action='store_true', default=False, help='dump all terms encountered in indexed documents')
argParser.add_argument('-f', action='store_true', default=False, help='dump each matching document')
argParser.add_argument('-v', action='store_true', default=False, help='include the filepath of each matching document (used with -f)')
argParser.add_argument('-s', action='append', help='search similar documents')
args = argParser.parse_args()
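
# Illustrative invocations (assuming AIL_HOME and AIL_BIN are exported and the
# Whoosh index has already been built; the paths below are examples only):
#
#   python3 bin/indexer_lookup.py -n               # number of indexed documents
#   python3 bin/indexer_lookup.py -t               # top 500 terms in "content"
#   python3 bin/indexer_lookup.py -q leak -q db    # fulltext query "leak db"
#   python3 bin/indexer_lookup.py -q apikey -f -v  # dump matching docs with paths
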
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
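# open_dir() assumes the index directory already exists and has been populated
# beforehand; Whoosh raises EmptyIndexError if it has not.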
ix = index.open_dir(indexpath)
from whoosh.qparser import QueryParser

if args.n:
    print(ix.doc_count_all())
    exit(0)

if args.l:
    xr = ix.searcher().reader()
    for x in xr.lexicon("content"):
        print(x)
    exit(0)

if args.t:
    xr = ix.searcher().reader()
    for x in xr.most_frequent_terms("content", number=500, prefix=''):
        print(x)
    exit(0)

if args.s:
    # By default the index does not keep a term vector for the document
    # content (see the Whoosh schema above), so more_like() only works if the
    # index was built with a vectored "content" field; see the illustrative
    # schema sketch below. This depends on your storage strategy.
    searcher = ix.searcher()
    for path in args.s:
        docnum = searcher.document_number(path=path)
        r = searcher.more_like(docnum, "content")
        for hit in r:
            print(hit["path"])
    exit(0)

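# Illustrative only: for more_like() above to return results, the index would
# have to be rebuilt with a schema whose "content" field keeps a term vector
# (this is not the schema AIL builds by default), e.g. something along the
# lines of:
#
#   schema_with_vectors = Schema(title=TEXT(stored=True),
#                                path=ID(stored=True),
#                                content=TEXT(vector=True))
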
if args.q is None:
    argParser.print_help()
    exit(1)

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(" ".join(args.q))
    results = searcher.search(query, limit=None)
    for x in results:
        if args.f:
            if args.v:
                # print the stored filepath before the document content
                print(x['path'])
            print(readdoc(path=x['path']))
        else:
            print(x['path'])
        # blank line between results
        print()