From fd6e1a84367b128537719c52371c3fc7457df3ac Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Tue, 12 Aug 2014 13:26:56 +0200
Subject: [PATCH 1/2] -f option added: dump full document for each match
---
bin/tests/indexer_lookup.py | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/bin/tests/indexer_lookup.py b/bin/tests/indexer_lookup.py
index 305ae236..110d6086 100644
--- a/bin/tests/indexer_lookup.py
+++ b/bin/tests/indexer_lookup.py
@@ -13,6 +13,13 @@
import ConfigParser
import argparse
import sys
+import gzip
+
+def readdoc(path=None):  # return the decompressed content of the gzip file at path
+    if path is None:
+        return False  # no path given: hand back False instead of raising
+    with gzip.open(path, 'r') as f:  # context manager closes the handle (was leaked)
+        return f.read()
configfile = '../packages/config.cfg'
cfg = ConfigParser.ConfigParser()
@@ -27,6 +34,8 @@ argParser.add_argument('-q', action='append', help='query to lookup (one or more
argParser.add_argument('-n', action='store_true', default=False, help='return numbers of indexed documents')
argParser.add_argument('-t', action='store_true', default=False, help='dump top 500 terms')
argParser.add_argument('-l', action='store_true', default=False, help='dump all terms encountered in indexed documents')
+argParser.add_argument('-f', action='store_true', default=False, help='dump each matching document')
+
args = argParser.parse_args()
from whoosh import index
@@ -61,5 +70,8 @@ with ix.searcher() as searcher:
query = QueryParser("content", ix.schema).parse(" ".join(args.q))
results = searcher.search(query, limit=None)
for x in results:
- print (x)
-
+        if args.f:
+            print (readdoc(path=x["path"]))  # -f: dump the whole document behind this hit
+        else:
+            print (x["path"])  # stored "path" field by name; items()[0] relied on dict order
+ print
From 0b4a80b7ea9077d644f39195343066075c3b8043 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Tue, 12 Aug 2014 13:42:26 +0200
Subject: [PATCH 2/2] -s option added to find similar documents
By default, the index does not store the document vector (Whoosh
document schema). The -s option won't work unless you change the index
schema for the content field. It depends on your storage strategy.
---
bin/tests/indexer_lookup.py | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/bin/tests/indexer_lookup.py b/bin/tests/indexer_lookup.py
index 110d6086..8e0e49fc 100644
--- a/bin/tests/indexer_lookup.py
+++ b/bin/tests/indexer_lookup.py
@@ -35,11 +35,13 @@ argParser.add_argument('-n', action='store_true', default=False, help='return nu
argParser.add_argument('-t', action='store_true', default=False, help='dump top 500 terms')
argParser.add_argument('-l', action='store_true', default=False, help='dump all terms encountered in indexed documents')
argParser.add_argument('-f', action='store_true', default=False, help='dump each matching document')
+argParser.add_argument('-s', action='append', help='search similar documents')
args = argParser.parse_args()
from whoosh import index
from whoosh.fields import *
+import whoosh
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = index.open_dir(indexpath)
@@ -62,6 +64,16 @@ if args.t:
print (x)
exit(0)
+if args.s:
+    # The default schema stores no term vector for "content": similarity lookup
+    # only works once the index schema is changed (depends on storage strategy).
+    with ix.searcher() as searcher:  # one searcher, closed on exit
+        for path in args.s:  # action='append' makes args.s a list of paths
+            docnum = searcher.document_number(path=path)  # None if not indexed
+            for hit in ([] if docnum is None else searcher.more_like(docnum, "content")):
+                print(hit["path"])
+    exit(0)
+
if args.q is None:
argParser.print_help()
exit(1)