mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-14 02:28:23 +00:00
-s option added to find similar documents
By default, the index does not store the vector of the document (Whoosh document schema). The -s option won't work unless you change the schema of the index to store it for the content. Whether to do so depends on your storage strategy.
This commit is contained in:
parent
fd6e1a8436
commit
0b4a80b7ea
1 changed files with 12 additions and 0 deletions
|
@ -35,11 +35,13 @@ argParser.add_argument('-n', action='store_true', default=False, help='return nu
|
||||||
argParser.add_argument('-t', action='store_true', default=False, help='dump top 500 terms')
|
argParser.add_argument('-t', action='store_true', default=False, help='dump top 500 terms')
|
||||||
argParser.add_argument('-l', action='store_true', default=False, help='dump all terms encountered in indexed documents')
|
argParser.add_argument('-l', action='store_true', default=False, help='dump all terms encountered in indexed documents')
|
||||||
argParser.add_argument('-f', action='store_true', default=False, help='dump each matching document')
|
argParser.add_argument('-f', action='store_true', default=False, help='dump each matching document')
|
||||||
|
argParser.add_argument('-s', action='append', help='search similar documents')
|
||||||
|
|
||||||
args = argParser.parse_args()
|
args = argParser.parse_args()
|
||||||
|
|
||||||
from whoosh import index
|
from whoosh import index
|
||||||
from whoosh.fields import *
|
from whoosh.fields import *
|
||||||
|
import whoosh
|
||||||
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
|
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
|
||||||
|
|
||||||
ix = index.open_dir(indexpath)
|
ix = index.open_dir(indexpath)
|
||||||
|
@ -62,6 +64,16 @@ if args.t:
|
||||||
print (x)
|
print (x)
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
|
if args.s:
|
||||||
|
# By default, the index is not storing the vector of the document (Whoosh
|
||||||
|
# document schema). It won't work if you don't change the schema of the
|
||||||
|
# index for the content. It depends on your storage strategy.
|
||||||
|
docnum = ix.searcher().document_number(path=args.s)
|
||||||
|
r = ix.searcher().more_like(docnum, "content")
|
||||||
|
for hit in r:
|
||||||
|
print(hit["path"])
|
||||||
|
exit(0)
|
||||||
|
|
||||||
if args.q is None:
|
if args.q is None:
|
||||||
argParser.print_help()
|
argParser.print_help()
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
Loading…
Reference in a new issue