Skip to content

Commit

Permalink
Updated README.txt
Browse files Browse the repository at this point in the history
  • Loading branch information
bubbajoe committed Mar 14, 2018
1 parent 3f858c0 commit 034f855
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 8 deletions.
23 changes: 22 additions & 1 deletion README.txt
Original file line number Diff line number Diff line change
@@ -1 +1,22 @@
README.txt

About

By Joe Williams
Information Retrieval Programming

Usage

After the first run, the data should be serialized,
so later runs shouldn't take long at all to load it.

Once you see 'Enter Query:', you can enter 2 words,
or 2 words and a number. If you search 'love her', it
will find all the instances where the stems of the
words love and her are right next to each other. If
you search 'love her 3', it will find all instances
where the stems of the words love and her are at most
3 words from each other.

Enter '/exit' to exit the program.

Happy Coding Fellas
23 changes: 16 additions & 7 deletions pos_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import os.path # For checking whether a file exist

from nltk.stem import PorterStemmer as ps # For stemming and word tokenization
#from nltk.tokenize import sent_tokenize, word_tokenize #

def isint(s):
    """Return True if ``int(s)`` succeeds, False otherwise.

    Used to validate the optional third query argument before it is
    converted with ``int()``.

    Args:
        s: The value to test (normally a string token typed by the user).

    Returns:
        bool: True when ``int(s)`` does not raise, False when it raises
        ``ValueError`` (e.g. ``"abc"``, ``"3.5"``, ``""``) or ``TypeError``
        (e.g. ``None``).
    """
    try:
        int(s)
    except (TypeError, ValueError):
        # ValueError: malformed numeric text; TypeError: unsupported type.
        return False
    return True

# Takes a file that has a list of files
def getInputFiles(filelist):
Expand Down Expand Up @@ -44,8 +48,8 @@ def showPreview(positions,radius):
with open(docArr[doc_id]) as f:
wordArr = [a for a in preprocess(f.read()).split(' ') if a != ""]
result = " ".join(wordArr[word_index-radius:word_index+radius])
print(str(i+1)+": ..."+result+"- "+files[doc_id].split("/")[-1]+"... :"+str(word_index))
print()
print(str(i+1)+": ..."+result+"... "+files[doc_id].split("/")[-1])
print()

# Serialization/Positional Index
pi = {}
Expand All @@ -60,7 +64,7 @@ def showPreview(positions,radius):
pickle.dump(pi,f)

# User interface and positional index querying
while 1:
while True:
print("Enter Query: 'word word <int>'")
sys.stdout.write("'/exit' to close > ")
q = [a for a in input().lower().split(' ') if a != ""]
Expand All @@ -71,15 +75,20 @@ def showPreview(positions,radius):
word1, word2 = q
word1 = ps().stem(preprocess(word1).replace(' ',''))
word2 = ps().stem(preprocess(word2).replace(' ',''))
print(word1)
print("Loading... \n")
print("Searching... \n")
for doc1, index1 in pi[word1]:
for doc2, index2 in pi[word2]:
if doc1 != doc2: continue
if index1 == (index2 - 1): matches.append( (doc1,index1) )
showPreview(matches,5)
elif len(q) == 3:
word1, word2, l = q
if not isint(l):
print("arg 3 needs to be an int\n")
continue
word1 = ps().stem(preprocess(word1).replace(' ',''))
word2 = ps().stem(preprocess(word2).replace(' ',''))
print("Searching... \n")
rad = int(l)
for doc1, index1 in pi[word1]:
for doc2, index2 in pi[word2]:
Expand All @@ -88,4 +97,4 @@ def showPreview(positions,radius):
# when abs_pos is 0, the word is itself
if abs_pos <= rad and abs_pos != 0: matches.append( (doc1,index1) )
showPreview(matches, 5 if rad <= 5 else rad)
else: print("Needs to have 2 or 3 args")
else: print("Needs to have 2 or 3 args\n")

0 comments on commit 034f855

Please sign in to comment.