Skip to content

Commit

Permalink
Merge pull request #3 from Patrick-Lapid/tokenization
Browse files Browse the repository at this point in the history
Tokenization
  • Loading branch information
Patrick-Lapid authored Sep 18, 2021
2 parents 5e2e409 + a8069e8 commit e638f75
Showing 1 changed file with 73 additions and 18 deletions.
91 changes: 73 additions & 18 deletions preprocessing/test.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,81 @@
import nltk
# nltk.download('punkt')
import os
from bs4 import BeautifulSoup
import requests
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import filestring
def main():
    """Entry point: gather input sources, then tokenize them.

    Exactly one input mode is active at a time; the alternatives are
    kept as commented-out examples, as in the original script.
    """
    fileNames = []   # filenames collected by singleFile/multipleFiles/directory
    words = []       # word tokens produced by the active mode
    sentences = []   # sentence tokens produced by the active mode

    # UNCOMMENT FOR FUNCTIONS

    # Capture the return value: URL() rebinds its locals, so the lists
    # passed in are NOT mutated in place.
    words, sentences = URL(words, sentences)

    # singleFile(fileNames)
    # tokenizeFiles(fileNames)

    # multipleFiles(fileNames)
    # tokenizeFiles(fileNames)

    # directory(fileNames)
    # tokenizeFiles(fileNames)



# Tokenizes from URL
def URL(words, sentences):
    """Fetch a web page and tokenize its visible text.

    Prompts the user for a URL, downloads it, strips the HTML with
    BeautifulSoup, prints the raw text, and returns a tuple of
    (word_tokens, sentence_tokens).

    The *words*/*sentences* parameters are kept for interface
    compatibility; results are returned, not appended in place.
    """
    url = input("Enter URL: ")
    html = requests.get(url)
    raw = BeautifulSoup(html.text, 'html.parser').get_text()
    print(raw)
    # Bug fix: the original stored the tokens in a local `wordTokens`
    # and then returned the stale `words` argument unchanged; bind the
    # freshly computed tokens to the name that is returned.
    words = word_tokenize(raw)
    sentences = sent_tokenize(raw)
    # Turns text into nltk object, concordance searches and returns search keyword
    # wordObj = nltk.Text(wordTokens)
    # words = wordObj.concordance("SEARCH_KEY_WORD")
    return words, sentences


# Pulls filenames from Directory
def directory(fileNames):
    """Append every .txt file in a directory to *fileNames* (in place).

    Prompts for a directory path; pressing ENTER uses the current
    working directory.
    """
    path = os.getcwd()
    usrInput = input("Enter full path (ENTER for working directory): ")
    if usrInput != "":
        path = usrInput
    for name in os.listdir(path):
        if name.endswith(".txt"):
            # Bug fix: bare filenames from a non-cwd directory could not
            # be opened later; join each name with the directory it came
            # from so tokenizeFiles() can open it.
            fileNames.append(os.path.join(path, name))

# Single and Multiple files
def singleFile(fileNames):
    """Prompt for one filename; append it to *fileNames* and return the list.

    Bug fix: the original rebound the parameter to a brand-new list, so
    the caller's list was never updated — and main() ignores the return
    value. Appending mutates the shared list in place; the return is
    kept for backward compatibility.
    """
    usrInput = input("Enter file name: ")
    fileNames.append(usrInput)
    return fileNames

def multipleFiles(fileNames):
    """Prompt for filenames until the user presses ENTER on an empty line.

    Each non-empty name is appended to *fileNames* (mutated in place);
    the list is also returned for backward compatibility.

    Bug fixes vs. the original: the first name is no longer recorded
    twice (it was both the list initializer and the first loop append),
    an empty first entry is no longer kept, and the caller's list is
    mutated instead of being shadowed by a new local list.
    """
    usrInput = input("Enter file name: ")
    while usrInput != "":
        fileNames.append(usrInput)
        usrInput = input("Enter file name (ENTER to terminate): ")
    return fileNames


# Tokenizes txt files
def tokenizeFiles(fileNames):
    """Read each file in *fileNames* and print its word and sentence tokens.

    Files are opened as UTF-8 text. The first I/O failure is reported
    to the user (not raised) and processing stops, as in the original.
    """
    try:
        for fileName in fileNames:
            with open(fileName, encoding='utf-8') as file:
                text = file.read()
                words = word_tokenize(text)
                sentences = sent_tokenize(text)
                print('Words: ', words, '\nSentences: ', sentences)

    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
    # NOTE(review): IOError is an alias of OSError in Python 3, so this
    # handler is unreachable; kept only to preserve the original shape.
    except OSError:
        print(f"OS error trying to open {fileName}")

# Guard the entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()

0 comments on commit e638f75

Please sign in to comment.