Skip to content

Commit

Permalink
Merge pull request #3 from Patrick-Lapid/tokenization
Browse files Browse the repository at this point in the history
Tokenization
  • Loading branch information
Patrick-Lapid authored Sep 18, 2021
2 parents 5e2e409 + a8069e8 commit e638f75
Showing 1 changed file with 73 additions and 18 deletions.
91 changes: 73 additions & 18 deletions preprocessing/test.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,81 @@
import nltk
# nltk.download('punkt')
import os
from bs4 import BeautifulSoup
import requests
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import filestring
def main():
    """Entry point: gather input sources, then tokenize them.

    Exactly one input mode is active at a time; the alternatives are
    kept as commented-out examples, as in the original script.
    """
    fileNames = []   # filenames collected by singleFile/multipleFiles/directory
    words = []       # word tokens produced by the active mode
    sentences = []   # sentence tokens produced by the active mode

    # UNCOMMENT FOR FUNCTIONS

    # Capture the return value: URL() rebinds its locals, so the lists
    # passed in are NOT mutated in place.
    words, sentences = URL(words, sentences)

    # singleFile(fileNames)
    # tokenizeFiles(fileNames)

    # multipleFiles(fileNames)
    # tokenizeFiles(fileNames)

    # directory(fileNames)
    # tokenizeFiles(fileNames)



# Tokenizes from URL
def URL(words, sentences):
    """Fetch a web page and tokenize its visible text.

    Prompts the user for a URL, downloads it, strips the HTML with
    BeautifulSoup, prints the raw text, and returns a tuple of
    (word_tokens, sentence_tokens).

    The *words*/*sentences* parameters are kept for interface
    compatibility; results are returned, not appended in place.
    """
    url = input("Enter URL: ")
    html = requests.get(url)
    raw = BeautifulSoup(html.text, 'html.parser').get_text()
    print(raw)
    # Bug fix: the original stored the tokens in a local `wordTokens`
    # and then returned the stale `words` argument unchanged; bind the
    # freshly computed tokens to the name that is returned.
    words = word_tokenize(raw)
    sentences = sent_tokenize(raw)
    # Turns text into nltk object, concordance searches and returns search keyword
    # wordObj = nltk.Text(wordTokens)
    # words = wordObj.concordance("SEARCH_KEY_WORD")
    return words, sentences


# Pulls filenames from Directory
def directory(fileNames):
    """Append every .txt file in a directory to *fileNames* (in place).

    Prompts for a directory path; pressing ENTER uses the current
    working directory.
    """
    path = os.getcwd()
    usrInput = input("Enter full path (ENTER for working directory): ")
    if usrInput != "":
        path = usrInput
    for name in os.listdir(path):
        if name.endswith(".txt"):
            # Bug fix: bare filenames from a non-cwd directory could not
            # be opened later; join each name with the directory it came
            # from so tokenizeFiles() can open it.
            fileNames.append(os.path.join(path, name))

# Single and Multiple files
def singleFile(fileNames):
    """Prompt for one filename; append it to *fileNames* and return the list.

    Bug fix: the original rebound the parameter to a brand-new list, so
    the caller's list was never updated — and main() ignores the return
    value. Appending mutates the shared list in place; the return is
    kept for backward compatibility.
    """
    usrInput = input("Enter file name: ")
    fileNames.append(usrInput)
    return fileNames

def multipleFiles(fileNames):
    """Prompt for filenames until the user presses ENTER on an empty line.

    Each non-empty name is appended to *fileNames* (mutated in place);
    the list is also returned for backward compatibility.

    Bug fixes vs. the original: the first name is no longer recorded
    twice (it was both the list initializer and the first loop append),
    an empty first entry is no longer kept, and the caller's list is
    mutated instead of being shadowed by a new local list.
    """
    usrInput = input("Enter file name: ")
    while usrInput != "":
        fileNames.append(usrInput)
        usrInput = input("Enter file name (ENTER to terminate): ")
    return fileNames


# Tokenizes txt files
def tokenizeFiles(fileNames):
    """Read each file in *fileNames* and print its word and sentence tokens.

    Files are opened as UTF-8 text. The first I/O failure is reported
    to the user (not raised) and processing stops, as in the original.
    """
    try:
        for fileName in fileNames:
            with open(fileName, encoding='utf-8') as file:
                text = file.read()
                words = word_tokenize(text)
                sentences = sent_tokenize(text)
                print('Words: ', words, '\nSentences: ', sentences)

    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
    # NOTE(review): IOError is an alias of OSError in Python 3, so this
    # handler is unreachable; kept only to preserve the original shape.
    except OSError:
        print(f"OS error trying to open {fileName}")

# Guard the entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()

0 comments on commit e638f75

Please sign in to comment.