Commit

Add files via upload

Python-glitch authored Dec 5, 2023
1 parent 1527317 commit 10ce137
Showing 1 changed file with 285 additions and 0 deletions.
285 changes: 285 additions & 0 deletions submission/Number3_jakob.ipynb
@@ -0,0 +1,285 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"import numpy as np \n",
"import string \n",
"import pandas as pd \n",
"from numpy.linalg import norm\n",
"import nltk\n",
"import re \n",
"\n",
"# moduls for Stopwords \n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"\n",
"\n",
"#moduls for stemming \n",
"from nltk.stem.snowball import SnowballStemmer\n",
"from nltk.stem.lancaster import LancasterStemmer\n",
"from nltk.stem.porter import PorterStemmer\n",
"\n",
"#moduls for lemming \n",
"from nltk.stem import WordNetLemmatizer \n",
"nltk.download('averaged_perceptron_tagger')\n",
"from nltk.corpus import wordnet\n",
"\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"class StringSimilarity: \n",
" \n",
" \n",
" def __init__(self): \n",
" \n",
" #list with all documents \n",
" self.document_pool = {}\n",
" \n",
" self.vector_pool = {}\n",
" \n",
" #dictionary with all words -> without punctation and special characters \n",
" self.dictionary = set()\n",
" \n",
" \n",
" def add_documents(self,name, document): \n",
" \n",
" processed_document = StringSimilarity.main_cleaning(document)\n",
" \n",
" if processed_document not in list(self.document_pool.keys()): \n",
" \n",
" self.document_pool[name] = processed_document\n",
" \n",
" self.dictionary.update(set(processed_document))\n",
" \n",
" else: \n",
" raise ValueError(\"Text has already been added to pool\")\n",
" \n",
" \n",
" # methods to clean and prepare the text documents\n",
" \n",
" @staticmethod\n",
" def cleaning_text(text): \n",
" \n",
" text = text.strip()\n",
" text = re.sub(r'(?<=\\w)[_-]|[_-](?=\\w)', '', text)\n",
" text = re.sub(r'\\b(?:[a-zA-Z]\\.)+[a-zA-Z]?[,]*\\b', ' ', text)\n",
" text = re.sub(r\"\\W\", \" \", text) #remove non words char\n",
" text = re.sub(r\"\\d\", \" \", text) #remove digits char\n",
" text = re.sub(r\"[\\s]+\", \" \", text) # remove extra white space\n",
" text = text.lower() #lower char for matching\n",
" return text \n",
" \n",
" \n",
" \n",
" @staticmethod\n",
" def string_to_list(string1): \n",
" \n",
" clean_text = StringSimilarity.cleaning_text(string1)\n",
" \n",
" return clean_text.split()\n",
"\n",
" # removing stopwords \n",
" @staticmethod\n",
" def removing_stopwords(list_words): \n",
" stop_words = set(stopwords.words('english'))\n",
" text_without_stop = [word for word in list_words if word not in stop_words]\n",
" \n",
" return text_without_stop\n",
" \n",
" #stemming of words -> avoid dublicates \n",
" @staticmethod\n",
" def stemming_words(word_list): \n",
" \n",
" snowball = SnowballStemmer('english')\n",
" lancaster = LancasterStemmer()\n",
" porter = PorterStemmer()\n",
" \n",
" for stemmer in (snowball, lancaster, porter): \n",
" \n",
" stemmend_words = [stemmer.stem(word) for word in word_list]\n",
" \n",
" return stemmend_words\n",
" \n",
" @staticmethod\n",
" def pos_tagger(nltk_tag):\n",
" if nltk_tag.startswith('J'):\n",
" return wordnet.ADJ\n",
" elif nltk_tag.startswith('V'):\n",
" return wordnet.VERB\n",
" elif nltk_tag.startswith('N'):\n",
" return wordnet.NOUN\n",
" elif nltk_tag.startswith('R'):\n",
" return wordnet.ADV\n",
" else: \n",
" return None\n",
" \n",
" @staticmethod \n",
" def lemming_words(word_list): \n",
" \n",
" lemmatizer = WordNetLemmatizer()\n",
"\n",
" pos_tagged = nltk.pos_tag(word_list)\n",
"\n",
"\n",
" wordnet_tagged = list(map(lambda x: (x[0], StringSimilarity.pos_tagger(x[1])), pos_tagged))\n",
"\n",
" lemmatized_sentence = []\n",
" for word, tag in wordnet_tagged: \n",
" \n",
" if tag is None: \n",
" lemmatized_sentence.append(word)\n",
" else: \n",
" lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))\n",
"\n",
" return lemmatized_sentence\n",
" \n",
" @staticmethod\n",
" def main_cleaning(text): \n",
" \n",
" text_list = StringSimilarity.string_to_list(text)\n",
" text_list = StringSimilarity.removing_stopwords(text_list)\n",
" text_list = StringSimilarity.lemming_words(text_list) # Use lemmatization\n",
" return text_list \n",
" \n",
" def update_vectors(self): \n",
" \n",
" pass \n",
" \n",
"\n",
" def create_vetor(self, word_list): \n",
" \n",
" vector = [0] * len(self.dictionary)\n",
" \n",
"\n",
" # maybe better performance if we delete word from dict temporally -> lenght of loop would be reducing by each run\n",
" for i, word in enumerate(self.dictionary): \n",
" \n",
" if word in word_list: \n",
" vector[i] = 1\n",
" else: \n",
" continue \n",
" \n",
" return vector \n",
"\n",
" @staticmethod\n",
" def rank_vectors(dict1): \n",
" \n",
" return dict(sorted(dict1.items(), key=lambda item: item[1], reverse=True))\n",
"\n",
"\n",
"\n",
" def create_matrix(self): \n",
" pass \n",
" \n",
" \n",
" \n",
" def dot_product_normal(self, new_doc): \n",
" \n",
" final_dict = {}\n",
" \n",
" clean_text = self.main_cleaning(new_doc)\n",
" \n",
" new_vector = self.create_vetor(clean_text)\n",
" \n",
" for text in self.document_pool.keys(): \n",
" \n",
" temp_vector = self.create_vetor(self.document_pool[text])\n",
" \n",
" final_dict[text] = np.dot(new_vector, temp_vector)\n",
" \n",
" return StringSimilarity.rank_vectors(final_dict)\n",
" \n",
" \n",
"\n",
" def cosine_Similarity(self, new_doc): \n",
" \n",
" cosine_values = {}\n",
" \n",
" clean_text = self.main_cleaning(new_doc)\n",
" \n",
" new_vector = self.create_vetor(clean_text)\n",
" \n",
" for i in self.document_pool.keys(): \n",
" \n",
" temp_vector = self.create_vetor(self.document_pool[i])\n",
" \n",
" if norm(new_vector)*norm(temp_vector) != 0: \n",
" \n",
" cosine = np.dot(new_vector,temp_vector)/(norm(new_vector)*norm(temp_vector))\n",
" \n",
" cosine_values[i] = cosine\n",
" \n",
" else: \n",
" cosine_values[i] = 'no matches'\n",
" \n",
" return StringSimilarity.rank_vectors(cosine_values)\n",
" \n",
" \n",
" def Euclidean_distance(self, new_doc): \n",
" \n",
" euclidean_values = {}\n",
" clean_text = self.main_cleaning(new_doc)\n",
" \n",
" new_vector = self.create_vetor(clean_text)\n",
" \n",
" for i in self.document_pool.keys(): \n",
" \n",
" temp_vector = self.create_vetor(self.document_pool[i]) \n",
" \n",
" dist = np.linalg.norm(np.array(temp_vector) - np.array(new_vector))\n",
" euclidean_values[i] = dist \n",
" \n",
" return StringSimilarity.rank_vectors(euclidean_values)\n",
" "
]
},
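{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of what the cleaning pipeline produces; the sentence\n",
"# below is an assumed example input, not part of the original submission.\n",
"print(StringSimilarity.main_cleaning('The cats are running quickly!'))\n",
"# expected: a lowercased, stopword-free, lemmatized token list,\n",
"# e.g. something like ['cat', 'run', 'quickly']"
]
},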
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"document_pool3 = StringSimilarity()\n",
"\n",
"text1 = 'Create a python program that will, compute the text document similarity between different documents one two three!!'\n",
"text2 = 'Your implementation will take a list of documents as an input text corpus, and it will compute a dictionary of words for the given corpus.'\n",
"text3 = 'Later, when a new document (i.e, search document) is provided, your implementation should provide a list of documents that are similar to the given search document, in descending order of their similarity with the search document.'\n",
"\n",
"\n",
"\n",
"document_pool3.add_documents('doc1', text1)\n",
"document_pool3.add_documents('doc2', text2)\n",
"document_pool3.add_documents('doc3', text3)"
]
},
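{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check (not in the original submission): the shared dictionary\n",
"# now holds every cleaned word across doc1-doc3.\n",
"print(len(document_pool3.dictionary))\n",
"print(sorted(document_pool3.dictionary))"
]
},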
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_string = 'A text document can be represented as a word vector against a given dictionary of words.'\n",
"\n",
"\n",
"print(document_pool3.dot_product_normal(test_string))\n",
"\n",
"print(document_pool3.cosine_Similarity(test_string))\n",
"\n",
"print(document_pool3.Euclidean_distance(test_string))"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
