1 parent 1527317 · commit 10ce137
Showing 1 changed file with 285 additions and 0 deletions.
@@ -0,0 +1,285 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"\n", | ||
"import numpy as np \n", | ||
"import string \n", | ||
"import pandas as pd \n", | ||
"from numpy.linalg import norm\n", | ||
"import nltk\n", | ||
"import re \n", | ||
"\n", | ||
"# moduls for Stopwords \n", | ||
"from nltk.tokenize import word_tokenize\n", | ||
"from nltk.corpus import stopwords\n", | ||
"\n", | ||
"\n", | ||
"#moduls for stemming \n", | ||
"from nltk.stem.snowball import SnowballStemmer\n", | ||
"from nltk.stem.lancaster import LancasterStemmer\n", | ||
"from nltk.stem.porter import PorterStemmer\n", | ||
"\n", | ||
"#moduls for lemming \n", | ||
"from nltk.stem import WordNetLemmatizer \n", | ||
"nltk.download('averaged_perceptron_tagger')\n", | ||
"from nltk.corpus import wordnet\n", | ||
"\n", | ||
"nltk.download('stopwords')\n", | ||
"nltk.download('wordnet')\n", | ||
"\n", | ||
"class StringSimilarity: \n", | ||
" \n", | ||
" \n", | ||
" def __init__(self): \n", | ||
" \n", | ||
" #list with all documents \n", | ||
" self.document_pool = {}\n", | ||
" \n", | ||
" self.vector_pool = {}\n", | ||
" \n", | ||
" #dictionary with all words -> without punctation and special characters \n", | ||
" self.dictionary = set()\n", | ||
" \n", | ||
" \n", | ||
" def add_documents(self,name, document): \n", | ||
" \n", | ||
" processed_document = StringSimilarity.main_cleaning(document)\n", | ||
" \n", | ||
" if processed_document not in list(self.document_pool.keys()): \n", | ||
" \n", | ||
" self.document_pool[name] = processed_document\n", | ||
" \n", | ||
" self.dictionary.update(set(processed_document))\n", | ||
" \n", | ||
" else: \n", | ||
" raise ValueError(\"Text has already been added to pool\")\n", | ||
" \n", | ||
" \n", | ||
" # methods to clean and prepare the text documents\n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def cleaning_text(text): \n", | ||
" \n", | ||
" text = text.strip()\n", | ||
" text = re.sub(r'(?<=\\w)[_-]|[_-](?=\\w)', '', text)\n", | ||
" text = re.sub(r'\\b(?:[a-zA-Z]\\.)+[a-zA-Z]?[,]*\\b', ' ', text)\n", | ||
" text = re.sub(r\"\\W\", \" \", text) #remove non words char\n", | ||
" text = re.sub(r\"\\d\", \" \", text) #remove digits char\n", | ||
" text = re.sub(r\"[\\s]+\", \" \", text) # remove extra white space\n", | ||
" text = text.lower() #lower char for matching\n", | ||
" return text \n", | ||
" \n", | ||
" \n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def string_to_list(string1): \n", | ||
" \n", | ||
" clean_text = StringSimilarity.cleaning_text(string1)\n", | ||
" \n", | ||
" return clean_text.split()\n", | ||
"\n", | ||
" # removing stopwords \n", | ||
" @staticmethod\n", | ||
" def removing_stopwords(list_words): \n", | ||
" stop_words = set(stopwords.words('english'))\n", | ||
" text_without_stop = [word for word in list_words if word not in stop_words]\n", | ||
" \n", | ||
" return text_without_stop\n", | ||
" \n", | ||
" #stemming of words -> avoid dublicates \n", | ||
" @staticmethod\n", | ||
" def stemming_words(word_list): \n", | ||
" \n", | ||
" snowball = SnowballStemmer('english')\n", | ||
" lancaster = LancasterStemmer()\n", | ||
" porter = PorterStemmer()\n", | ||
" \n", | ||
" for stemmer in (snowball, lancaster, porter): \n", | ||
" \n", | ||
" stemmend_words = [stemmer.stem(word) for word in word_list]\n", | ||
" \n", | ||
" return stemmend_words\n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def pos_tagger(nltk_tag):\n", | ||
" if nltk_tag.startswith('J'):\n", | ||
" return wordnet.ADJ\n", | ||
" elif nltk_tag.startswith('V'):\n", | ||
" return wordnet.VERB\n", | ||
" elif nltk_tag.startswith('N'):\n", | ||
" return wordnet.NOUN\n", | ||
" elif nltk_tag.startswith('R'):\n", | ||
" return wordnet.ADV\n", | ||
" else: \n", | ||
" return None\n", | ||
" \n", | ||
" @staticmethod \n", | ||
" def lemming_words(word_list): \n", | ||
" \n", | ||
" lemmatizer = WordNetLemmatizer()\n", | ||
"\n", | ||
" pos_tagged = nltk.pos_tag(word_list)\n", | ||
"\n", | ||
"\n", | ||
" wordnet_tagged = list(map(lambda x: (x[0], StringSimilarity.pos_tagger(x[1])), pos_tagged))\n", | ||
"\n", | ||
" lemmatized_sentence = []\n", | ||
" for word, tag in wordnet_tagged: \n", | ||
" \n", | ||
" if tag is None: \n", | ||
" lemmatized_sentence.append(word)\n", | ||
" else: \n", | ||
" lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))\n", | ||
"\n", | ||
" return lemmatized_sentence\n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def main_cleaning(text): \n", | ||
" \n", | ||
" text_list = StringSimilarity.string_to_list(text)\n", | ||
" text_list = StringSimilarity.removing_stopwords(text_list)\n", | ||
" text_list = StringSimilarity.lemming_words(text_list) # Use lemmatization\n", | ||
" return text_list \n", | ||
" \n", | ||
" def update_vectors(self): \n", | ||
" \n", | ||
" pass \n", | ||
" \n", | ||
"\n", | ||
" def create_vetor(self, word_list): \n", | ||
" \n", | ||
" vector = [0] * len(self.dictionary)\n", | ||
" \n", | ||
"\n", | ||
" # maybe better performance if we delete word from dict temporally -> lenght of loop would be reducing by each run\n", | ||
" for i, word in enumerate(self.dictionary): \n", | ||
" \n", | ||
" if word in word_list: \n", | ||
" vector[i] = 1\n", | ||
" else: \n", | ||
" continue \n", | ||
" \n", | ||
" return vector \n", | ||
"\n", | ||
" @staticmethod\n", | ||
" def rank_vectors(dict1): \n", | ||
" \n", | ||
" return dict(sorted(dict1.items(), key=lambda item: item[1], reverse=True))\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
" def create_matrix(self): \n", | ||
" pass \n", | ||
" \n", | ||
" \n", | ||
" \n", | ||
" def dot_product_normal(self, new_doc): \n", | ||
" \n", | ||
" final_dict = {}\n", | ||
" \n", | ||
" clean_text = self.main_cleaning(new_doc)\n", | ||
" \n", | ||
" new_vector = self.create_vetor(clean_text)\n", | ||
" \n", | ||
" for text in self.document_pool.keys(): \n", | ||
" \n", | ||
" temp_vector = self.create_vetor(self.document_pool[text])\n", | ||
" \n", | ||
" final_dict[text] = np.dot(new_vector, temp_vector)\n", | ||
" \n", | ||
" return StringSimilarity.rank_vectors(final_dict)\n", | ||
" \n", | ||
" \n", | ||
"\n", | ||
" def cosine_Similarity(self, new_doc): \n", | ||
" \n", | ||
" cosine_values = {}\n", | ||
" \n", | ||
" clean_text = self.main_cleaning(new_doc)\n", | ||
" \n", | ||
" new_vector = self.create_vetor(clean_text)\n", | ||
" \n", | ||
" for i in self.document_pool.keys(): \n", | ||
" \n", | ||
" temp_vector = self.create_vetor(self.document_pool[i])\n", | ||
" \n", | ||
" if norm(new_vector)*norm(temp_vector) != 0: \n", | ||
" \n", | ||
" cosine = np.dot(new_vector,temp_vector)/(norm(new_vector)*norm(temp_vector))\n", | ||
" \n", | ||
" cosine_values[i] = cosine\n", | ||
" \n", | ||
" else: \n", | ||
" cosine_values[i] = 'no matches'\n", | ||
" \n", | ||
" return StringSimilarity.rank_vectors(cosine_values)\n", | ||
" \n", | ||
" \n", | ||
" def Euclidean_distance(self, new_doc): \n", | ||
" \n", | ||
" euclidean_values = {}\n", | ||
" clean_text = self.main_cleaning(new_doc)\n", | ||
" \n", | ||
" new_vector = self.create_vetor(clean_text)\n", | ||
" \n", | ||
" for i in self.document_pool.keys(): \n", | ||
" \n", | ||
" temp_vector = self.create_vetor(self.document_pool[i]) \n", | ||
" \n", | ||
" dist = np.linalg.norm(np.array(temp_vector) - np.array(new_vector))\n", | ||
" euclidean_values[i] = dist \n", | ||
" \n", | ||
" return StringSimilarity.rank_vectors(euclidean_values)\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"document_pool3 = StringSimilarity()\n", | ||
"\n", | ||
"text1 = 'Create a python program that will, compute the text document similarity between different documents one two three!!'\n", | ||
"text2 = 'Your implementation will take a list of documents as an input text corpus, and it will compute a dictionary of words for the given corpus.'\n", | ||
"text3 = 'Later, when a new document (i.e, search document) is provided, your implementation should provide a list of documents that are similar to the given search document, in descending order of their similarity with the search document.'\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"document_pool3.add_documents('doc1', text1)\n", | ||
"document_pool3.add_documents('doc2', text2)\n", | ||
"document_pool3.add_documents('doc3', text3)" | ||
] | ||
}, | ||
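{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# A quick sketch of the preprocessing step, assuming the cells above have been run:\n", | ||
"# main_cleaning lowercases, strips punctuation and digits, removes stopwords and lemmatizes,\n", | ||
"# and document_pool3.dictionary holds the vocabulary collected from the added documents.\n", | ||
"print(StringSimilarity.main_cleaning(text1))\n", | ||
"print(sorted(document_pool3.dictionary))" | ||
] | ||
}, | ||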
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"test_string = 'A text document can be represented as a word vector against a given dictionary of words.'\n", | ||
"\n", | ||
"\n", | ||
"print(document_pool3.dot_product_normal(test_string))\n", | ||
"\n", | ||
"print(document_pool3.cosine_Similarity(test_string))\n", | ||
"\n", | ||
"print(document_pool3.Euclidean_distance(test_string))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |