From 10ce1376bcac519d4ea66c748533baa0364bc2e6 Mon Sep 17 00:00:00 2001
From: Python-glitch <62609296+Python-glitch@users.noreply.github.com>
Date: Tue, 5 Dec 2023 14:09:36 +0100
Subject: [PATCH] Add files via upload

---
 submission/Number3_jakob.ipynb | 285 +++++++++++++++++++++++++++++++++
 1 file changed, 285 insertions(+)
 create mode 100644 submission/Number3_jakob.ipynb

diff --git a/submission/Number3_jakob.ipynb b/submission/Number3_jakob.ipynb
new file mode 100644
index 0000000..3bdbd44
--- /dev/null
+++ b/submission/Number3_jakob.ipynb
@@ -0,0 +1,285 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "import numpy as np\n",
+    "from numpy.linalg import norm\n",
+    "\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords, wordnet\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "from nltk.stem.snowball import SnowballStemmer\n",
+    "\n",
+    "# NLTK resources for stop-word removal, POS tagging and lemmatization\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('averaged_perceptron_tagger')\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('omw-1.4')\n",
+    "\n",
+    "\n",
+    "class StringSimilarity:\n",
+    "\n",
+    "    def __init__(self):\n",
+    "\n",
+    "        # dictionary mapping document name -> cleaned list of words\n",
+    "        self.document_pool = {}\n",
+    "\n",
+    "        # placeholder for cached vectors (not used yet)\n",
+    "        self.vector_pool = {}\n",
+    "\n",
+    "        # vocabulary of all words, without punctuation and special characters\n",
+    "        self.dictionary = set()\n",
+    "\n",
+    "    def add_documents(self, name, document):\n",
+    "\n",
+    "        processed_document = StringSimilarity.main_cleaning(document)\n",
+    "\n",
+    "        # compare against the stored documents, not the document names\n",
+    "        if processed_document in self.document_pool.values():\n",
+    "            raise ValueError(\"Text has already been added to pool\")\n",
+    "\n",
+    "        self.document_pool[name] = processed_document\n",
+    "        self.dictionary.update(processed_document)\n",
+    "\n",
+    "    # methods to clean and prepare the text documents\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def cleaning_text(text):\n",
+    "\n",
+    "        text = text.strip()\n",
+    "        text = re.sub(r'(?<=\\\\w)[_-]|[_-](?=\\\\w)', '', text)            # join hyphenated/underscored words\n",
+    "        text = re.sub(r'\\\\b(?:[a-zA-Z]\\\\.)+[a-zA-Z]?[,]*\\\\b', ' ', text)  # drop abbreviations such as 'i.e.'\n",
+    "        text = re.sub(r\"\\\\W\", \" \", text)   # remove non-word characters\n",
+    "        text = re.sub(r\"\\\\d\", \" \", text)   # remove digits\n",
+    "        text = re.sub(r\"\\\\s+\", \" \", text)  # collapse extra whitespace\n",
+    "        text = text.lower()                # lower-case for matching\n",
+    "        return text\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def string_to_list(text):\n",
+    "\n",
+    "        clean_text = StringSimilarity.cleaning_text(text)\n",
+    "        return clean_text.split()\n",
+    "\n",
+    "    # remove stopwords\n",
+    "    @staticmethod\n",
+    "    def removing_stopwords(list_words):\n",
+    "\n",
+    "        stop_words = set(stopwords.words('english'))\n",
+    "        return [word for word in list_words if word not in stop_words]\n",
+    "\n",
+    "    # stemming of words -> avoids duplicates; kept as an alternative to lemmatization\n",
+    "    @staticmethod\n",
+    "    def stemming_words(word_list):\n",
+    "\n",
+    "        stemmer = SnowballStemmer('english')\n",
+    "        return [stemmer.stem(word) for word in word_list]\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def pos_tagger(nltk_tag):\n",
+    "\n",
+    "        # map Penn Treebank tags to WordNet POS tags\n",
+    "        if nltk_tag.startswith('J'):\n",
+    "            return wordnet.ADJ\n",
+    "        elif nltk_tag.startswith('V'):\n",
+    "            return wordnet.VERB\n",
+    "        elif nltk_tag.startswith('N'):\n",
+    "            return wordnet.NOUN\n",
+    "        elif nltk_tag.startswith('R'):\n",
+    "            return wordnet.ADV\n",
+    "        else:\n",
+    "            return None\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def lemmatize_words(word_list):\n",
+    "\n",
+    "        lemmatizer = WordNetLemmatizer()\n",
+    "\n",
+    "        pos_tagged = nltk.pos_tag(word_list)\n",
+    "        wordnet_tagged = [(word, StringSimilarity.pos_tagger(tag)) for word, tag in pos_tagged]\n",
+    "\n",
+    "        lemmatized_words = []\n",
+    "        for word, tag in wordnet_tagged:\n",
+    "            if tag is None:\n",
+    "                lemmatized_words.append(word)\n",
+    "            else:\n",
+    "                lemmatized_words.append(lemmatizer.lemmatize(word, tag))\n",
+    "\n",
+    "        return lemmatized_words\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def main_cleaning(text):\n",
+    "\n",
+    "        text_list = StringSimilarity.string_to_list(text)\n",
+    "        text_list = StringSimilarity.removing_stopwords(text_list)\n",
+    "        text_list = StringSimilarity.lemmatize_words(text_list)  # use lemmatization\n",
+    "        return text_list\n",
+    "\n",
+    "    def update_vectors(self):\n",
+    "        # placeholder: vectors are rebuilt on demand in create_vector\n",
+    "        pass\n",
+    "\n",
+    "    def create_vector(self, word_list):\n",
+    "\n",
+    "        # binary bag-of-words vector over the pooled dictionary\n",
+    "        vector = [0] * len(self.dictionary)\n",
+    "        words = set(word_list)  # set lookup keeps the scan over the dictionary fast\n",
+    "\n",
+    "        for i, word in enumerate(self.dictionary):\n",
+    "            if word in words:\n",
+    "                vector[i] = 1\n",
+    "\n",
+    "        return vector\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def rank_vectors(scores, descending=True):\n",
+    "\n",
+    "        # descending for similarities, ascending for distances\n",
+    "        return dict(sorted(scores.items(), key=lambda item: item[1], reverse=descending))\n",
+    "\n",
+    "    def create_matrix(self):\n",
+    "        # placeholder for a full document-term matrix\n",
+    "        pass\n",
+    "\n",
+    "    def dot_product_normal(self, new_doc):\n",
+    "\n",
+    "        final_dict = {}\n",
+    "        clean_text = self.main_cleaning(new_doc)\n",
+    "        new_vector = self.create_vector(clean_text)\n",
+    "\n",
+    "        for name in self.document_pool:\n",
+    "            temp_vector = self.create_vector(self.document_pool[name])\n",
+    "            final_dict[name] = np.dot(new_vector, temp_vector)\n",
+    "\n",
+    "        return StringSimilarity.rank_vectors(final_dict)\n",
+    "\n",
+    "    def cosine_similarity(self, new_doc):\n",
+    "\n",
+    "        cosine_values = {}\n",
+    "        clean_text = self.main_cleaning(new_doc)\n",
+    "        new_vector = self.create_vector(clean_text)\n",
+    "\n",
+    "        for name in self.document_pool:\n",
+    "            temp_vector = self.create_vector(self.document_pool[name])\n",
+    "            denominator = norm(new_vector) * norm(temp_vector)\n",
+    "\n",
+    "            if denominator != 0:\n",
+    "                cosine_values[name] = np.dot(new_vector, temp_vector) / denominator\n",
+    "            else:\n",
+    "                cosine_values[name] = 0.0  # no shared vocabulary\n",
+    "\n",
+    "        return StringSimilarity.rank_vectors(cosine_values)\n",
+    "\n",
+    "    def euclidean_distance(self, new_doc):\n",
+    "\n",
+    "        euclidean_values = {}\n",
+    "        clean_text = self.main_cleaning(new_doc)\n",
+    "        new_vector = self.create_vector(clean_text)\n",
+    "\n",
+    "        for name in self.document_pool:\n",
+    "            temp_vector = self.create_vector(self.document_pool[name])\n",
+    "            euclidean_values[name] = np.linalg.norm(np.array(temp_vector) - np.array(new_vector))\n",
+    "\n",
+    "        # smaller distance means more similar, so rank in ascending order\n",
+    "        return StringSimilarity.rank_vectors(euclidean_values, descending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "document_pool3 = StringSimilarity()\n",
+    "\n",
+    "text1 = 'Create a python program that will, compute the text document similarity between different documents one two three!!'\n",
+    "text2 = 'Your implementation will take a list of documents as an input text corpus, and it will compute a dictionary of words for the given corpus.'\n",
+    "text3 = 'Later, when a new document (i.e, search document) is provided, your implementation should provide a list of documents that are similar to the given search document, in descending order of their similarity with the search document.'\n",
+    "\n",
+    "document_pool3.add_documents('doc1', text1)\n",
+    "document_pool3.add_documents('doc2', text2)\n",
+    "document_pool3.add_documents('doc3', text3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_string = 'A text document can be represented as a word vector against a given dictionary of words.'\n",
+    "\n",
+    "print(document_pool3.dot_product_normal(test_string))\n",
+    "print(document_pool3.cosine_similarity(test_string))\n",
+    "print(document_pool3.euclidean_distance(test_string))"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}