1 parent 1527317 · commit 10ce137
Showing 1 changed file with 285 additions and 0 deletions.
@@ -0,0 +1,285 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"\n", | ||
"import numpy as np \n", | ||
"import string \n", | ||
"import pandas as pd \n", | ||
"from numpy.linalg import norm\n", | ||
"import nltk\n", | ||
"import re \n", | ||
"\n", | ||
"# moduls for Stopwords \n", | ||
"from nltk.tokenize import word_tokenize\n", | ||
"from nltk.corpus import stopwords\n", | ||
"\n", | ||
"\n", | ||
"#moduls for stemming \n", | ||
"from nltk.stem.snowball import SnowballStemmer\n", | ||
"from nltk.stem.lancaster import LancasterStemmer\n", | ||
"from nltk.stem.porter import PorterStemmer\n", | ||
"\n", | ||
"#moduls for lemming \n", | ||
"from nltk.stem import WordNetLemmatizer \n", | ||
"nltk.download('averaged_perceptron_tagger')\n", | ||
"from nltk.corpus import wordnet\n", | ||
"\n", | ||
"nltk.download('stopwords')\n", | ||
"nltk.download('wordnet')\n", | ||
"\n", | ||
"class StringSimilarity: \n", | ||
" \n", | ||
" \n", | ||
" def __init__(self): \n", | ||
" \n", | ||
" #list with all documents \n", | ||
" self.document_pool = {}\n", | ||
" \n", | ||
" self.vector_pool = {}\n", | ||
" \n", | ||
" #dictionary with all words -> without punctation and special characters \n", | ||
" self.dictionary = set()\n", | ||
" \n", | ||
" \n", | ||
" def add_documents(self,name, document): \n", | ||
" \n", | ||
" processed_document = StringSimilarity.main_cleaning(document)\n", | ||
" \n", | ||
" if processed_document not in list(self.document_pool.keys()): \n", | ||
" \n", | ||
" self.document_pool[name] = processed_document\n", | ||
" \n", | ||
" self.dictionary.update(set(processed_document))\n", | ||
" \n", | ||
" else: \n", | ||
" raise ValueError(\"Text has already been added to pool\")\n", | ||
" \n", | ||
" \n", | ||
" # methods to clean and prepare the text documents\n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def cleaning_text(text): \n", | ||
" \n", | ||
" text = text.strip()\n", | ||
" text = re.sub(r'(?<=\\w)[_-]|[_-](?=\\w)', '', text)\n", | ||
" text = re.sub(r'\\b(?:[a-zA-Z]\\.)+[a-zA-Z]?[,]*\\b', ' ', text)\n", | ||
" text = re.sub(r\"\\W\", \" \", text) #remove non words char\n", | ||
" text = re.sub(r\"\\d\", \" \", text) #remove digits char\n", | ||
" text = re.sub(r\"[\\s]+\", \" \", text) # remove extra white space\n", | ||
" text = text.lower() #lower char for matching\n", | ||
" return text \n", | ||
" \n", | ||
" \n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def string_to_list(string1): \n", | ||
" \n", | ||
" clean_text = StringSimilarity.cleaning_text(string1)\n", | ||
" \n", | ||
" return clean_text.split()\n", | ||
"\n", | ||
" # removing stopwords \n", | ||
" @staticmethod\n", | ||
" def removing_stopwords(list_words): \n", | ||
" stop_words = set(stopwords.words('english'))\n", | ||
" text_without_stop = [word for word in list_words if word not in stop_words]\n", | ||
" \n", | ||
" return text_without_stop\n", | ||
" \n", | ||
" #stemming of words -> avoid dublicates \n", | ||
" @staticmethod\n", | ||
" def stemming_words(word_list): \n", | ||
" \n", | ||
" snowball = SnowballStemmer('english')\n", | ||
" lancaster = LancasterStemmer()\n", | ||
" porter = PorterStemmer()\n", | ||
" \n", | ||
" for stemmer in (snowball, lancaster, porter): \n", | ||
" \n", | ||
" stemmend_words = [stemmer.stem(word) for word in word_list]\n", | ||
" \n", | ||
" return stemmend_words\n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def pos_tagger(nltk_tag):\n", | ||
" if nltk_tag.startswith('J'):\n", | ||
" return wordnet.ADJ\n", | ||
" elif nltk_tag.startswith('V'):\n", | ||
" return wordnet.VERB\n", | ||
" elif nltk_tag.startswith('N'):\n", | ||
" return wordnet.NOUN\n", | ||
" elif nltk_tag.startswith('R'):\n", | ||
" return wordnet.ADV\n", | ||
" else: \n", | ||
" return None\n", | ||
" \n", | ||
" @staticmethod \n", | ||
" def lemming_words(word_list): \n", | ||
" \n", | ||
" lemmatizer = WordNetLemmatizer()\n", | ||
"\n", | ||
" pos_tagged = nltk.pos_tag(word_list)\n", | ||
"\n", | ||
"\n", | ||
" wordnet_tagged = list(map(lambda x: (x[0], StringSimilarity.pos_tagger(x[1])), pos_tagged))\n", | ||
"\n", | ||
" lemmatized_sentence = []\n", | ||
" for word, tag in wordnet_tagged: \n", | ||
" \n", | ||
" if tag is None: \n", | ||
" lemmatized_sentence.append(word)\n", | ||
" else: \n", | ||
" lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))\n", | ||
"\n", | ||
" return lemmatized_sentence\n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def main_cleaning(text): \n", | ||
" \n", | ||
" text_list = StringSimilarity.string_to_list(text)\n", | ||
" text_list = StringSimilarity.removing_stopwords(text_list)\n", | ||
" text_list = StringSimilarity.lemming_words(text_list) # Use lemmatization\n", | ||
" return text_list \n", | ||
" \n", | ||
" def update_vectors(self): \n", | ||
" \n", | ||
" pass \n", | ||
" \n", | ||
"\n", | ||
" def create_vetor(self, word_list): \n", | ||
" \n", | ||
" vector = [0] * len(self.dictionary)\n", | ||
" \n", | ||
"\n", | ||
" # maybe better performance if we delete word from dict temporally -> lenght of loop would be reducing by each run\n", | ||
" for i, word in enumerate(self.dictionary): \n", | ||
" \n", | ||
" if word in word_list: \n", | ||
" vector[i] = 1\n", | ||
" else: \n", | ||
" continue \n", | ||
" \n", | ||
" return vector \n", | ||
"\n", | ||
" @staticmethod\n", | ||
" def rank_vectors(dict1): \n", | ||
" \n", | ||
" return dict(sorted(dict1.items(), key=lambda item: item[1], reverse=True))\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
" def create_matrix(self): \n", | ||
" pass \n", | ||
" \n", | ||
" \n", | ||
" \n", | ||
" def dot_product_normal(self, new_doc): \n", | ||
" \n", | ||
" final_dict = {}\n", | ||
" \n", | ||
" clean_text = self.main_cleaning(new_doc)\n", | ||
" \n", | ||
" new_vector = self.create_vetor(clean_text)\n", | ||
" \n", | ||
" for text in self.document_pool.keys(): \n", | ||
" \n", | ||
" temp_vector = self.create_vetor(self.document_pool[text])\n", | ||
" \n", | ||
" final_dict[text] = np.dot(new_vector, temp_vector)\n", | ||
" \n", | ||
" return StringSimilarity.rank_vectors(final_dict)\n", | ||
" \n", | ||
" \n", | ||
"\n", | ||
" def cosine_Similarity(self, new_doc): \n", | ||
" \n", | ||
" cosine_values = {}\n", | ||
" \n", | ||
" clean_text = self.main_cleaning(new_doc)\n", | ||
" \n", | ||
" new_vector = self.create_vetor(clean_text)\n", | ||
" \n", | ||
" for i in self.document_pool.keys(): \n", | ||
" \n", | ||
" temp_vector = self.create_vetor(self.document_pool[i])\n", | ||
" \n", | ||
" if norm(new_vector)*norm(temp_vector) != 0: \n", | ||
" \n", | ||
" cosine = np.dot(new_vector,temp_vector)/(norm(new_vector)*norm(temp_vector))\n", | ||
" \n", | ||
" cosine_values[i] = cosine\n", | ||
" \n", | ||
" else: \n", | ||
" cosine_values[i] = 'no matches'\n", | ||
" \n", | ||
" return StringSimilarity.rank_vectors(cosine_values)\n", | ||
" \n", | ||
" \n", | ||
" def Euclidean_distance(self, new_doc): \n", | ||
" \n", | ||
" euclidean_values = {}\n", | ||
" clean_text = self.main_cleaning(new_doc)\n", | ||
" \n", | ||
" new_vector = self.create_vetor(clean_text)\n", | ||
" \n", | ||
" for i in self.document_pool.keys(): \n", | ||
" \n", | ||
" temp_vector = self.create_vetor(self.document_pool[i]) \n", | ||
" \n", | ||
" dist = np.linalg.norm(np.array(temp_vector) - np.array(new_vector))\n", | ||
" euclidean_values[i] = dist \n", | ||
" \n", | ||
" return StringSimilarity.rank_vectors(euclidean_values)\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"document_pool3 = StringSimilarity()\n", | ||
"\n", | ||
"text1 = 'Create a python program that will, compute the text document similarity between different documents one two three!!'\n", | ||
"text2 = 'Your implementation will take a list of documents as an input text corpus, and it will compute a dictionary of words for the given corpus.'\n", | ||
"text3 = 'Later, when a new document (i.e, search document) is provided, your implementation should provide a list of documents that are similar to the given search document, in descending order of their similarity with the search document.'\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"document_pool3.add_documents('doc1', text1)\n", | ||
"document_pool3.add_documents('doc2', text2)\n", | ||
"document_pool3.add_documents('doc3', text3)" | ||
] | ||
}, | ||
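{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# A quick sketch of the preprocessing step, assuming the cells above have been run:\n", | ||
"# main_cleaning lowercases, strips punctuation and digits, removes stopwords and lemmatizes,\n", | ||
"# and document_pool3.dictionary holds the vocabulary collected from the added documents.\n", | ||
"print(StringSimilarity.main_cleaning(text1))\n", | ||
"print(sorted(document_pool3.dictionary))" | ||
] | ||
}, | ||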
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"test_string = 'A text document can be represented as a word vector against a given dictionary of words.'\n", | ||
"\n", | ||
"\n", | ||
"print(document_pool3.dot_product_normal(test_string))\n", | ||
"\n", | ||
"print(document_pool3.cosine_Similarity(test_string))\n", | ||
"\n", | ||
"print(document_pool3.Euclidean_distance(test_string))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |