-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
41 lines (33 loc) · 2.4 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import gensim
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import ssl
from nltk.tokenize import word_tokenize
# Preprocess the paragraphs and tokenize the words
def preprocess_paragraph(paragraph):
    """Lowercase *paragraph* and split it into word tokens.

    The token list is printed to stdout before being returned.

    Args:
        paragraph: The text to tokenize.

    Returns:
        The tokens produced by ``word_tokenize`` for the lowercased text.
    """
    word_tokens = word_tokenize(paragraph.lower())
    print(word_tokens)
    return word_tokens
def average_vector(model, tokens):
    """Return the mean embedding of the tokens that *model* contains.

    Args:
        model: Embedding lookup supporting ``token in model`` and
            ``model[token]`` (in this script, a gensim ``KeyedVectors``).
        tokens: Iterable of token strings.

    Returns:
        The element-wise mean of the vectors of the in-vocabulary tokens,
        or ``None`` when none of the tokens is in the vocabulary (including
        the case of an empty token list).
    """
    # Indexing the model with an unknown token raises KeyError, so
    # out-of-vocabulary tokens (punctuation, contractions, some stopwords)
    # are skipped instead of being looked up.
    vectors = [model[token] for token in tokens if token in model]
    if not vectors:
        return None
    return sum(vectors) / len(vectors)


if __name__ == '__main__':
    # NOTE(security): this swaps the default HTTPS context factory for the
    # unverified one, which turns off certificate verification for HTTPS
    # requests made through the default context in this process.
    # Presumably a workaround so that nltk.download() succeeds on machines
    # without a configured certificate store -- confirm it is still needed.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # This Python build has no unverified-context factory; keep defaults.
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context
    nltk.download('punkt')

    model_path = '/Users/wenchaoguo/Desktop/Google/09_GoogleNews/GoogleNews-vectors-negative300.bin'
    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

    # Input paragraphs
    # Alternative, longer inputs (they contain punctuation and contractions):
    # paragraph1 = "The computer wouldn't start. She banged on the side and tried again. Nothing. She lifted it up and dropped it to the table. Still nothing. She banged her closed fist against the top. It was at this moment she saw the irony of trying to fix the machine with violence.h"
    # paragraph2 = "Generating random paragraphs can be an excellent way for writers to get their creative flow going at the beginning of the day. The writer has no idea what topic the random paragraph will be about when it appears. This forces the writer to use creativity to complete one of three common writing challenges. The writer can use the paragraph as the first one of a short story and build upon it. A second option is to use the random paragraph somewhere in a short story they create. The third option is to have the random paragraph be the ending paragraph in a short story. No matter which of these challenges is undertaken, the writer is forced to use creativity to incorporate the paragraph into their writing."
    paragraph1 = "The weather today is beautiful with clear blue skies"
    token1 = preprocess_paragraph(paragraph1)
    paragraph2 = "Today weather is lovely featuring bright blue skies"
    token2 = preprocess_paragraph(paragraph2)

    # Convert each paragraph into a single vector: the mean of the Word2Vec
    # embeddings of its in-vocabulary tokens.
    vector1 = average_vector(word2vec_model, token1)
    vector2 = average_vector(word2vec_model, token2)
    if vector1 is None or vector2 is None:
        raise SystemExit(
            "Cannot compute similarity: a paragraph has no tokens in the "
            "Word2Vec vocabulary."
        )

    # Calculate the cosine similarity between the two vectors.
    # cosine_similarity expects 2-D inputs, hence the reshape to one row each.
    similarity = cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))[0][0]
    print("Cosine Similarity:", similarity)