From 294e37250d5c155b1912a84f814107ef8eb7a7dc Mon Sep 17 00:00:00 2001 From: SubhasisChakraborty Date: Sat, 1 Apr 2017 14:55:01 +0530 Subject: [PATCH] Semantic Similarity Files --- BrownIC.py | 50 +++++++ BrownIC.txt | 350 +++++++++++++++++++++++++++++++++++++++++++++++ DavidModel.txt | 350 +++++++++++++++++++++++++++++++++++++++++++++++ DavidModelIC.py | 79 +++++++++++ GenesisIC.py | 50 +++++++ GenesisIC.txt | 350 +++++++++++++++++++++++++++++++++++++++++++++++ Hyponym.py | 18 +++ NunoModel.txt | 350 +++++++++++++++++++++++++++++++++++++++++++++++ NunoModelIC.py | 86 ++++++++++++ SemcorIC.py | 49 +++++++ SemcorIC.txt | 350 +++++++++++++++++++++++++++++++++++++++++++++++ combined.csv | 351 ++++++++++++++++++++++++++++++++++++++++++++++++ mc.csv | 30 +++++ rg.csv | 65 +++++++++ 14 files changed, 2528 insertions(+) create mode 100644 BrownIC.py create mode 100644 BrownIC.txt create mode 100644 DavidModel.txt create mode 100644 DavidModelIC.py create mode 100644 GenesisIC.py create mode 100644 GenesisIC.txt create mode 100644 Hyponym.py create mode 100644 NunoModel.txt create mode 100644 NunoModelIC.py create mode 100644 SemcorIC.py create mode 100644 SemcorIC.txt create mode 100644 combined.csv create mode 100644 mc.csv create mode 100644 rg.csv diff --git a/BrownIC.py b/BrownIC.py new file mode 100644 index 0000000..2c61fb4 --- /dev/null +++ b/BrownIC.py @@ -0,0 +1,50 @@ +import nltk +from nltk.corpus import wordnet as wn +from nltk.corpus import wordnet_ic +import math,csv +import scipy + +brown_ic = wordnet_ic.ic('ic-brown.dat') + +def sim_lin(syns1,syns2): + maxSim=None + for s1 in syns1: + for s2 in syns2: + sim=s1.lin_similarity(s2,brown_ic) + if maxSim==None or maxSimdpt: + dpt=len(hypernym_paths(i1)) + item=i1 + it1=i + it2=j + return item,a[it1],b[it2] + +def IC(a): + if len(hypernym_paths(a))==0: + return (-1)*math.log(0.9999866) + return (-1)*math.log((float(len(Hyponym.leaf_nodes(a))/len(hypernym_paths(a)))+1)/74898) + +def Prob(a): + if len(hypernym_paths(a))==0: + return math.exp(math.log(0.9999866)) + return math.exp(math.log((float(len(Hyponym.leaf_nodes(a))/len(hypernym_paths(a)))+1)/74898)) + +def Lin_Sim(item,it1,it2): + return 2*IC(item)/(IC(it1)+IC(it2)) + +def Res_Sim(item): + return IC(item) + +train = csv.reader(open("mc.csv",'rb'),delimiter=';') + +word1=[] +word2=[] +hr=[] +LinS=[] + +for row in train: + word1.append(row[0]) + word2.append(row[1]) + hr.append(row[2]) + +#f=open("DavidModel.txt","w") + +for i in range(1,len(hr)): + a=wn.synsets(word1[i]) + b=wn.synsets(word2[i]) + [item,it1,it2]=LCS(a,b) + LinS.append(Lin_Sim(item,it1,it2)*(1-Prob(item))) + #print Prob(item) + #f.write("%s\t%s\t%.5s\t%.5s\t%.5s\n"%(word1[i],word2[i],Lin_Sim(item,it1,it2),Res_Sim(item),hr[i])) + +#f.close() +hr.pop(0) +print scipy.stats.spearmanr(LinS,hr) diff --git a/GenesisIC.py b/GenesisIC.py new file mode 100644 index 0000000..fafc002 --- /dev/null +++ b/GenesisIC.py @@ -0,0 +1,50 @@ +import nltk +from nltk.corpus import wordnet as wn +from nltk.corpus import genesis +import math,csv +import scipy + +genesis_ic=wn.ic(genesis, False, 0.0) + +def sim_lin(syns1,syns2): + maxSim=None + for s1 in syns1: + for s2 in syns2: + sim=s1.lin_similarity(s2,genesis_ic) + if maxSim==None or maxSimdpt: + dpt=len(hypernym_paths(i1)) + item=i1 + it1=i + it2=j + return item,a[it1],b[it2] + +def IC(a): + if (Hyponym.hyponym_paths(a))==[]: + return 1-(math.log10(1)/math.log10(node_max)) + return 1-(math.log10(len(Hyponym.hyponym_paths(a)))/math.log10(node_max)) + +def Prob(a): + if (Hyponym.hyponym_paths(a))==[]: + return 1-(math.log10(1)/math.log10(node_max)) + return (math.log10(len(Hyponym.hyponym_paths(a))+1)/math.log10(node_max+1)) + """(1/(1-(math.log10(len(Hyponym.hyponym_paths(a))+2)/math.log10(node_max+1)))) + """ + +def Lin_Sim(item,it1,it2): + return 2*IC(item)/(IC(it1)+IC(it2)) + +def Res_Sim(item): + return IC(item) + +train = csv.reader(open("rg.csv",'rb'),delimiter=';') + +word1=[] +word2=[] +hr=[] +LinS=[] + +for row in train: + word1.append(row[0]) + word2.append(row[1]) + hr.append(row[2]) + +#f=open("NunoModel.txt","w") + +for i in range(1,len(hr)): + a=wn.synsets(word1[i]) + b=wn.synsets(word2[i]) + [item,it1,it2]=LCS(a,b) + LinS.append(Res_Sim(item)) + #print Prob(item),Lin_Sim(item,it1,it2) + #f.write("%s\t%s\t%.5s\t%.5s\t%.5s\n"%(word1[i],word2[i],Lin_Sim(item,it1,it2),Res_Sim(item),hr[i])) + +#f.close() +hr.pop(0) +#print len(LinS), len(hr1) +print scipy.stats.spearmanr(LinS,hr) diff --git a/SemcorIC.py b/SemcorIC.py new file mode 100644 index 0000000..729d79d --- /dev/null +++ b/SemcorIC.py @@ -0,0 +1,49 @@ +import nltk +from nltk.corpus import wordnet as wn +from nltk.corpus import wordnet_ic +import math,csv +import scipy + +semcor_ic = wordnet_ic.ic('ic-semcor.dat') + +def sim_lin(syns1,syns2): + maxSim=None + for s1 in syns1: + for s2 in syns2: + sim=s1.lin_similarity(s2,semcor_ic) + if maxSim==None or maxSim