-
Notifications
You must be signed in to change notification settings - Fork 0
/
featuresExtractor.py
81 lines (70 loc) · 2.82 KB
/
featuresExtractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import jellyfish
## MINI-TUTORIAL: HOW TO CREATE A FEATURE IN 2 STEPS
# step 1: create a function for your feature
### the function must receive the two values used in the calculation = (self, valueA, valueB)
# step 2: give your function a name
### add the entry name : self.function to the featureFunctions variable inside the extractFeatures function
class FeaturesExtractor:
    """Compute pairwise comparison features for the columns listed in a config.

    Every feature method takes the two values being compared and returns a
    numeric score, or None when either value is missing (falsy).
    """

    def __init__(self, config):
        """Keep the configuration object that drives extractFeatures.

        Args:
            config: object queried by extractFeatures through
                getColumnList() and getColumnByNumber().
        """
        self.config = config

    def getJaroDistance(self, valueA, valueB):
        """Return the Jaro score computed by jellyfish, or None if a value is missing."""
        if not (valueA and valueB):
            return None
        # jellyfish renamed jaro_distance to jaro_similarity and later removed
        # the old name; use whichever one the installed release provides.
        jaro = getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance
        return jaro(valueA, valueB)

    def isCategoricalEqual(self, valueA, valueB):
        """Return 1.0 when both values are equal, 0.0 when they differ.

        Returns None when either value is missing (falsy).
        """
        if not (valueA and valueB):
            return None
        return 1.0 if valueA == valueB else 0.0

    def getHammingDistance(self, valueA, valueB):
        """Return a Hamming-based similarity normalized by the longer length.

        Returns None when either value is missing (falsy).
        """
        if not (valueA and valueB):
            return None
        hamming = jellyfish.hamming_distance(valueA, valueB)
        stringLength = max(len(valueA), len(valueB))
        # normalize the hamming distance to fit between 0 and 1
        return (stringLength - hamming) / stringLength

    def getIbgeSimilarity(self, valueA, valueB):
        """Return a weighted similarity between two IBGE codes.

        The first two characters are compared as the state (weight 0.4) and
        the remaining characters as the city (weight 0.6). Returns None when
        either value is missing (falsy).
        """
        if not (valueA and valueB):
            return None
        similarity = 0.0
        if valueA[:2] == valueB[:2]:
            similarity = similarity + 0.4  # weight of the state
        if valueA[2:] == valueB[2:]:
            similarity = similarity + 0.6  # weight of the city
        return similarity

    def extractFeatures(self, zipAttributes):
        """Compute every configured feature for every configured column.

        Args:
            zipAttributes: indexable by column number; each entry holds the
                pair of attributes to compare, each exposing getValue().

        Returns:
            A flat list of feature values, ordered by column and then by the
            order of that column's feature list.

        Raises:
            ValueError: if the config names a feature that has no function
                registered in featureFunctions.
        """
        # feature name (as used in the config) -> function computing it
        featureFunctions = {
            'hamming': self.getHammingDistance,
            'jaro': self.getJaroDistance,
            'categorical': self.isCategoricalEqual,
            'ibge': self.getIbgeSimilarity
        }
        config = self.config
        featureList = []
        for column in config.getColumnList():
            column = int(column)
            # the two values being compared for this column
            valueA = zipAttributes[column][0].getValue()
            valueB = zipAttributes[column][1].getValue()
            # features requested for this column
            columnConfig = config.getColumnByNumber(column)
            for feature in columnConfig.getFeaturesList():
                featureFunction = featureFunctions.get(feature)
                if featureFunction is None:
                    # fail with the offending name instead of the opaque
                    # "'NoneType' object is not callable"
                    raise ValueError(
                        "unknown feature {!r} for column {}; expected one of {}".format(
                            feature, column, sorted(featureFunctions)))
                featureList.append(featureFunction(valueA, valueB))
        return featureList