-
Notifications
You must be signed in to change notification settings - Fork 0
/
address.py
115 lines (102 loc) · 4.26 KB
/
address.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from __future__ import annotations
import typing
import re
from dataclasses import dataclass
from helpers import logger
from thefuzz import fuzz
# House-number pattern (the ``be_`` prefix presumably refers to Belgian
# addresses -- confirm). Used with ``.match``, so it is anchored at the start
# of the string only; trailing text is ignored.
be_housenum_re = re.compile(
    r"(?P<num>(\d+))"  # "num": the leading run of digits
    r" *[-\/]?"  # optional spaces, then at most one "-" or "/" separator
    r"(?P<let>([a-zA-Z]{1})?)"  # "let": one optional ASCII letter ("" when absent)
)
@dataclass
class Address:
    """An address record that can score how closely it resembles another one.

    Each ``score_*`` method compares one aspect of ``self`` with the same
    aspect of another ``Address``; ``score`` folds them into a single
    weighted figure.
    """

    uri: str
    full_address: str
    adminunitl1: str
    postcode: typing.Optional[str] = None
    postname: typing.Optional[str] = None
    thoroughfare: typing.Optional[str] = None
    locator_designator: typing.Optional[str] = None

    @property
    def specificity(self) -> float:
        """Return the percentage (0-100) of the five address parts that are set.

        A part counts when it is truthy, so ``None`` and ``""`` both count as
        missing.
        """
        parts = (
            self.adminunitl1,
            self.postcode,
            self.postname,
            self.thoroughfare,
            self.locator_designator,
        )
        filled = sum(1 for part in parts if part)
        return filled / len(parts) * 100

    def score_specificity(self, b: typing.Self) -> float:
        """Return 100 minus the absolute gap between both specificities."""
        return 100 - abs(self.specificity - b.specificity)

    def score_adminunitl1(self, b: typing.Self):
        """Return thefuzz's partial token-set ratio of both ``adminunitl1`` values."""
        return fuzz.partial_token_set_ratio(self.adminunitl1, b.adminunitl1)

    def score_postcode(self, b: typing.Self) -> int:
        """Score the numeric distance between both postcodes.

        Returns 100 for identical postcodes, one point less per unit of
        difference, and never less than 0. Returns 0 when either postcode is
        missing or is not an integer literal.
        """
        try:
            distance = abs(int(self.postcode) - int(b.postcode))
        except (TypeError, ValueError):
            # TypeError: a postcode is None; ValueError: it is not numeric.
            return 0
        # Clamp at 0 so that far-apart postcodes cannot produce a large
        # negative value that would swamp every other component in ``score``.
        return max(100 - distance, 0)

    def score_postname(self, b: typing.Self):
        """Return thefuzz's partial token-set ratio of both ``postname`` values."""
        return fuzz.partial_token_set_ratio(self.postname, b.postname)

    def score_thoroughfare(self, b: typing.Self):
        """Return thefuzz's partial token-set ratio of both ``thoroughfare`` values."""
        return fuzz.partial_token_set_ratio(self.thoroughfare, b.thoroughfare)

    def score_locator_designator(self, b: typing.Self) -> float:
        """Score how close both locator designators (house numbers) are.

        Returns:
            * 100 when neither address has a designator;
            * 0 when only one has a designator, or when either one does not
              start with digits (``be_housenum_re`` fails to match);
            * otherwise a number-distance score, refined by the letter suffix
              when the numbers are equal.
        """
        if not self.locator_designator and not b.locator_designator:
            return 100
        if not (self.locator_designator and b.locator_designator):
            # Exactly one side has a designator.
            return 0

        match = be_housenum_re.match(self.locator_designator)
        match_b = be_housenum_re.match(b.locator_designator)
        if not (match and match_b):
            return 0

        # Both named groups always exist on a successful match: "num" holds
        # at least one digit and "let" is "" when there is no letter suffix.
        num = int(match["num"])
        num_b = int(match_b["num"])

        # Each unit of difference costs 10 / 2 = 5 points, so the score hits
        # 0 at a difference of 20. The author's note ties this to alternating
        # numbering: 20 numbers apart is 10 houses on one side of the street.
        score_num = max(100 - abs(num - num_b) * (10 / 2), 0)
        if score_num != 100:
            return score_num

        # Same number: let the letter suffix break the tie.
        let = match["let"]
        let_b = match_b["let"]
        if not let and not let_b:
            return score_num
        if not let or not let_b:
            logger.debug(f"Only one of both locator designators has a letter {match} {match_b}")
            return 95

        # One point per alphabet position between both letters, case-insensitive.
        letter_gap = abs(ord(let.lower()) - ord(let_b.lower()))
        score_let = max(100 - letter_gap, 0)
        return (score_let + score_num) / 2

    def score(self, b: typing.Self) -> float:
        """Return the weighted average of all component scores against ``b``.

        A component that returns ``None`` is left out of the average.
        Returns 0.0 when no component produced a score.
        """
        scoring_funs = [
            (self.score_specificity, 1),
            (self.score_adminunitl1, 0.6),
            (self.score_postcode, 0.5),
            (self.score_postname, 0.5),
            (self.score_thoroughfare, 2.4),
            (self.score_locator_designator, 1.0),
        ]
        weighted_sum = 0.0
        used_weight = 0.0
        logger.debug(f"Scoring '{self.full_address}' against '{b.full_address}'")
        for scoring_fun, weight in scoring_funs:
            res = scoring_fun(b)
            logger.debug(f"- {scoring_fun.__name__} {res}")
            if res is not None:
                weighted_sum += res * weight
                used_weight += weight
        if not used_weight:
            return 0.0
        # Normalise by the weights actually used, not by the number of
        # scorers: the two only coincide while every scorer contributes
        # (the six weights add up to 6).
        return weighted_sum / used_weight

    def __str__(self):
        """Return the full address text."""
        return self.full_address