Skip to content

Commit

Permalink
Fixed to include the completed prefix when counting the number of matched characters.
Browse files Browse the repository at this point in the history
  • Loading branch information
t-sagara committed Dec 9, 2021
1 parent d6c7318 commit 69a2a62
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 19 deletions.
3 changes: 2 additions & 1 deletion itaiji_dic/itaiji_src.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"煙": "",
"艶": "",
"塩": "",
"園": "",
"奥": "",
"往": "",
"応": "",
Expand Down Expand Up @@ -264,7 +265,7 @@
"棲": "",
"声": "",
"静": "",
"斉": "斎齊",
"斉": "斎齋齊齊済齋齊斎斎薺濟斎斉薺齎濟齊齏霽",
"跡": "迹蹟",
"摂": "",
"窃": "",
Expand Down
4 changes: 2 additions & 2 deletions jageocoder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
>>> jageocoder.searchNode('<Japanese-address>')
"""

__version__ = '1.0.0rc1' # The package version
__dictionary_version__ = '20211208' # Compatible dictionary version
__version__ = '1.0.0rc2' # The package version
__dictionary_version__ = '20211209' # Compatible dictionary version
__author__ = 'Takeshi Sagara <[email protected]>'

__all__ = [
Expand Down
2 changes: 1 addition & 1 deletion jageocoder/itaiji.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class Converter(object):
AddressLevel.WARD: re.compile(r'(区)$'),
AddressLevel.OAZA: re.compile(r'(町|条|線|丁|丁目|番|号|番丁|番町)$'),
AddressLevel.AZA: re.compile(r'(町|条|線|丁|丁目|区|番|号)$'),
AddressLevel.BLOCK: re.compile(r'(番|番地)$'),
AddressLevel.BLOCK: re.compile(r'(番|番地|号|地)$'),
AddressLevel.BLD: re.compile(r'(号|番地)$'),
}

Expand Down
2 changes: 1 addition & 1 deletion jageocoder/itaiji_dic.json

Large diffs are not rendered by default.

29 changes: 21 additions & 8 deletions jageocoder/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ def search_child_with_criteria(self, session, pattern: str,
AddressNode.id)
return filtered_children

def search_recursive(self, index, session) -> List[Result]:
def search_recursive(self, index, session,
processed_nodes=None) -> List[Result]:
"""
Search nodes recursively that match the specified address notation.
Expand All @@ -156,6 +157,8 @@ def search_recursive(self, index, session) -> List[Result]:
The standardized address notation.
session : sqlalchemy.orm.Session
The database session for executing SQL queries.
processed_nodes: List of int, optional
List of IDs of nodes that have already been processed.
Return
------
Expand Down Expand Up @@ -194,10 +197,12 @@ def search_recursive(self, index, session) -> List[Result]:
if filtered_children.count() == 0 and index[0] in '-ノ':
logger.debug("Beginning with an extra hyphen: {}".format(
index))
candidates = self.search_recursive(index[1:], session)
candidates = self.search_recursive(
index[1:], session, processed_nodes)
if len(candidates) > 0:
return [Result(x[0], index[0] + x[1], len(x[1]))
for x in candidates]
return [Result(
x[0], index[0] + x[1], l_optional_prefix + len(x[1]))
for x in candidates]

return []

Expand All @@ -208,6 +213,11 @@ def search_recursive(self, index, session) -> List[Result]:

candidates = []
for child in filtered_children:
if processed_nodes is not None and child.id in processed_nodes:
logger.debug("-> skipped; {}({})".format(
child.name, child.id))
continue

logger.debug("-> comparing; {}".format(child.name_index))
new_candidates = self._get_candidates_from_child(
child, index, optional_prefix, session)
Expand All @@ -227,11 +237,13 @@ def search_recursive(self, index, session) -> List[Result]:
rest_index = index[offset:]
logger.debug(
"child:{} match {} chars".format(child, offset))
for cand in child.search_recursive(rest_index, session):
for cand in child.search_recursive(
rest_index, session, processed_nodes):
candidates.append(
Result(cand[0],
optional_prefix +
index[0: offset] + cand[1],
l_optional_prefix +
len(child.name_index) + len(cand[1])
))

Expand All @@ -243,13 +255,13 @@ def search_recursive(self, index, session) -> List[Result]:
logger.debug('"{}" in index "{}" can be optional.'.format(
index[:azalen], index))
sub_candidates = self.search_recursive(
index[azalen:], session)
index[azalen:], session, processed_nodes)
if sub_candidates[0].matched != '':
for cand in sub_candidates:
candidates.append(Result(
cand.node,
optional_prefix + index[0:azalen] + cand.matched,
cand.nchars))
l_optional_prefix + cand.nchars))

if len(candidates) == 0:
candidates = [Result(self, '', 0)]
Expand Down Expand Up @@ -316,12 +328,13 @@ def _get_candidates_from_child(
candidates = []
offset = match_len
rest_index = index[offset:]
l_optional_prefix = len(optional_prefix)
logger.debug("child:{} match {} chars".format(child, offset))
for cand in child.search_recursive(rest_index, session):
candidates.append(Result(
cand.node,
optional_prefix + index[0:match_len] + cand.matched,
match_len + cand.nchars))
l_optional_prefix + match_len + cand.nchars))

return candidates

Expand Down
15 changes: 10 additions & 5 deletions jageocoder/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,7 +972,7 @@ def search_by_trie(self, query: str,
logger.debug("Search '{}' under {}({})".format(
rest_index, node.name, node.id))
results_by_node = node.search_recursive(
rest_index, self.session)
rest_index, self.session, processed_nodes)
processed_nodes.append(node.id)
logger.debug('{}({}) marked as processed'.format(
node.name, node.id))
Expand All @@ -989,16 +989,21 @@ def search_by_trie(self, query: str,
"""

_len = offset + cand.nchars
_part = offset + len(cand[1])
_part = offset + len(cand.matched)
msg = "candidate: {} ({})"
logger.debug(msg.format(key + cand.matched, _len))
if best_only:
if _len > max_len:
results = {}
# A strictly longer match supersedes everything collected so far.
# Key the entry by the node's actual id (not the literal string
# "cand.node.id") so the later `cand.node.id not in results`
# de-duplication check and `results[cand.node.id]` updates see it.
results = {
    cand.node.id: [cand.node, key + cand.matched]
}
max_len = _len
min_part = _part

if _len == max_len and cand.node.id not in results \
elif _len == max_len and cand.node.id not in results \
and (min_part is None or _part <= min_part):
results[cand.node.id] = [cand.node, key + cand[1]]
results[cand.node.id] = [
cand.node, key + cand.matched]
min_part = _part

else:
Expand Down
15 changes: 14 additions & 1 deletion tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def test_mie(self):
"""
self._check(
query="三重県津市広明町13番地",
match="三重県津市広明町13番地",
fullname=["三重県", "津市", "広明町", "13番地"])

def test_akita(self):
Expand All @@ -90,7 +91,7 @@ def test_oaza(self):
"""
Test notations with and without "大字"
"""
top = self._check(
self._check(
query="東京都西多摩郡瑞穂町大字箱根ケ崎2335番地",
match="東京都西多摩郡瑞穂町大字箱根ケ崎2335番地",
fullname=[
Expand Down Expand Up @@ -431,6 +432,18 @@ def test_mura_ooaza_koaza(self):
fullname=["徳島県", "美馬市", "脇町", "大字猪尻",
"西上野", "61番地"])

def test_select_best(self):
"""
Check that the best answer is returned for ambiguous queries.

The query used here has two possible parses; the assertions expect
the one that ends at the "藤津甲" node, leaving the trailing "2"
outside of the matched string.
"""
# "佐賀県鹿島市納富分字藤津甲2" can be parsed as
# - ["佐賀県", "鹿島市", "大字納富分", "藤津甲"] or
# - ["佐賀県", "鹿島市", "大字納富分", "甲", "2番地"]
# The first parse is the expected one: `match` stops before the "2"
# and `fullname` ends with "藤津甲".
self._check(
query="佐賀県鹿島市納富分字藤津甲2",
match="佐賀県鹿島市納富分字藤津甲",
fullname=["佐賀県", "鹿島市", "大字納富分", "藤津甲"])


if __name__ == "__main__":
unittest.main()

0 comments on commit 69a2a62

Please sign in to comment.