Fixed to include the completed prefix when counting the number of matched characters.
t-sagara · Dec 9, 2021 · 69a2a62 · 69a2a62
1 parent d6c7318
commit 69a2a62
Show file tree

Hide file tree

Showing 7 changed files with 51 additions and 19 deletions.
diff --git a/itaiji_dic/itaiji_src.json b/itaiji_dic/itaiji_src.json
@@ -37,6 +37,7 @@
   "煙": "烟",
   "艶": "艷",
   "塩": "鹽",
+  "園": "薗",
   "奥": "奧",
   "往": "徃",
   "応": "應",
@@ -264,7 +265,7 @@
   "棲": "栖",
   "声": "聲",
   "静": "靜",
-  "斉": "斎齊",
+  "斉": "斎齋齊齊済齋齊斎斎薺濟斎斉薺齎濟齊齏霽",
   "跡": "迹蹟",
   "摂": "攝",
   "窃": "竊",

diff --git a/jageocoder/__init__.py b/jageocoder/__init__.py
@@ -19,8 +19,8 @@
     >>> jageocoder.searchNode('<Japanese-address>')
 """
 
-__version__ = '1.0.0rc1'  # The package version
-__dictionary_version__ = '20211208'  # Compatible dictionary version
+__version__ = '1.0.0rc2'  # The package version
+__dictionary_version__ = '20211209'  # Compatible dictionary version
 __author__ = 'Takeshi Sagara <[email protected]>'
 
 __all__ = [

diff --git a/jageocoder/itaiji.py b/jageocoder/itaiji.py
@@ -20,7 +20,7 @@ class Converter(object):
         AddressLevel.WARD: re.compile(r'(区)$'),
         AddressLevel.OAZA: re.compile(r'(町|条|線|丁|丁目|番|号|番丁|番町)$'),
         AddressLevel.AZA: re.compile(r'(町|条|線|丁|丁目|区|番|号)$'),
-        AddressLevel.BLOCK: re.compile(r'(番|番地)$'),
+        AddressLevel.BLOCK: re.compile(r'(番|番地|号|地)$'),
         AddressLevel.BLD: re.compile(r'(号|番地)$'),
     }
 

diff --git a/jageocoder/itaiji_dic.json b/jageocoder/itaiji_dic.json
diff --git a/jageocoder/node.py b/jageocoder/node.py
@@ -146,7 +146,8 @@ def search_child_with_criteria(self, session, pattern: str,
             AddressNode.id)
         return filtered_children
 
-    def search_recursive(self, index, session) -> List[Result]:
+    def search_recursive(self, index, session,
+                         processed_nodes=None) -> List[Result]:
         """
         Search nodes recursively that match the specified address notation.
 
@@ -156,6 +157,8 @@ def search_recursive(self, index, session) -> List[Result]:
             The standardized address notation.
         session : sqlalchemy.orm.Session
             The database session for executing SQL queries.
+        processed_nodes: List of int, optional
+            List of IDs of nodes that have already been processed.
 
         Return
         ------
@@ -194,10 +197,12 @@ def search_recursive(self, index, session) -> List[Result]:
         if filtered_children.count() == 0 and index[0] in '-ノ':
             logger.debug("Beginning with an extra hyphen: {}".format(
                 index))
-            candidates = self.search_recursive(index[1:], session)
+            candidates = self.search_recursive(
+                index[1:], session, processed_nodes)
             if len(candidates) > 0:
-                return [Result(x[0], index[0] + x[1], len(x[1]))
-                        for x in candidates]
+                return [Result(
+                    x[0], index[0] + x[1], l_optional_prefix + len(x[1]))
+                    for x in candidates]
 
             return []
 
@@ -208,6 +213,11 @@ def search_recursive(self, index, session) -> List[Result]:
 
         candidates = []
         for child in filtered_children:
+            if processed_nodes is not None and child.id in processed_nodes:
+                logger.debug("-> skipped; {}({})".format(
+                    child.name, child.id))
+                continue
+
             logger.debug("-> comparing; {}".format(child.name_index))
             new_candidates = self._get_candidates_from_child(
                 child, index, optional_prefix, session)
@@ -227,11 +237,13 @@ def search_recursive(self, index, session) -> List[Result]:
                     rest_index = index[offset:]
                     logger.debug(
                         "child:{} match {} chars".format(child, offset))
-                    for cand in child.search_recursive(rest_index, session):
+                    for cand in child.search_recursive(
+                            rest_index, session, processed_nodes):
                         candidates.append(
                             Result(cand[0],
                                    optional_prefix +
                                    index[0: offset] + cand[1],
+                                   l_optional_prefix +
                                    len(child.name_index) + len(cand[1])
                                    ))
 
@@ -243,13 +255,13 @@ def search_recursive(self, index, session) -> List[Result]:
                 logger.debug('"{}" in index "{}" can be optional.'.format(
                     index[:azalen], index))
                 sub_candidates = self.search_recursive(
-                    index[azalen:], session)
+                    index[azalen:], session, processed_nodes)
                 if sub_candidates[0].matched != '':
                     for cand in sub_candidates:
                         candidates.append(Result(
                             cand.node,
                             optional_prefix + index[0:azalen] + cand.matched,
-                            cand.nchars))
+                            l_optional_prefix + cand.nchars))
 
         if len(candidates) == 0:
             candidates = [Result(self, '', 0)]
@@ -316,12 +328,13 @@ def _get_candidates_from_child(
         candidates = []
         offset = match_len
         rest_index = index[offset:]
+        l_optional_prefix = len(optional_prefix)
         logger.debug("child:{} match {} chars".format(child, offset))
         for cand in child.search_recursive(rest_index, session):
             candidates.append(Result(
                 cand.node,
                 optional_prefix + index[0:match_len] + cand.matched,
-                match_len + cand.nchars))
+                l_optional_prefix + match_len + cand.nchars))
 
         return candidates
 

diff --git a/jageocoder/tree.py b/jageocoder/tree.py
@@ -972,7 +972,7 @@ def search_by_trie(self, query: str,
                 logger.debug("Search '{}' under {}({})".format(
                     rest_index, node.name, node.id))
                 results_by_node = node.search_recursive(
-                    rest_index, self.session)
+                    rest_index, self.session, processed_nodes)
                 processed_nodes.append(node.id)
                 logger.debug('{}({}) marked as processed'.format(
                     node.name, node.id))
@@ -989,16 +989,21 @@ def search_by_trie(self, query: str,
                     """
 
                     _len = offset + cand.nchars
-                    _part = offset + len(cand[1])
+                    _part = offset + len(cand.matched)
+                    msg = "candidate: {} ({})"
+                    logger.debug(msg.format(key + cand.matched, _len))
                     if best_only:
                         if _len > max_len:
-                            results = {}
+                            results = {
+                                "cand.node.id": [cand.node, key + cand.matched]
+                            }
                             max_len = _len
                             min_part = _part
 
-                        if _len == max_len and cand.node.id not in results \
+                        elif _len == max_len and cand.node.id not in results \
                                 and (min_part is None or _part <= min_part):
-                            results[cand.node.id] = [cand.node, key + cand[1]]
+                            results[cand.node.id] = [
+                                cand.node, key + cand.matched]
                             min_part = _part
 
                     else:

diff --git a/tests/test_search.py b/tests/test_search.py
@@ -67,6 +67,7 @@ def test_mie(self):
         """
         self._check(
             query="三重県津市広明町13番地",
+            match="三重県津市広明町13番地",
             fullname=["三重県", "津市", "広明町", "13番地"])
 
     def test_akita(self):
@@ -90,7 +91,7 @@ def test_oaza(self):
         """
         Test notations with and without "大字"
         """
-        top = self._check(
+        self._check(
             query="東京都西多摩郡瑞穂町大字箱根ケ崎2335番地",
             match="東京都西多摩郡瑞穂町大字箱根ケ崎2335番地",
             fullname=[
@@ -431,6 +432,18 @@ def test_mura_ooaza_koaza(self):
             fullname=["徳島県", "美馬市", "脇町", "大字猪尻",
                       "西上野", "61番地"])
 
+    def test_select_best(self):
+        """
+        Check that the best answer is returned for ambiguous queries.
+        """
+        # "佐賀県鹿島市納富分字藤津甲２" can be parsed as
+        # - ["佐賀県", "鹿島市", "大字納富分", "藤津甲"] or
+        # - ["佐賀県", "鹿島市", "大字納富分", "甲", "2番地"]
+        self._check(
+            query="佐賀県鹿島市納富分字藤津甲２",
+            match="佐賀県鹿島市納富分字藤津甲",
+            fullname=["佐賀県", "鹿島市", "大字納富分", "藤津甲"])
+
 
 if __name__ == "__main__":
     unittest.main()