From 4cbbaa7835299c390a51189adef93ce5bd8f980b Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 5 Mar 2018 12:22:10 +0800 Subject: [PATCH 01/31] puchikarui & lxml may not be available pre-setup --- jamdict/__init__.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/jamdict/__init__.py b/jamdict/__init__.py index 8030769..405102a 100644 --- a/jamdict/__init__.py +++ b/jamdict/__init__.py @@ -54,16 +54,18 @@ __url__ = "https://github.com/neocl/jamdict" __maintainer__ = "Le Tuan Anh" __version_major__ = "0.1" -__version__ = "{}a1".format(__version_major__) +__version__ = "{}a2".format(__version_major__) __version_long__ = "{} - Alpha".format(__version_major__) __status__ = "Prototype" ######################################################################## -from .jmdict_sqlite import JMDictSQLite -from .kanjidic2_sqlite import KanjiDic2SQLite -from .util import Jamdict, JMDictXML, KanjiDic2XML +import logging -######################################################################## - -__all__ = ['Jamdict', 'JMDictSQLite', 'JMDictXML', 'KanjiDic2SQLite', 'KanjiDic2XML'] +try: + from .jmdict_sqlite import JMDictSQLite + from .kanjidic2_sqlite import KanjiDic2SQLite + from .util import Jamdict, JMDictXML, KanjiDic2XML + __all__ = ['Jamdict', 'JMDictSQLite', 'JMDictXML', 'KanjiDic2SQLite', 'KanjiDic2XML'] +except: + logging.getLogger(__name__).exception("jamdict package was not loaded properly") From fd8408161614427cfb802067f169e7e3aec5b3e2 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:55:10 +0800 Subject: [PATCH 02/31] Better setup script --- MANIFEST.in | 3 +- data/README.md | 2 +- jamdict/__version__.py | 15 +++++ jamdict/{scripts => data}/setup_jmdict.sql | 0 jamdict/{scripts => data}/setup_kanjidic2.sql | 0 setup.py | 67 ++++++------------- {data => test/data}/JMdict_mini.xml | 0 .../data/kanjidic2_mini.xml | 0 8 files changed, 37 insertions(+), 50 deletions(-) create mode 100644 jamdict/__version__.py rename jamdict/{scripts => data}/setup_jmdict.sql (100%) rename jamdict/{scripts => data}/setup_kanjidic2.sql (100%) rename {data => test/data}/JMdict_mini.xml (100%) rename data/kanjidic2.mini.xml => test/data/kanjidic2_mini.xml (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 462f989..4853fac 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include README.rst include CHANGES.md include LICENSE -recursive-include jamdict/scripts/ *.sql +recursive-include jamdict/data/ *.sql +recursive-include jamdict/data/ *.json diff --git a/data/README.md b/data/README.md index 512d3b2..cb2a2c3 100644 --- a/data/README.md +++ b/data/README.md @@ -1 +1 @@ -Copy JMDict dictionary file (JMdict_e.xml) here \ No newline at end of file +Copy dictionary files (JMdict_e.xml, kanjidic2.xml, kradfile, etc.) here diff --git a/jamdict/__version__.py b/jamdict/__version__.py new file mode 100644 index 0000000..6da97c6 --- /dev/null +++ b/jamdict/__version__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- + +# jamdict's package version information +__author__ = "Le Tuan Anh" +__email__ = "tuananh.ke@gmail.com" +__copyright__ = "Copyright (c) 2016, Le Tuan Anh" +__credits__ = [] +__license__ = "MIT License" +__description__ = "Python library for manipulating Jim Breen's JMdict & KanjiDic2" +__url__ = "https://github.com/neocl/jamdict" +__maintainer__ = "Le Tuan Anh" +__version_major__ = "0.1" +__version__ = "{}a2".format(__version_major__) +__version_long__ = "{} - Alpha".format(__version_major__) +__status__ = "Prototype" diff --git a/jamdict/scripts/setup_jmdict.sql b/jamdict/data/setup_jmdict.sql similarity index 100% rename from jamdict/scripts/setup_jmdict.sql rename to jamdict/data/setup_jmdict.sql diff --git a/jamdict/scripts/setup_kanjidic2.sql b/jamdict/data/setup_kanjidic2.sql similarity index 100% rename from jamdict/scripts/setup_kanjidic2.sql rename to jamdict/data/setup_kanjidic2.sql diff --git a/setup.py b/setup.py index 119ca09..4ed5192 100644 --- a/setup.py +++ b/setup.py @@ -3,51 +3,20 @@ ''' Setup script for jamdict -Latest version can be found at https://github.com/neocl/jamdict -References: - Python documentation: - https://docs.python.org/ - argparse module: - https://docs.python.org/3/howto/argparse.html - PEP 257 - Python Docstring Conventions: - https://www.python.org/dev/peps/pep-0257/ +Latest version can be found at https://github.com/neocl/jamdict -@author: Le Tuan Anh +:copyright: (c) 2012 Le Tuan Anh +:license: MIT, see LICENSE for more details. ''' -# Copyright (c) 2016, Le Tuan Anh -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -######################################################################## - import io import os from setuptools import setup -import jamdict - - -######################################################################## def read(*filenames, **kwargs): + ''' Read contents of multiple files and join them together ''' encoding = kwargs.get('encoding', 'utf-8') sep = kwargs.get('sep', '\n') buf = [] @@ -58,27 +27,29 @@ def read(*filenames, **kwargs): readme_file = 'README.rst' if os.path.isfile('README.rst') else 'README.md' -print("README file: {}".format(readme_file)) long_description = read(readme_file) +pkg_info = {} +exec(read('jamdict/__version__.py'), pkg_info) + setup( - name='jamdict', - version=jamdict.__version__, - url=jamdict.__url__, + name='jamdict', # package file name (-version.tar.gz) + version=pkg_info['__version__'], + url=pkg_info['__url__'], project_urls={ "Bug Tracker": "https://github.com/neocl/jamdict/issues", "Source Code": "https://github.com/neocl/jamdict/" }, - keywords="japanese dictionary sqlite xml", - license=jamdict.__license__, - author=jamdict.__author__, - tests_require=['lxml', 'puchikarui'], - install_requires=['lxml', 'puchikarui'], - author_email=jamdict.__email__, - description=jamdict.__description__, + keywords="nlp", + license=pkg_info['__license__'], + author=pkg_info['__author__'], + tests_require=['lxml', 'chirptext', 'puchikarui'], + install_requires=['lxml', 'chirptext', 'puchikarui'], + author_email=pkg_info['__email__'], + description=pkg_info['__description__'], long_description=long_description, packages=['jamdict'], - package_data={'jamdict': ['scripts/*.sql']}, + package_data={'jamdict': ['data/*.sql', 'data/*.json']}, include_package_data=True, platforms='any', test_suite='test', @@ -88,7 +59,7 @@ def read(*filenames, **kwargs): 'Natural Language :: Japanese', 'Environment :: Plugins', 'Intended Audience :: Developers', - 'License :: OSI Approved :: {}'.format(jamdict.__license__), + 'License :: OSI Approved :: {}'.format(pkg_info['__license__']), 'Operating System :: OS Independent', 'Topic :: Database', 'Topic :: Text Processing :: Linguistic', diff --git a/data/JMdict_mini.xml b/test/data/JMdict_mini.xml similarity index 100% rename from data/JMdict_mini.xml rename to test/data/JMdict_mini.xml diff --git a/data/kanjidic2.mini.xml b/test/data/kanjidic2_mini.xml similarity index 100% rename from data/kanjidic2.mini.xml rename to test/data/kanjidic2_mini.xml From 3afa4a947b2ded6a88dc2cc45fa7d96539e6a236 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:55:37 +0800 Subject: [PATCH 03/31] Use chirptext's AppConfig to config jamdict --- jamdict/config.py | 91 +++++++++++++++++++++++++++++++ jamdict/data/config_template.json | 8 +++ requirements.txt | 1 + 3 files changed, 100 insertions(+) create mode 100644 jamdict/config.py create mode 100644 jamdict/data/config_template.json diff --git a/jamdict/config.py b/jamdict/config.py new file mode 100644 index 0000000..ae16b61 --- /dev/null +++ b/jamdict/config.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- + +''' +Jamdict configuration management + +Latest version can be found at https://github.com/neocl/jamdict + +@author: Le Tuan Anh +@license: MIT +''' + +# Copyright (c) 2016, Le Tuan Anh +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +######################################################################## + +import os +import logging + +from chirptext import AppConfig +from chirptext.io import read_file, write_file + +# ---------------------------------------------------------------------- +# Configuration +# ---------------------------------------------------------------------- + +MY_DIR = os.path.dirname(__file__) +CONFIG_TEMPLATE = os.path.join(MY_DIR, 'data', 'config_template.json') +__jamdict_home = os.environ.get('JAMDICT_HOME', MY_DIR) +__app_config = AppConfig('jamdict', mode=AppConfig.JSON, working_dir=__jamdict_home) + + +def getLogger(): + return logging.getLogger(__name__) + + +def _get_config_manager(): + ''' Internal function for retrieving application config manager object + Don't use this directly, use read_config() method instead + ''' + return __app_config + + +def read_config(): + if not __app_config.config and not __app_config.locate_config(): + # need to create a config + config_dir = os.path.expanduser('~/.jamdict/') + if not os.path.exists(config_dir): + os.makedirs(config_dir) + cfg_loc = os.path.join(config_dir, 'config.json') + default_config = read_file(CONFIG_TEMPLATE) + getLogger().warning("Jamdict configuration file could not be found. A new configuration file will be generated at {}".format(cfg_loc)) + getLogger().debug("Default config: {}".format(default_config)) + write_file(cfg_loc, default_config) + # read config + config = __app_config.config + return config + + +def home_dir(): + _config = read_config() + return _config.get('JAMDICT_HOME', '.') + + +def data_dir(): + _config = read_config() + _data_dir = _config.get('JAMDICT_DATA', '{JAMDICT_HOME}/data').format(JAMDICT_HOME=home_dir()) + return _data_dir + + +def get_file(file_key): + _config = read_config() + _data_dir = data_dir() + return _config.get(file_key).format(JAMDICT_DATA=_data_dir) diff --git a/jamdict/data/config_template.json b/jamdict/data/config_template.json new file mode 100644 index 0000000..a014cbb --- /dev/null +++ b/jamdict/data/config_template.json @@ -0,0 +1,8 @@ +{ + "JAMDICT_HOME": "~/.jamdict", + "JAMDICT_DATA": "{JAMDICT_HOME}/data", + "JAMDICT_DB": "{JAMDICT_DATA}/jamdict.db", + "JMDICT_XML": "{JAMDICT_DATA}/JMdict_e.gz", + "KD2_XML": "{JAMDICT_DATA}/kanjidic2.xml.gz", + "KRADFILE": "{JAMDICT_DATA}/kradfile-u.gz" +} diff --git a/requirements.txt b/requirements.txt index 539e6ba..3134372 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ lxml +chirptext puchikarui From 57976219830eb51c8c41e031c9ff492694174540 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:56:08 +0800 Subject: [PATCH 04/31] Update jmd command line tool --- jamdict/__init__.py | 30 ++++++++---------------------- jmd | 2 +- 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/jamdict/__init__.py b/jamdict/__init__.py index 405102a..23377be 100644 --- a/jamdict/__init__.py +++ b/jamdict/__init__.py @@ -44,28 +44,14 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. - -__author__ = "Le Tuan Anh" -__email__ = "tuananh.ke@gmail.com" -__copyright__ = "Copyright 2016, jamdict" -__credits__ = [] -__license__ = "MIT License" -__description__ = "Python library for manipulating Jim Breen's JMdict & KanjiDic2" -__url__ = "https://github.com/neocl/jamdict" -__maintainer__ = "Le Tuan Anh" -__version_major__ = "0.1" -__version__ = "{}a2".format(__version_major__) -__version_long__ = "{} - Alpha".format(__version_major__) -__status__ = "Prototype" - ######################################################################## -import logging +from .__version__ import __author__, __email__, __copyright__, __maintainer__ +from .__version__ import __credits__, __license__, __description__, __url__ +from .__version__ import __version_major__, __version_long__, __version__, __status__ -try: - from .jmdict_sqlite import JMDictSQLite - from .kanjidic2_sqlite import KanjiDic2SQLite - from .util import Jamdict, JMDictXML, KanjiDic2XML - __all__ = ['Jamdict', 'JMDictSQLite', 'JMDictXML', 'KanjiDic2SQLite', 'KanjiDic2XML'] -except: - logging.getLogger(__name__).exception("jamdict package was not loaded properly") +from .jmdict_sqlite import JMDictSQLite +from .kanjidic2_sqlite import KanjiDic2SQLite +from .util import Jamdict, JMDictXML, KanjiDic2XML +__all__ = ['Jamdict', 'JMDictSQLite', 'JMDictXML', 'KanjiDic2SQLite', 'KanjiDic2XML', + "__version__", "__author__", "__description__", "__copyright__"] diff --git a/jmd b/jmd index 29436d0..ed093b6 100755 --- a/jmd +++ b/jmd @@ -2,5 +2,5 @@ export JAMDICT_HOME=~/local/jamdict cd ${JAMDICT_HOME} -python3 -m jamdict.tools lookup $1 +python3 -m jamdict.tools lookup "$@" From 2b7a4ce3f23c22b27c7250b0cbddecbc46cde2a4 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:56:36 +0800 Subject: [PATCH 05/31] Use chirptext.io to read files. *Reading -> *Form --- jamdict/jmdict.py | 53 ++++++++++++++++++++++++++++++-------------- jamdict/kanjidic2.py | 34 +++++++++++++++------------- 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/jamdict/jmdict.py b/jamdict/jmdict.py index db1358b..1c8a68b 100644 --- a/jamdict/jmdict.py +++ b/jamdict/jmdict.py @@ -50,6 +50,8 @@ import logging from lxml import etree +from chirptext import io as chio + logger = logging.getLogger(__name__) ######################################################################## @@ -64,8 +66,8 @@ class JMDEntry(object): def __init__(self, idseq=''): # A unique numeric sequence number for each entry self.idseq = idseq # ent_seq - self.kanji_forms = [] # k_ele* => KanjiReading[] - self.kana_forms = [] # r_ele+ => KanaReading[] + self.kanji_forms = [] # k_ele* => KanjiForm[] + self.kana_forms = [] # r_ele+ => KanaForm[] self.info = None # info? => EntryInfo self.senses = [] # sense+ @@ -103,7 +105,7 @@ def to_json(self): return ed -class KanjiReading(object): +class KanjiForm(object): ''' The kanji element, or in its absence, the reading element, is the defining component of each entry. The overwhelming majority of entries will have a single kanji @@ -174,8 +176,14 @@ def to_json(self): kjd['pri'] = self.pri return kjd + def __repr__(self): + return str(self) -class KanaReading(object): + def __str__(self): + return self.text + + +class KanaForm(object): ''' The reading element typically contains the valid readings of the word(s) in the kanji element using modern kanadzukai. @@ -228,6 +236,12 @@ def to_json(self): knd['pri'] = self.pri return knd + def __repr__(self): + return str(self) + + def __str__(self): + return self.text + class EntryInfo(object): '''General coded information relating to the entry as a whole. @@ -375,8 +389,11 @@ def __repr__(self): return str(self) def __str__(self): + return self.text(compact=False) + + def text(self, compact=True): tmp = [str(x) for x in self.gloss] - if self.pos: + if not compact and self.pos: return '{gloss} ({pos})'.format(gloss='/'.join(tmp), pos=('(%s)' % '|'.join(self.pos))) else: return '/'.join(tmp) @@ -513,16 +530,18 @@ def __init__(self): def parse_file(self, jmdict_file_path): ''' Parse JMDict_e.xml file and return a list of JMDEntry objects ''' - logger.debug('Loading data from file: %s' % (os.path.abspath(jmdict_file_path))) - - tree = etree.iterparse(jmdict_file_path) - entries = [] - for event, element in tree: - if event == 'end' and element.tag == 'entry': - entries.append(self.parse_entry_tag(element)) - # and then we can clear the element to save memory - element.clear() - return entries + actual_path = os.path.abspath(os.path.expanduser(jmdict_file_path)) + logger.debug('Loading data from file: {}'.format(actual_path)) + + with chio.open(actual_path, mode='rb') as jmfile: + tree = etree.iterparse(jmfile) + entries = [] + for event, element in tree: + if event == 'end' and element.tag == 'entry': + entries.append(self.parse_entry_tag(element)) + # and then we can clear the element to save memory + element.clear() + return entries def parse_entry_tag(self, etag): '''Parse a lxml XML Node and generate a JMDEntry entry''' @@ -559,7 +578,7 @@ def get_single(self, tag_name, a_tag): return children[0] def parse_k_ele(self, k_ele, entry): - kr = KanjiReading() + kr = KanjiForm() for child in k_ele: if child.tag == 'keb': kr.set_text(child.text) @@ -574,7 +593,7 @@ def parse_k_ele(self, k_ele, entry): return kr def parse_r_ele(self, r_ele, entry): - kr = KanaReading() + kr = KanaForm() for child in r_ele: if child.tag == 'reb': kr.set_text(child.text) diff --git a/jamdict/kanjidic2.py b/jamdict/kanjidic2.py index b0616a8..e9a25a9 100644 --- a/jamdict/kanjidic2.py +++ b/jamdict/kanjidic2.py @@ -51,6 +51,8 @@ import logging from lxml import etree +from chirptext import io as chio + # ------------------------------------------------------------------------------ # Configuration @@ -516,22 +518,24 @@ def get_attrib(self, a_tag, attr_name, default_value=''): else: return default_value - def parse_file(self, jmdict_file_path): - ''' Parse JMDict_e.xml file and return a list of JMDEntry objects + def parse_file(self, kd2_file_path): + ''' Parse all characters from Kanjidic2 XML file ''' - getLogger().debug('Loading data from file: %s' % (os.path.abspath(jmdict_file_path))) - - tree = etree.iterparse(jmdict_file_path) - kd2 = None - for event, element in tree: - if event == 'end': - if element.tag == 'header': - kd2 = self.parse_header(element) - element.clear() - elif element.tag == 'character': - kd2.characters.append(self.parse_char(element)) - element.clear() - return kd2 + actual_path = os.path.abspath(os.path.expanduser(kd2_file_path)) + getLogger().debug('Loading data from file: {}'.format(actual_path)) + + with chio.open(actual_path, mode='rb') as kd2file: + tree = etree.iterparse(kd2file) + kd2 = None + for event, element in tree: + if event == 'end': + if element.tag == 'header': + kd2 = self.parse_header(element) + element.clear() + elif element.tag == 'character': + kd2.characters.append(self.parse_char(element)) + element.clear() + return kd2 def parse_header(self, e): fv = None From 5ad5ee3932b5114a1fa64422b30f5dfe62151870 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:57:14 +0800 Subject: [PATCH 06/31] *Reading -> *Form --- jamdict/jmdict_sqlite.py | 8 ++++---- jamdict/kanjidic2_sqlite.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py index f83e900..3c77652 100644 --- a/jamdict/jmdict_sqlite.py +++ b/jamdict/jmdict_sqlite.py @@ -43,7 +43,7 @@ from puchikarui import Schema from . import __version__ as JAMDICT_VERSION, __url__ as JAMDICT_URL -from .jmdict import JMDEntry, EntryInfo, Link, BibInfo, Audit, KanjiReading, KanaReading, Sense, SenseGloss, LSource +from .jmdict import JMDEntry, EntryInfo, Link, BibInfo, Audit, KanjiForm, KanaForm, Sense, SenseGloss, LSource # ------------------------------------------------------------------------------- @@ -51,7 +51,7 @@ # ------------------------------------------------------------------------------- MY_FOLDER = os.path.dirname(os.path.abspath(__file__)) -SCRIPT_FOLDER = os.path.join(MY_FOLDER, 'scripts') +SCRIPT_FOLDER = os.path.join(MY_FOLDER, 'data') JMDICT_SETUP_FILE = os.path.join(SCRIPT_FOLDER, 'setup_jmdict.sql') JMDICT_VERSION = '1.08' JMDICT_URL = 'http://www.csse.monash.edu.au/~jwb/edict.html' @@ -183,7 +183,7 @@ def get_entry(self, idseq, ctx=None): # select kanji kanjis = ctx.Kanji.select('idseq=?', (idseq,)) for dbkj in kanjis: - kj = KanjiReading(dbkj.text) + kj = KanjiForm(dbkj.text) kjis = ctx.KJI.select('kid=?', (dbkj.ID,)) for i in kjis: kj.info.append(i.text) @@ -194,7 +194,7 @@ def get_entry(self, idseq, ctx=None): # select kana kanas = ctx.Kana.select('idseq=?', (idseq,)) for dbkn in kanas: - kn = KanaReading(dbkn.text, dbkn.nokanji) + kn = KanaForm(dbkn.text, dbkn.nokanji) knis = ctx.KNI.select('kid=?', (dbkn.ID,)) for i in knis: kn.info.append(i.text) diff --git a/jamdict/kanjidic2_sqlite.py b/jamdict/kanjidic2_sqlite.py index e550015..f7f0501 100644 --- a/jamdict/kanjidic2_sqlite.py +++ b/jamdict/kanjidic2_sqlite.py @@ -50,7 +50,7 @@ # ------------------------------------------------------------------------------ MY_FOLDER = os.path.dirname(os.path.abspath(__file__)) -SCRIPT_FOLDER = os.path.join(MY_FOLDER, 'scripts') +SCRIPT_FOLDER = os.path.join(MY_FOLDER, 'data') KANJIDIC2_SETUP_FILE = os.path.join(SCRIPT_FOLDER, 'setup_kanjidic2.sql') KANJIDIC2_SETUP_SCRIPT = ''' INSERT INTO meta SELECT 'generator', 'jamdict' From d9f81247646b0f9a258402eb4fc6cd99da865958 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:57:34 +0800 Subject: [PATCH 07/31] Make Jamdict class easier to use --- jamdict/util.py | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/jamdict/util.py b/jamdict/util.py index 0d01653..54f6c7b 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -47,9 +47,14 @@ ######################################################################## +import os import logging import threading from collections import defaultdict as dd + +from chirptext import io as chio + +from . import config from .jmdict import JMDictXMLParser from .jmdict_sqlite import JMDictSQLite from .kanjidic2 import Kanjidic2XMLParser @@ -58,11 +63,11 @@ ######################################################################## -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) +def getLogger(): + return logging.getLogger(__name__) -######################################################################## +######################################################################## class LookupResult(object): @@ -85,10 +90,14 @@ class Jamdict(object): def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None): # file paths configuration - self.db_file = db_file - self.kd2_file = kd2_file - self.jmd_xml_file = jmd_xml_file - self.kd2_xml_file = kd2_xml_file + self.db_file = db_file if db_file else config.get_file('JAMDICT_DB') + self.kd2_file = kd2_file if kd2_file else config.get_file('JAMDICT_DB') + if not self.db_file: + getLogger().warning("JAMDICT_DB could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first") + if not self.kd2_file: + getLogger().warning("Kanjidic2 database could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first") + self.jmd_xml_file = jmd_xml_file if jmd_xml_file else config.get_file('JMDICT_XML') + self.kd2_xml_file = kd2_xml_file if kd2_xml_file else config.get_file('KD2_XML') # data sources self._db_sqlite = None self._kd2_sqlite = None @@ -99,9 +108,10 @@ def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file= def jmdict(self): if not self._db_sqlite and self.db_file: with threading.Lock(): - if not self.kd2_file: + if not self.kd2_file or self.kd2_file == self.db_file: # Use 1 DB for both self._db_sqlite = JamdictSQLite(self.db_file) + self._kd2_sqlite = self._db_sqlite else: # use 2 separated files self._db_sqlite = JMDictSQLite(self.db_file) @@ -109,10 +119,10 @@ def jmdict(self): @property def kd2(self): - if not self._kd2_sqlite: - if self.kd2_file: + if self._kd2_sqlite is None: + if self.kd2_file is not None: with threading.Lock(): - self.kd2_sqlite = KanjiDic2SQLite(self.kd2_file) + self._kd2_sqlite = KanjiDic2SQLite(self.kd2_file) else: self._kd2_sqlite = self.jmdict return self._kd2_sqlite @@ -141,14 +151,14 @@ def is_available(self): def import_data(self): ''' Import JMDict and KanjiDic2 data from XML to SQLite ''' if self.jmdict and self.jmdict_xml: - logger.info("Importing JMDict data") + getLogger().info("Importing JMDict data") self.jmdict.insert_entries(self.jmdict_xml) - if self.kd2 and self.kd2_xml: - logger.info("Importing KanjiDic2 data") + if self.kd2 is not None and self.kd2_xml: + getLogger().info("Importing KanjiDic2 data") self.kd2.insert_chars(self.kd2_xml) def get_char(self, literal): - if self.kd2: + if self.kd2 is not None: return self.kd2.get_char(literal) elif self.kd2_xml: return self.kd2_xml.lookup(literal) @@ -221,7 +231,7 @@ def lookup(self, a_query): @staticmethod def from_file(filename): parser = JMDictXMLParser() - return JMDictXML(parser.parse_file(filename)) + return JMDictXML(parser.parse_file(os.path.abspath(os.path.expanduser(filename)))) class KanjiDic2XML(object): @@ -233,7 +243,7 @@ def __init__(self, kd2): self.char_map = {} for char in self.kd2: if char.literal in self.char_map: - logger.warning("Duplicate character entry: {}".format(char.literal)) + getLogger().warning("Duplicate character entry: {}".format(char.literal)) self.char_map[char.literal] = char def __len__(self): From e2770beec841123c37fa6663b1fe9f3a43c98ee5 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:58:23 +0800 Subject: [PATCH 08/31] Use chirptext setup_logging --- test/__init__.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/__init__.py b/test/__init__.py index e69de29..75830db 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + +''' Jamdict Test Scripts +Latest version can be found at https://github.com/neocl/jamdict/ + +:copyright: (c) 2016 Le Tuan Anh +:license: MIT, see LICENSE for more details. +''' + +# This source code is a part of jamdict library +# Copyright (c) 2016, Le Tuan Anh +# LICENSE: The MIT License (MIT) +# +# Homepage: https://github.com/neocl/jamdict + +import os +from chirptext.cli import setup_logging + +TEST_DIR = os.path.dirname(__file__) +TEST_DATA = os.path.join(TEST_DIR, 'data') + +setup_logging(os.path.join(TEST_DIR, 'logging.json'), os.path.join(TEST_DIR, 'logs')) From 70bc6792e530e549e0ac6f35219f10c3d67c647d Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:58:36 +0800 Subject: [PATCH 09/31] Add TestConfig --- test/test_jamdict.py | 76 +++++++++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/test/test_jamdict.py b/test/test_jamdict.py index 5dd6750..aff81b4 100644 --- a/test/test_jamdict.py +++ b/test/test_jamdict.py @@ -21,23 +21,23 @@ # Copyright (c) 2016, Le Tuan Anh # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in -#all copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -#THE SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. __author__ = "Le Tuan Anh " __copyright__ = "Copyright 2016, jamdict" @@ -48,19 +48,48 @@ import os import logging import unittest +from jamdict import config from jamdict.jmdict import JMDictXMLParser from jamdict.kanjidic2 import Kanjidic2XMLParser from jamdict import Jamdict, JMDictXML ######################################################################## -MINI_JMD = 'data/JMdict_mini.xml' -MINI_KD2 = 'data/kanjidic2.mini.xml' MY_DIR = os.path.abspath(os.path.dirname(__file__)) TEST_DATA = os.path.join(MY_DIR, 'data') +MINI_JMD = os.path.join(TEST_DATA, 'JMdict_mini.xml') +MINI_KD2 = os.path.join(TEST_DATA, 'kanjidic2_mini.xml') TEST_DB = os.path.join(TEST_DATA, 'jamdict_test.db') +def getLogger(): + return logging.getLogger(__name__) + + +class TestConfig(unittest.TestCase): + + def test_config(self): + cfg = config.read_config() + self.assertIn('KD2_XML', cfg) + self.assertTrue(config.get_file('KD2_XML')) + getLogger().info("jamdict log file location: {}".format(config._get_config_manager().locate_config())) + + +class TestModels(unittest.TestCase): + + def test_basic_models(self): + parser = JMDictXMLParser() + entries = parser.parse_file(MINI_JMD) + self.assertEqual(len(entries), 230) # there are 230 test entries + e = entries[0] + self.assertEqual(len(e), 1) # there is only 1 sense + self.assertEqual(len(e[0].gloss), 1) # there is only 1 sense + # first sense in entry e to string -> with POS + self.assertEqual(str(e[0]), 'repetition mark in katakana ((noun (common) (futsuumeishi)))') + self.assertEqual(str(e[0].text()), 'repetition mark in katakana') # compact is enabled by default + self.assertEqual(str(e[0].gloss[0]), 'repetition mark in katakana') + + class TestJamdictXML(unittest.TestCase): @classmethod @@ -118,14 +147,17 @@ def test_jamdict_xml(self): self.assertEqual(len(result.chars), 2) self.assertEqual({c.literal for c in result.chars}, {'土', '産'}) - def test_jamdict_import(self): - jam = Jamdict(db_file=":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2) - jam.import_data() + +class TestJamdictSQLite(unittest.TestCase): def test_jamdict_sqlite_all(self): - jam = Jamdict(db_file=TEST_DB, jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2) + if os.path.isfile(TEST_DB): + os.unlink(TEST_DB) + jam = Jamdict(db_file=TEST_DB, kd2_file=TEST_DB, jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2) + # Lookup using XML result = jam.jmdict_xml.lookup('おみやげ') - print(result) + getLogger().debug("Results: {}".format(result)) + # Lookup using SQLite jam.import_data() # test lookup result = jam.lookup('おみやげ') From c24c87e35c50ee5a0926e7c41be1fd9f3eefb79a Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:58:44 +0800 Subject: [PATCH 10/31] Code refactoring --- test/test_jmdict_sqlite.py | 71 ++++++++++++++++++----------------- test/test_kanjidic2_sqlite.py | 69 +++++++++++++++++----------------- 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/test/test_jmdict_sqlite.py b/test/test_jmdict_sqlite.py index 42af04a..850ef6c 100644 --- a/test/test_jmdict_sqlite.py +++ b/test/test_jmdict_sqlite.py @@ -20,23 +20,23 @@ # Copyright (c) 2017, Le Tuan Anh # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in -#all copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -#THE SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. __author__ = "Le Tuan Anh " __copyright__ = "Copyright 2016, jamdict" @@ -54,36 +54,37 @@ from jamdict import JMDictSQLite -#------------------------------------------------------------------------------- -# CONFIGURATION -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- +# Configuration +# ------------------------------------------------------------------------------- -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -logger.addHandler(logging.StreamHandler(sys.stdout)) TEST_DIR = os.path.dirname(os.path.realpath(__file__)) TEST_DATA = os.path.join(TEST_DIR, 'data') if not os.path.isdir(TEST_DATA): os.makedirs(TEST_DATA) TEST_DB = os.path.join(TEST_DATA, 'test.db') RAM_DB = ':memory:' -MINI_DATA_FILE = 'data/JMdict_mini.xml' +MINI_JMD = os.path.join(TEST_DATA, 'JMdict_mini.xml') -#------------------------------------------------------------------------------- -# DATA STRUCTURES -#------------------------------------------------------------------------------- +def getLogger(): + return logging.getLogger(__name__) + + +# ------------------------------------------------------------------------------- +# Test cases +# ------------------------------------------------------------------------------- class TestJamdictSQLite(unittest.TestCase): db = JMDictSQLite(TEST_DB) - xdb = JMDictXML.from_file(MINI_DATA_FILE) + xdb = JMDictXML.from_file(MINI_JMD) ramdb = JMDictSQLite(RAM_DB) @classmethod def setUpClass(cls): if os.path.isfile(TEST_DB): - logger.info("Removing previous database file at {}".format(TEST_DB)) + getLogger().info("Removing previous database file at {}".format(TEST_DB)) os.unlink(TEST_DB) def test_xml2sqlite(self): @@ -91,7 +92,7 @@ def test_xml2sqlite(self): try: self.db.insert_entries(self.xdb) except: - logger.exception("Error happened while inserting entries") + getLogger().exception("Error happened while inserting entries") raise pass entries = self.db.Entry.select() @@ -100,7 +101,7 @@ def test_xml2sqlite(self): e = self.db.get_entry(1001710) ejson = e.to_json() self.assertEqual(ejson['kanji'][0]['text'], 'お菓子') - logger.debug(e.to_json()) + getLogger().debug(e.to_json()) def test_xml2ramdb(self): print("Testing XML to RAM") @@ -111,7 +112,7 @@ def test_xml2ramdb(self): def test_import_function(self): print("Testing JMDict import function") - jd = Jamdict(MINI_DATA_FILE, RAM_DB) + jd = Jamdict(MINI_JMD, RAM_DB) jd.import_data() def test_search(self): @@ -123,16 +124,16 @@ def test_search(self): # Search by kana es = self.ramdb.search('あの', ctx) self.assertEqual(len(es), 2) - logger.info('あの: {}'.format('|'.join([str(x) for x in es]))) + getLogger().info('あの: {}'.format('|'.join([str(x) for x in es]))) # Search by kanji es = self.db.search('%子%', ctx) self.assertEqual(len(es), 4) - logger.info('%子%: {}'.format('|'.join([str(x) for x in es]))) + getLogger().info('%子%: {}'.format('|'.join([str(x) for x in es]))) -#------------------------------------------------------------------------------- -# MAIN -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- +# Main +# ------------------------------------------------------------------------------- if __name__ == "__main__": unittest.main() diff --git a/test/test_kanjidic2_sqlite.py b/test/test_kanjidic2_sqlite.py index 17cf88b..b4b1988 100644 --- a/test/test_kanjidic2_sqlite.py +++ b/test/test_kanjidic2_sqlite.py @@ -20,23 +20,23 @@ # Copyright (c) 2017, Le Tuan Anh # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in -#all copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -#THE SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. __author__ = "Le Tuan Anh " __copyright__ = "Copyright 2017, jamdict" @@ -44,7 +44,6 @@ ######################################################################## -import sys import os import unittest import logging @@ -53,36 +52,37 @@ from jamdict import KanjiDic2XML -#------------------------------------------------------------------------------- -# CONFIGURATION -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- +# Configuration +# ------------------------------------------------------------------------------- -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -logger.addHandler(logging.StreamHandler(sys.stdout)) TEST_DIR = os.path.dirname(os.path.realpath(__file__)) TEST_DATA = os.path.join(TEST_DIR, 'data') if not os.path.isdir(TEST_DATA): os.makedirs(TEST_DATA) TEST_DB = os.path.join(TEST_DATA, 'jamcha.db') RAM_DB = ':memory:' -MINI_DATA_FILE = 'data/kanjidic2.mini.xml' +MINI_KD2 = os.path.join(TEST_DATA, 'kanjidic2_mini.xml') -#------------------------------------------------------------------------------- -# DATA STRUCTURES -#------------------------------------------------------------------------------- +def getLogger(): + return logging.getLogger(__name__) + + +# ------------------------------------------------------------------------------- +# Test cases +# ------------------------------------------------------------------------------- class TestJamdictSQLite(unittest.TestCase): db = KanjiDic2SQLite(TEST_DB) ramdb = KanjiDic2SQLite(RAM_DB) - xdb = KanjiDic2XML.from_file(MINI_DATA_FILE) + xdb = KanjiDic2XML.from_file(MINI_KD2) @classmethod def setUpClass(cls): if os.path.isfile(TEST_DB): - logger.info("Removing previous database file at {}".format(TEST_DB)) + getLogger().info("Removing previous database file at {}".format(TEST_DB)) os.unlink(TEST_DB) def test_xml2sqlite(self): @@ -95,12 +95,12 @@ def test_xml2sqlite(self): doc = self.xdb.kd2.date_of_creation db.update_meta(fv, dv, doc, ctx) metas = ctx.meta.select() - logger.debug("KanjiDic2 meta: {}".format(metas)) + getLogger().debug("KanjiDic2 meta: {}".format(metas)) for c in self.xdb: db.insert_char(c, ctx) c2 = db.char_by_id(c.ID, ctx) - logger.debug("c-xml", c.to_json()) - logger.debug("c-sqlite", c2.to_json()) + getLogger().debug("c-xml", c.to_json()) + getLogger().debug("c-sqlite", c2.to_json()) self.assertEqual(c.to_json(), c2.to_json()) # test searching # by id @@ -116,10 +116,9 @@ def test_xml2sqlite(self): self.assertTrue(c.rm_groups[0].meanings) - -#------------------------------------------------------------------------------- -# MAIN -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- +# Main +# ------------------------------------------------------------------------------- if __name__ == "__main__": unittest.main() From c62947e018245b479865098cbaaaae78dd9d719d Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 15:58:51 +0800 Subject: [PATCH 11/31] Add info command. Use chirptext's CLIApp --- jamdict/tools.py | 113 ++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 45 deletions(-) diff --git a/jamdict/tools.py b/jamdict/tools.py index a2c1238..e465d7a 100755 --- a/jamdict/tools.py +++ b/jamdict/tools.py @@ -37,52 +37,71 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -import sys import os -import logging -import argparse + +from chirptext import confirm, TextReport, Timer +from chirptext.cli import CLIApp, setup_logging + from jamdict import Jamdict +from jamdict import config # ------------------------------------------------------------------------------- # Configuration # ------------------------------------------------------------------------------- -DEFAULT_HOME = os.path.abspath(os.path.expanduser('~/local/jamdict')) -DATA_FOLDER = os.path.join(os.environ.get('JAMDICT_HOME', DEFAULT_HOME), 'data') -JMD_XML = os.path.join(DATA_FOLDER, 'JMdict.xml') -KD2_XML = os.path.join(DATA_FOLDER, 'kanjidic2.xml') -JMD_DB = os.path.join(DATA_FOLDER, 'jamdict.db') - -def getLogger(): - return logging.getLogger(__name__) +JMD_XML = config.get_file('JMDICT_XML') +KD2_XML = config.get_file('KD2_XML') +JMD_DB = config.get_file('JAMDICT_DB') +setup_logging('logging.json', 'logs') # ------------------------------------------------------------------------------- # Functions # ------------------------------------------------------------------------------- -def get_jam(args): - if args.jdb == args.kd2 or not args.kd2: - jmd = Jamdict(db_file=args.jdb, jmd_xml_file=args.jmdxml, kd2_xml_file=args.kd2xml) - else: +def get_jam(cli, args): + if not args.jdb: + args.jdb = None + if args.kd2: + cli.logger.warning("Jamdict database location: {}".format(args.jdb)) + cli.logger.warning("Kanjidic2 database location: {}".format(args.kd2)) jmd = Jamdict(db_file=args.jdb, kd2_file=args.kd2, jmd_xml_file=args.jmdxml, kd2_xml_file=args.kd2xml) + else: + cli.logger.info("Using the same database for both JMDict and Kanjidic2") + jmd = Jamdict(db_file=args.jdb, kd2_file=args.jdb, jmd_xml_file=args.jmdxml, kd2_xml_file=args.kd2xml) + if jmd.kd2 is None: + cli.logger.warning("Kanjidic2 database could not be found") return jmd -def import_data(args): +def import_data(cli, args): + '''Import XML data into SQLite database''' + rp = TextReport() + t = Timer(report=rp) + db_loc = os.path.abspath(os.path.expanduser(args.jdb)) + rp.print("Jamdict DB location : {}".format(db_loc)) + rp.print("JMDict XML file location : {}".format(args.jmdxml)) + rp.print("Kanjidic2 XML file location: {}".format(args.kd2xml)) + jam = get_jam(cli, args) if args and (args.jdb or args.kd2): + if os.path.isfile(db_loc): + if not confirm("Database file exists. Do you want to overwite (This action cannot be undone! yes/no?) "): + cli.logger.warning("Program aborted.") + exit() + else: + os.unlink(db_loc) # perform input - jam = get_jam(args) - print("Importing data. This process may take very long time ...") + t.start("Creating Jamdict SQLite database. This process may take very long time ...") jam.import_data() - print("Done!") + t.stop() else: print("Database paths were not provided. Process aborted.") -def lookup(args): - jam = get_jam(args) +def lookup(cli, args): + '''Lookup words by kanji/kana''' + jam = get_jam(cli, args) results = jam.lookup(args.query) if args.format == 'json': print(results.to_json()) @@ -109,44 +128,48 @@ def lookup(args): print("Meanings:", ", ".join([m.value for m in rmg.meanings if not m.m_lang or m.m_lang == 'en'])) -# ------------------------------------------------------------------------------- -# MAIN -# ------------------------------------------------------------------------------- - -def main(): - '''Main entry of jamtk - ''' +def show_info(cli, args): + ''' Show jamdict configuration (data folder, configuration file location, etc.) ''' + print("Configuration location: {}".format(config._get_config_manager().locate_config())) + print("Jamdict DB location: {}".format(JMD_DB)) + print("JMDict XML file: {}".format(JMD_XML)) + print("KanjiDic2 XML file: {}".format(KD2_XML)) - # It's easier to create a user-friendly console application by using argparse - # See reference at the top of this script - parser = argparse.ArgumentParser(description="Jamdict toolkit") - # Positional argument(s) - task = parser.add_subparsers(help='Task to be done') +# ------------------------------------------------------------------------------- +# Main +# ------------------------------------------------------------------------------- - # Optional arguments +def add_data_config(parser): parser.add_argument('-j', '--jmdxml', help='Path to JMdict XML file', default=JMD_XML) parser.add_argument('-k', '--kd2xml', help='Path to KanjiDic2 XML file', default=KD2_XML) parser.add_argument('-J', '--jdb', help='Path to JMDict SQLite file', default=JMD_DB) - parser.add_argument('-K', '--kd2', help='Path to KanjiDic2 SQLite file', default=JMD_DB) + parser.add_argument('-K', '--kd2', help='Path to KanjiDic2 SQLite file', default=None) + + +def main(): + '''Main entry of jamtk + ''' + app = CLIApp(desc='Jamdict toolkit', logger=__name__) + add_data_config(app.parser) # import task - import_task = task.add_parser('import', help='Import XML data into SQLite database') - import_task.set_defaults(func=import_data) + import_task = app.add_task('import', func=import_data) + add_data_config(import_task) + + # show info + info_task = app.add_task('info', func=show_info) + add_data_config(info_task) # look up task - lookup_task = task.add_parser('lookup', help='Lookup words by kanji/kana') + lookup_task = app.add_task('lookup', func=lookup) lookup_task.add_argument('query', help='kanji/kana') lookup_task.add_argument('-f', '--format', help='json or text') lookup_task.set_defaults(func=lookup) + add_data_config(lookup_task) - # Main script - if len(sys.argv) == 1: - # User didn't pass any value in, show help - parser.print_help() - else: - args = parser.parse_args() - args.func(args) + # run app + app.run() if __name__ == "__main__": From 446af90242289a74b9e862cb7e5b2431dff9df16 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 16:09:59 +0800 Subject: [PATCH 12/31] Allow to switch off auto_config for path --- jamdict/util.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jamdict/util.py b/jamdict/util.py index 54f6c7b..6b502b3 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -88,16 +88,16 @@ def __init__(self, data_source, setup_script=None, setup_file=None): class Jamdict(object): - def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None): + def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None, auto_config=True): # file paths configuration - self.db_file = db_file if db_file else config.get_file('JAMDICT_DB') - self.kd2_file = kd2_file if kd2_file else config.get_file('JAMDICT_DB') + self.db_file = db_file if db_file else config.get_file('JAMDICT_DB') if auto_config else None + self.kd2_file = kd2_file if kd2_file else config.get_file('JAMDICT_DB') if auto_config else None if not self.db_file: getLogger().warning("JAMDICT_DB could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first") if not self.kd2_file: getLogger().warning("Kanjidic2 database could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first") - self.jmd_xml_file = jmd_xml_file if jmd_xml_file else config.get_file('JMDICT_XML') - self.kd2_xml_file = kd2_xml_file if kd2_xml_file else config.get_file('KD2_XML') + self.jmd_xml_file = jmd_xml_file if jmd_xml_file else config.get_file('JMDICT_XML') if auto_config else None + self.kd2_xml_file = kd2_xml_file if kd2_xml_file else config.get_file('KD2_XML') if auto_config else None # data sources self._db_sqlite = None self._kd2_sqlite = None From abdd6d5ce326652484eff032bf06da677eae0d59 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 16:10:22 +0800 Subject: [PATCH 13/31] hide debug message --- jamdict/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jamdict/tools.py b/jamdict/tools.py index e465d7a..6c0825b 100755 --- a/jamdict/tools.py +++ b/jamdict/tools.py @@ -68,7 +68,7 @@ def get_jam(cli, args): cli.logger.warning("Kanjidic2 database location: {}".format(args.kd2)) jmd = Jamdict(db_file=args.jdb, kd2_file=args.kd2, jmd_xml_file=args.jmdxml, kd2_xml_file=args.kd2xml) else: - cli.logger.info("Using the same database for both JMDict and Kanjidic2") + cli.logger.debug("Using the same database for both JMDict and Kanjidic2") jmd = Jamdict(db_file=args.jdb, kd2_file=args.jdb, jmd_xml_file=args.jmdxml, kd2_xml_file=args.kd2xml) if jmd.kd2 is None: cli.logger.warning("Kanjidic2 database could not be found") From 50a9dec92a027696b71f5279f23e03058511c77b Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 16:10:44 +0800 Subject: [PATCH 14/31] Use python3 --- test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.sh b/test.sh index 5936b79..0c15d92 100755 --- a/test.sh +++ b/test.sh @@ -1,3 +1,3 @@ #!/bin/bash -python -m unittest discover +python3 -m unittest discover From ebd0800e86231d206eef4c25be23a4fd15ac684f Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 16:10:52 +0800 Subject: [PATCH 15/31] Use manual config --- test/test_jamdict.py | 2 +- test/test_jmdict_sqlite.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_jamdict.py b/test/test_jamdict.py index aff81b4..c44b46d 100644 --- a/test/test_jamdict.py +++ b/test/test_jamdict.py @@ -141,7 +141,7 @@ def test_kanjidic2_json(self): def test_jamdict_xml(self): print("Test Jamdict search in XML files") - jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2) + jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, auto_config=False) result = jam.lookup('おみやげ') self.assertEqual(len(result.entries), 1) self.assertEqual(len(result.chars), 2) diff --git a/test/test_jmdict_sqlite.py b/test/test_jmdict_sqlite.py index 850ef6c..ee93a82 100644 --- a/test/test_jmdict_sqlite.py +++ b/test/test_jmdict_sqlite.py @@ -112,7 +112,7 @@ def test_xml2ramdb(self): def test_import_function(self): print("Testing JMDict import function") - jd = Jamdict(MINI_JMD, RAM_DB) + jd = Jamdict(db_file=RAM_DB, jmd_xml_file=MINI_JMD, auto_config=False) jd.import_data() def test_search(self): From de30fcf66219129da017c1e0cf30aa0ceab510ef Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 16:11:09 +0800 Subject: [PATCH 16/31] Add logging.json for test scripts --- test/logging.json | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 test/logging.json diff --git a/test/logging.json b/test/logging.json new file mode 100644 index 0000000..5491324 --- /dev/null +++ b/test/logging.json @@ -0,0 +1,54 @@ +{ + "version": 1, + "disable_existing_loggers": false, + "formatters": { + "simple": { + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + } + }, + + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": "DEBUG", + "formatter": "simple", + "stream": "ext://sys.stdout" + }, + + "file_handler_important": { + "class": "logging.handlers.RotatingFileHandler", + "level": "WARNING", + "formatter": "simple", + "filename": "test/logs/logging_important.log", + "maxBytes": 1000000, + "backupCount": 20, + "encoding": "utf8" + }, + + "file_handler_verbose": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "simple", + "filename": "test/logs/logging_details.log", + "maxBytes": 1000000, + "backupCount": 20, + "encoding": "utf8" + } + + }, + + "loggers": { + "__main__": { + "level": "INFO", + "handlers": ["file_handler_verbose"], + "propagate": "no" + } + ,"test": { "level": "INFO" } + }, + + "root": { + "level": "WARNING", + "handlers": ["console", "file_handler_important"], + "propagate": "no" + } +} From cee648281d2380545caaffd559fef47ad008fea1 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 17:09:56 +0800 Subject: [PATCH 17/31] Enhance Entry to string --- jamdict/jmdict.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/jamdict/jmdict.py b/jamdict/jmdict.py index 1c8a68b..dab23ad 100644 --- a/jamdict/jmdict.py +++ b/jamdict/jmdict.py @@ -82,18 +82,28 @@ def set_info(self, info): logging.warning("WARNING: multiple info tag") self.info = info - def __repr__(self): - tmp = ['ID:%s' % self.idseq] + def text(self, compact=True, separator=' '): + tmp = [] + if not compact: + tmp.append('ID:%s' % self.idseq) if self.kana_forms: tmp.append(self.kana_forms[0].text) if self.kanji_forms: - tmp.append(self.kanji_forms[0].text) - for sense, idx in zip(self.senses, range(len(self.senses))): - tmp.append('{i}. {s}'.format(i=idx + 1, s=sense)) - return '|'.join(tmp) + tmp.append("({})".format(self.kanji_forms[0].text)) + if self.senses: + tmp.append(':') + if len(self.senses) == 1: + tmp.append(self.senses[0].text(compact=compact)) + else: + for sense, idx in zip(self.senses, range(len(self.senses))): + tmp.append('{i}. {s}'.format(i=idx + 1, s=sense.text(compact=compact))) + return separator.join(tmp) + + def __repr__(self): + return self.text(compact=True) def __str__(self): - return repr(self) + return self.text(compact=False) def to_json(self): ed = {'idseq': self.idseq, From cb6baa34aa184f5e64d837fd1f6a6340c7e4c6ae Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 17:10:11 +0800 Subject: [PATCH 18/31] Enhance character to string --- jamdict/kanjidic2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/jamdict/kanjidic2.py b/jamdict/kanjidic2.py index e9a25a9..b4b9f63 100644 --- a/jamdict/kanjidic2.py +++ b/jamdict/kanjidic2.py @@ -125,7 +125,12 @@ def __init__(self): self.nanoris = [] # a list of strings def __repr__(self): - return "{l}:{sc}".format(l=self.literal, sc=self.stroke_count) + meanings = [] + for rm in self.rm_groups: + for m in rm.meanings: + if m.m_lang == '': + meanings.append(m.value) + return "{l}:{sc}:{meanings}".format(l=self.literal, sc=self.stroke_count, meanings=','.join(meanings)) def __str__(self): return self.literal From 5f99711aaa83bb02d8886eb29e00ee84263f0cca Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 17:10:21 +0800 Subject: [PATCH 19/31] Hide debug msg --- jamdict/kanjidic2_sqlite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jamdict/kanjidic2_sqlite.py b/jamdict/kanjidic2_sqlite.py index f7f0501..5470668 100644 --- a/jamdict/kanjidic2_sqlite.py +++ b/jamdict/kanjidic2_sqlite.py @@ -196,7 +196,7 @@ def get_char(self, literal, ctx=None): # context was ensured c = ctx.char.select_single('literal=?', (literal,)) if not c: - getLogger().info("character {} could not be found".format(literal)) + getLogger().debug("character {} could not be found".format(literal)) return None else: return self.char_by_id(c.ID, ctx) From 3bea75f64a036fa11cc8d2ac1fe0d77f446a20a2 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 17:10:36 +0800 Subject: [PATCH 20/31] Better result output (notice user when nothing was found) --- jamdict/tools.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/jamdict/tools.py b/jamdict/tools.py index 6c0825b..bf73033 100755 --- a/jamdict/tools.py +++ b/jamdict/tools.py @@ -99,13 +99,8 @@ def import_data(cli, args): print("Database paths were not provided. Process aborted.") -def lookup(cli, args): - '''Lookup words by kanji/kana''' - jam = get_jam(cli, args) - results = jam.lookup(args.query) - if args.format == 'json': - print(results.to_json()) - else: +def dump_result(results): + if results.entries: print("=" * 40) print("Found entries") print("=" * 40) @@ -117,6 +112,9 @@ def lookup(cli, args): for idx, s in enumerate(e.senses): print("{idx}. {s}".format(idx=idx + 1, s=s)) print('') + else: + print("No dictionary entry was found.") + if results.chars: print("=" * 40) print("Found characters") print("=" * 40) @@ -126,14 +124,35 @@ def lookup(cli, args): for rmg in c.rm_groups: print("Readings:", ", ".join([r.value for r in rmg.readings])) print("Meanings:", ", ".join([m.value for m in rmg.meanings if not m.m_lang or m.m_lang == 'en'])) + else: + print("No character was found.") + + +def lookup(cli, args): + '''Lookup words by kanji/kana''' + jam = get_jam(cli, args) + results = jam.lookup(args.query) + if args.format == 'json': + print(results.to_json()) + else: + if args.compact: + print(results.text(separator='\n------\n', entry_sep='\n')) + else: + dump_result(results) + + +def file_status(file_path): + real_path = os.path.abspath(os.path.expanduser(file_path)) + return '[NOT FOUND]' if not os.path.isfile(real_path) else '[OK]' def show_info(cli, args): ''' Show jamdict configuration (data folder, configuration file location, etc.) ''' print("Configuration location: {}".format(config._get_config_manager().locate_config())) - print("Jamdict DB location: {}".format(JMD_DB)) - print("JMDict XML file: {}".format(JMD_XML)) - print("KanjiDic2 XML file: {}".format(KD2_XML)) + print("-" * 40) + print("Jamdict DB location : {} - {}".format(args.jdb, file_status(args.jdb))) + print("JMDict XML file : {} - {}".format(args.jmdxml, file_status(args.jmdxml))) + print("KanjiDic2 XML file : {} - {}".format(args.kd2xml, file_status(args.kd2xml))) # ------------------------------------------------------------------------------- @@ -165,6 +184,7 @@ def main(): lookup_task = app.add_task('lookup', func=lookup) lookup_task.add_argument('query', help='kanji/kana') lookup_task.add_argument('-f', '--format', help='json or text') + lookup_task.add_argument('--compact', action='store_true') lookup_task.set_defaults(func=lookup) add_data_config(lookup_task) From 1cc2edf889ede5c55cd94d093dec0bb3a14544f4 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 17:11:01 +0800 Subject: [PATCH 21/31] Better LookupResult to string --- jamdict/util.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/jamdict/util.py b/jamdict/util.py index 6b502b3..a2b72d6 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -75,6 +75,29 @@ def __init__(self, entries, chars): self.entries = entries if entries else [] self.chars = chars if chars else [] + def text(self, compact=True, entry_sep='。', separator=' | '): + output = [] + if self.entries: + entries_txt = str(entry_sep.join(e.text(compact=compact, separator='') for e in self.entries)) + output.append("Entries: ") + output.append(entries_txt) + if self.entries: + if compact: + chars_txt = ', '.join(str(c) for c in self.chars) + else: + chars_txt = ', '.join(repr(c) for c in self.chars) + if output: + output.append(separator) + output.append("Chars: ") + output.append(chars_txt) + return "".join(output) if output else "Found nothing" + + def __repr__(self): + return self.text(compact=True) + + def __str__(self): + return self.text(compact=False) + def to_json(self): return {'entries': [e.to_json() for e in self.entries], 'chars': [c.to_json() for c in self.chars]} From 886d912c31cdb682bc30cdc8b19c1ecb807bbbf4 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 17:11:15 +0800 Subject: [PATCH 22/31] Test lookup result to string --- test/test_jamdict.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/test_jamdict.py b/test/test_jamdict.py index c44b46d..f04452a 100644 --- a/test/test_jamdict.py +++ b/test/test_jamdict.py @@ -89,6 +89,11 @@ def test_basic_models(self): self.assertEqual(str(e[0].text()), 'repetition mark in katakana') # compact is enabled by default self.assertEqual(str(e[0].gloss[0]), 'repetition mark in katakana') + def test_lookup_result(self): + jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, auto_config=False) + result = jam.lookup('おみやげ') + print(repr(result)) + class TestJamdictXML(unittest.TestCase): From aefa4ff76ec255a6c8ed72e215987955256b9c8c Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 22:53:42 +0800 Subject: [PATCH 23/31] Update README with more examples and download links --- README.md | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 609e0e5..04a5a22 100644 --- a/README.md +++ b/README.md @@ -12,28 +12,61 @@ Python library for manipulating Jim Breen's JMdict & KanjiDic2 pip install jamdict # pip script sometimes doesn't work properly, so you may want to try this instead python3 -m pip install jamdict + +# initial setup (this command will create ~/.jamdict for you +# it will also tell you where to copy the data files +python3 -m jamdict.tools info + +# to look up a word using command line +python3 -m jamdict.tools lookup たべる +======================================== +Found entries +======================================== +Entry: 1358280 | Kj: 食べる, 喰べる | Kn: たべる +-------------------- +1. to eat ((Ichidan verb|transitive verb)) +2. to live on (e.g. a salary)/to live off/to subsist on + +======================================== +Found characters +======================================== +Char: 食 | Strokes: 9 +-------------------- +Readings: shi2, si4, sig, sa, 식, 사, Thực, Tự, ショク, ジキ, く.う, く.らう, た.べる, は.む +Meanings: eat, food +Char: 喰 | Strokes: 12 +-------------------- +Readings: shi2, si4, sig, 식, Thặc, Thực, Tự, く.う, く.らう +Meanings: eat, drink, receive (a blow), (kokuji) ``` ## Data -XML files (JMdict_e.xml, kanjidic2.xml) must be downloaded from JMdict home page and copy into `~/local/jamdict/data` +XML files (JMdict_e.xml, kanjidic2.xml) must be downloaded and copy into `~/.jamdict/data` + +I have mirrored these files to Google Drive so you can download there too: +[https://drive.google.com/drive/folders/1z4zF9ImZlNeTZZplflvvnpZfJp3WVLPk](https://drive.google.com/drive/folders/1z4zF9ImZlNeTZZplflvvnpZfJp3WVLPk) + +Official website +- JMdict: [http://edrdg.org/jmdict/edict_doc.html](http://edrdg.org/jmdict/edict_doc.html) +- kanjidic2: [http://www.edrdg.org/kanjidic/kanjd2index.html](http://www.edrdg.org/kanjidic/kanjd2index.html) +- KRADFILE: [http://www.edrdg.org/krad/kradinf.html](http://www.edrdg.org/krad/kradinf.html) -Read more about JMdict here: http://www.csse.monash.edu.au/~jwb/edict.html # Sample codes ```python >>> from jamdict import Jamdict ->>> jmd = Jamdict("/home/tuananh/local/jamdict/data/jamdict.db") +>>> jmd = Jamdict() >>> jmd.lookup('食べる') - +'Entries: たべる(食べる):1. to eat2. to live on (e.g. a salary)/to live off/to subsist on | Chars: 食, 喰' >>> result = jmd.lookup('食べる') >>> print(result.entries) -[ID:1358280|たべる|食べる|1. to eat ((Ichidan verb|transitive verb))|2. to live on (e.g. a salary)/to live off/to subsist on] +[たべる (食べる) : 1. to eat 2. to live on (e.g. a salary)/to live off/to subsist on] >>> for c in result.chars: ... print(c, c.rm_groups) ... -喰 [R: shi2, si4, sig, 식, Thặcÿ, Thựcÿ, Tự,ÿ く.う, く.らう | M: eat, drink, receive (a blow), (kokuji)] 食 [R: shi2, si4, sig, sa, 식, 사, Thực, Tự, ショク, ジキ, く.う, く.らう, た.べる, は.む | M: eat, food, manger, nourriture, alimento, comida, eclipse, comer, comer, comida, alimento] +喰 [R: shi2, si4, sig, 식, Thặc, Thực, Tự, く.う, く.らう | M: eat, drink, receive (a blow), (kokuji)] ``` See `jamdict_demo.py` and `jamdict/tools.py` for more information. From bbc2068b222baa7384cdae24530fdc77034a693b Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 22:54:09 +0800 Subject: [PATCH 24/31] Add *args and **kwargs to constructors --- jamdict/jmdict_sqlite.py | 21 ++++++++++++++++----- jamdict/kanjidic2_sqlite.py | 8 ++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py index 3c77652..38a510d 100644 --- a/jamdict/jmdict_sqlite.py +++ b/jamdict/jmdict_sqlite.py @@ -80,8 +80,8 @@ class JMDictSchema(Schema): KEY_JMD_VER = "jmdict.version" KEY_JMD_URL = "jmdict.url" - def __init__(self, data_source=":memory:", setup_script=None, setup_file=None): - super().__init__(data_source, setup_script=setup_script, setup_file=setup_file) + def __init__(self, data_source=":memory:", setup_script=None, setup_file=None, *args, **kwargs): + super().__init__(data_source, setup_script=setup_script, setup_file=setup_file, *args, **kwargs) self.add_script(SETUP_SCRIPT) self.add_file(JMDICT_SETUP_FILE) # Meta @@ -117,8 +117,8 @@ def __init__(self, data_source=":memory:", setup_script=None, setup_file=None): class JMDictSQLite(JMDictSchema): - def __init__(self, db_path, setup_script=None, setup_file=None): - super().__init__(db_path, setup_script=setup_script, setup_file=setup_file) + def __init__(self, db_path, setup_script=None, setup_file=None, *args, **kwargs): + super().__init__(db_path, setup_script=setup_script, setup_file=setup_file, *args, **kwargs) def update_meta(self, version, url, ctx=None): # create a default context if none was provided @@ -146,8 +146,19 @@ def search(self, query, ctx=None): if ctx is None: with self.ctx() as ctx: return self.search(query, ctx=ctx) + where = "idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?)" + params = [query, query] + try: + if query.startswith('id#'): + query_int = int(query[3:]) + if query_int >= 0: + print("Searching by ID: {}".format(query_int)) + where = "idseq = ?" + params = [query_int] + except: + pass # else (a context is provided) - eids = self.Entry.select("idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?)", (query, query), ctx=ctx) + eids = self.Entry.select(where, params, ctx=ctx) entries = [] for e in eids: entries.append(self.get_entry(e.idseq, ctx=ctx)) diff --git a/jamdict/kanjidic2_sqlite.py b/jamdict/kanjidic2_sqlite.py index 5470668..0375852 100644 --- a/jamdict/kanjidic2_sqlite.py +++ b/jamdict/kanjidic2_sqlite.py @@ -78,8 +78,8 @@ class KanjiDic2Schema(Schema): KEY_DB_VER = 'kanjidic2.database_version' KEY_CREATED_DATE = 'kanjidic2.date_of_creation' - def __init__(self, data_source, setup_script=None, setup_file=None): - super().__init__(data_source, setup_script=setup_script, setup_file=setup_file) + def __init__(self, data_source, setup_script=None, setup_file=None, *args, **kwargs): + super().__init__(data_source, setup_script=setup_script, setup_file=setup_file, *args, **kwargs) self.add_file(KANJIDIC2_SETUP_FILE) self.add_script(KANJIDIC2_SETUP_SCRIPT) # Meta @@ -100,8 +100,8 @@ def __init__(self, data_source, setup_script=None, setup_file=None): class KanjiDic2SQLite(KanjiDic2Schema): - def __init__(self, db_path, setup_script=None, setup_file=None): - super().__init__(db_path, setup_script=setup_script, setup_file=setup_file) + def __init__(self, db_path, setup_script=None, setup_file=None, *args, **kwargs): + super().__init__(db_path, setup_script=setup_script, setup_file=setup_file, *args, **kwargs) def update_meta(self, file_version, database_version, date_of_creation, ctx=None): # ensure context From 1d98b9f428d4942fd0c24d51c9fe8b61080106db Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 22:55:10 +0800 Subject: [PATCH 25/31] Enhance entry to string --- jamdict/jmdict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jamdict/jmdict.py b/jamdict/jmdict.py index dab23ad..20a50b4 100644 --- a/jamdict/jmdict.py +++ b/jamdict/jmdict.py @@ -85,7 +85,7 @@ def set_info(self, info): def text(self, compact=True, separator=' '): tmp = [] if not compact: - tmp.append('ID:%s' % self.idseq) + tmp.append('[id#%s]' % self.idseq) if self.kana_forms: tmp.append(self.kana_forms[0].text) if self.kanji_forms: From dfa37bcffcd5a00e5223ef056b9137434ddc75ed Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 22:55:37 +0800 Subject: [PATCH 26/31] User can switch off auto_expand path (e.g. for in-memory DB) --- jamdict/util.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/jamdict/util.py b/jamdict/util.py index a2b72d6..c45742d 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -52,8 +52,6 @@ import threading from collections import defaultdict as dd -from chirptext import io as chio - from . import config from .jmdict import JMDictXMLParser from .jmdict_sqlite import JMDictSQLite @@ -111,13 +109,14 @@ def __init__(self, data_source, setup_script=None, setup_file=None): class Jamdict(object): - def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None, auto_config=True): + def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None, auto_config=True, auto_expand=True): # file paths configuration + self.auto_expand = auto_expand self.db_file = db_file if db_file else config.get_file('JAMDICT_DB') if auto_config else None self.kd2_file = kd2_file if kd2_file else config.get_file('JAMDICT_DB') if auto_config else None - if not self.db_file: + if not self.db_file or not os.path.isfile(self.db_file): getLogger().warning("JAMDICT_DB could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first") - if not self.kd2_file: + if not self.kd2_file or os.path.isfile(self.kd2_file): getLogger().warning("Kanjidic2 database could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first") self.jmd_xml_file = jmd_xml_file if jmd_xml_file else config.get_file('JMDICT_XML') if auto_config else None self.kd2_xml_file = kd2_xml_file if kd2_xml_file else config.get_file('KD2_XML') if auto_config else None @@ -127,17 +126,28 @@ def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file= self._jmd_xml = None self._kd2_xml = None + @property + def db_file(self): + return self.__db_file + + @db_file.setter + def db_file(self, value): + if self.auto_expand and value: + self.__db_file = os.path.abspath(os.path.expanduser(value)) + else: + self.__db_file = None + @property def jmdict(self): if not self._db_sqlite and self.db_file: with threading.Lock(): if not self.kd2_file or self.kd2_file == self.db_file: # Use 1 DB for both - self._db_sqlite = JamdictSQLite(self.db_file) + self._db_sqlite = JamdictSQLite(self.db_file, auto_expand_path=self.auto_expand) self._kd2_sqlite = self._db_sqlite else: # use 2 separated files - self._db_sqlite = JMDictSQLite(self.db_file) + self._db_sqlite = JMDictSQLite(self.db_file, auto_expand_path=self.auto_expand) return self._db_sqlite @property @@ -145,7 +155,7 @@ def kd2(self): if self._kd2_sqlite is None: if self.kd2_file is not None: with threading.Lock(): - self._kd2_sqlite = KanjiDic2SQLite(self.kd2_file) + self._kd2_sqlite = KanjiDic2SQLite(self.kd2_file, auto_expand_path=self.auto_expand) else: self._kd2_sqlite = self.jmdict return self._kd2_sqlite @@ -204,7 +214,7 @@ def lookup(self, query): # Lookup words entries = [] chars = [] - if self.jmdict: + if self.jmdict is not None: entries = self.jmdict.search(query) elif self.jmdict_xml: entries = self.jmdict_xml.lookup(query) @@ -227,7 +237,7 @@ class JMDictXML(object): ''' def __init__(self, entries): self.entries = entries - self._seqmap = {} + self._seqmap = {} # entryID - entryObj map self._textmap = dd(set) # compile map for entry in self.entries: @@ -246,10 +256,12 @@ def __getitem__(self, idx): def lookup(self, a_query): if a_query in self._textmap: return tuple(self._textmap[a_query]) - elif a_query in self._seqmap: - return (self._seqmap[a_query],) - else: - return () + elif a_query.startswith('id#'): + entry_id = a_query[3:] + if entry_id in self._seqmap: + return (self._seqmap[entry_id],) + # found nothing + return () @staticmethod def from_file(filename): From 9753972b2f89457fc50398b1e22958ab5b6de978 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 22:56:00 +0800 Subject: [PATCH 27/31] Update demo script --- jamdict_demo.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/jamdict_demo.py b/jamdict_demo.py index 7a1504d..3bee9b8 100755 --- a/jamdict_demo.py +++ b/jamdict_demo.py @@ -53,13 +53,9 @@ ######################################################################## -# Path to JMDict XML file -# You might want to change this to something like -# JM_PATH = "/path/to/JMdict.xml" -JM_PATH = os.path.abspath('data/JMdict_mini.xml') - # Create an instance of Jamdict -jam = Jamdict(jmd_xml_file=JM_PATH) +jam = Jamdict() +print("Jamdict DB file: {}".format(jam.db_file)) # Lookup by kana result = jam.lookup('おかえし') @@ -75,7 +71,7 @@ # lookup entry by idseq -otenki = jam.lookup('1002470').entries[0] +otenki = jam.lookup('id#1002470').entries[0] kana_forms = ' '.join([x.text for x in otenki.kana_forms]) kanji_forms = ' '.join([x.text for x in otenki.kanji_forms]) print("Entry #{id}: Kanji: {kj} - Kana: {kn}".format(id=otenki.idseq, kj=kanji_forms, kn=kana_forms)) From 78c820af642ac558803f1350c25b1907709c1b88 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 22:56:10 +0800 Subject: [PATCH 28/31] Fix test cases --- test/test_jamdict.py | 8 +++++++- test/test_jmdict_sqlite.py | 7 +++---- test/test_kanjidic2_sqlite.py | 5 ++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/test/test_jamdict.py b/test/test_jamdict.py index f04452a..c4101c4 100644 --- a/test/test_jamdict.py +++ b/test/test_jamdict.py @@ -90,9 +90,15 @@ def test_basic_models(self): self.assertEqual(str(e[0].gloss[0]), 'repetition mark in katakana') def test_lookup_result(self): - jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, auto_config=False) + jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, auto_config=False, auto_expand=False) result = jam.lookup('おみやげ') print(repr(result)) + self.assertTrue(result.entries) + self.assertEqual(result.entries[0].kana_forms[0].text, 'おみやげ') + # test lookup by ID + res = jam.lookup('id#{}'.format(1002490)) + self.assertTrue(res.entries) + self.assertEqual(res.entries[0].kana_forms[0].text, 'おとそ') class TestJamdictXML(unittest.TestCase): diff --git a/test/test_jmdict_sqlite.py b/test/test_jmdict_sqlite.py index ee93a82..5eabbdd 100644 --- a/test/test_jmdict_sqlite.py +++ b/test/test_jmdict_sqlite.py @@ -63,7 +63,6 @@ if not os.path.isdir(TEST_DATA): os.makedirs(TEST_DATA) TEST_DB = os.path.join(TEST_DATA, 'test.db') -RAM_DB = ':memory:' MINI_JMD = os.path.join(TEST_DATA, 'JMdict_mini.xml') @@ -79,7 +78,7 @@ class TestJamdictSQLite(unittest.TestCase): db = JMDictSQLite(TEST_DB) xdb = JMDictXML.from_file(MINI_JMD) - ramdb = JMDictSQLite(RAM_DB) + ramdb = JMDictSQLite(":memory:", auto_expand_path=False) @classmethod def setUpClass(cls): @@ -106,13 +105,13 @@ def test_xml2sqlite(self): def test_xml2ramdb(self): print("Testing XML to RAM") noe = len(self.xdb) - with self.ramdb.ds.open() as ctx: + with self.ramdb.ctx() as ctx: self.ramdb.insert_entries(self.xdb, ctx=ctx) self.assertEqual(len(self.ramdb.Entry.select(ctx=ctx)), noe) def test_import_function(self): print("Testing JMDict import function") - jd = Jamdict(db_file=RAM_DB, jmd_xml_file=MINI_JMD, auto_config=False) + jd = Jamdict(db_file=":memory:", jmd_xml_file=MINI_JMD, auto_config=False, auto_expand=False) jd.import_data() def test_search(self): diff --git a/test/test_kanjidic2_sqlite.py b/test/test_kanjidic2_sqlite.py index b4b1988..58cf20f 100644 --- a/test/test_kanjidic2_sqlite.py +++ b/test/test_kanjidic2_sqlite.py @@ -61,7 +61,6 @@ if not os.path.isdir(TEST_DATA): os.makedirs(TEST_DATA) TEST_DB = os.path.join(TEST_DATA, 'jamcha.db') -RAM_DB = ':memory:' MINI_KD2 = os.path.join(TEST_DATA, 'kanjidic2_mini.xml') @@ -76,7 +75,7 @@ def getLogger(): class TestJamdictSQLite(unittest.TestCase): db = KanjiDic2SQLite(TEST_DB) - ramdb = KanjiDic2SQLite(RAM_DB) + ramdb = KanjiDic2SQLite(":memory:", auto_expand_path=False) xdb = KanjiDic2XML.from_file(MINI_KD2) @classmethod @@ -87,7 +86,7 @@ def setUpClass(cls): def test_xml2sqlite(self): print("Test KanjiDic2 - XML to SQLite DB in RAM") - print(len(self.xdb)) + getLogger().info("Testing using {} test characters".format(len(self.xdb))) db = self.ramdb with db.ctx() as ctx: fv = self.xdb.kd2.file_version From b35ef042767d8a2cd902e65a6f3736aa5ccd060a Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 22:56:56 +0800 Subject: [PATCH 29/31] Add *args & **kwargs to JamdictSQLite's constructor --- jamdict/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jamdict/util.py b/jamdict/util.py index c45742d..e09089f 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -103,8 +103,8 @@ def to_json(self): class JamdictSQLite(KanjiDic2SQLite, JMDictSQLite): - def __init__(self, data_source, setup_script=None, setup_file=None): - super().__init__(data_source, setup_script=setup_script, setup_file=setup_file) + def __init__(self, data_source, setup_script=None, setup_file=None, *args, **kwargs): + super().__init__(data_source, setup_script=setup_script, setup_file=setup_file, *args, **kwargs) class Jamdict(object): From b0e6c0af619571b11e86d7c20025d558e1af501c Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 23:01:44 +0800 Subject: [PATCH 30/31] Add project's homepage to README file --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 04a5a22..b34a7cd 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ Python library for manipulating Jim Breen's JMdict & KanjiDic2 # Installation +Homepage: [https://github.com/neocl/jamdict](https://github.com/neocl/jamdict) + ```bash pip install jamdict # pip script sometimes doesn't work properly, so you may want to try this instead From a5f5c70e081b09936083bfb162bcd1fdedecd272 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 16 Apr 2018 23:01:55 +0800 Subject: [PATCH 31/31] Bump version to 0.1a3 --- jamdict/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jamdict/__version__.py b/jamdict/__version__.py index 6da97c6..e6b8558 100644 --- a/jamdict/__version__.py +++ b/jamdict/__version__.py @@ -10,6 +10,6 @@ __url__ = "https://github.com/neocl/jamdict" __maintainer__ = "Le Tuan Anh" __version_major__ = "0.1" -__version__ = "{}a2".format(__version_major__) +__version__ = "{}a3".format(__version_major__) __version_long__ = "{} - Alpha".format(__version_major__) __status__ = "Prototype"