Merge pull request #7 from neocl/dev

Version 0.1a3 is ready
neocl · Apr 16, 2018 · 8b30d90 · 8b30d90
2 parents cf7cce7 + a5f5c70
commit 8b30d90
Show file tree

Hide file tree

Showing 27 changed files with 673 additions and 313 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,4 +1,5 @@
 include README.rst
 include CHANGES.md
 include LICENSE
-recursive-include jamdict/scripts/ *.sql
+recursive-include jamdict/data/ *.sql
+recursive-include jamdict/data/ *.json
diff --git a/README.md b/README.md
@@ -8,32 +8,67 @@ Python library for manipulating Jim Breen's JMdict & KanjiDic2
 
 # Installation
 
+Homepage: [https://github.com/neocl/jamdict](https://github.com/neocl/jamdict)
+
 ```bash
 pip install jamdict
 # pip script sometimes doesn't work properly, so you may want to try this instead
 python3 -m pip install jamdict
+
+# initial setup (this command will create ~/.jamdict for you;
+# it will also tell you where to copy the data files)
+python3 -m jamdict.tools info
+
+# to look up a word using command line
+python3 -m jamdict.tools lookup たべる
+========================================
+Found entries
+========================================
+Entry: 1358280 | Kj:  食べる, 喰べる | Kn: たべる
+--------------------
+1. to eat ((Ichidan verb|transitive verb))
+2. to live on (e.g. a salary)/to live off/to subsist on
+
+========================================
+Found characters
+========================================
+Char: 食 | Strokes: 9
+--------------------
+Readings: shi2, si4, sig, sa, 식, 사, Thực, Tự, ショク, ジキ, く.う, く.らう, た.べる, は.む
+Meanings: eat, food
+Char: 喰 | Strokes: 12
+--------------------
+Readings: shi2, si4, sig, 식, Thặc, Thực, Tự, く.う, く.らう
+Meanings: eat, drink, receive (a blow), (kokuji)
 ```
 
 ## Data
-XML files (JMdict_e.xml, kanjidic2.xml) must be downloaded from JMdict home page and copy into `~/local/jamdict/data`
+XML files (JMdict_e.xml, kanjidic2.xml) must be downloaded and copied into `~/.jamdict/data`.
+
+I have mirrored these files on Google Drive, so you can also download them from there:
+[https://drive.google.com/drive/folders/1z4zF9ImZlNeTZZplflvvnpZfJp3WVLPk](https://drive.google.com/drive/folders/1z4zF9ImZlNeTZZplflvvnpZfJp3WVLPk)
+
+Official websites:
+- JMdict: [http://edrdg.org/jmdict/edict_doc.html](http://edrdg.org/jmdict/edict_doc.html)
+- kanjidic2: [http://www.edrdg.org/kanjidic/kanjd2index.html](http://www.edrdg.org/kanjidic/kanjd2index.html)
+- KRADFILE: [http://www.edrdg.org/krad/kradinf.html](http://www.edrdg.org/krad/kradinf.html)
 
-Read more about JMdict here: http://www.csse.monash.edu.au/~jwb/edict.html
 
 # Sample codes
 
 ```python
 >>> from jamdict import Jamdict
->>> jmd = Jamdict("/home/tuananh/local/jamdict/data/jamdict.db")
+>>> jmd = Jamdict()
 >>> jmd.lookup('食べる')
-<jamdict.util.LookupResult object at 0x7fc70775a710>
+'Entries: たべる(食べる):1. to eat2. to live on (e.g. a salary)/to live off/to subsist on | Chars: 食, 喰'
 >>> result = jmd.lookup('食べる')
 >>> print(result.entries)
-[ID:1358280|たべる|食べる|1. to eat ((Ichidan verb|transitive verb))|2. to live on (e.g. a salary)/to live off/to subsist on]
+[たべる (食べる) : 1. to eat 2. to live on (e.g. a salary)/to live off/to subsist on]
 >>> for c in result.chars:
 ...     print(c, c.rm_groups)
 ... 
-喰 [R: shi2, si4, sig, 식, Thặcÿ, Thựcÿ, Tự,ÿ く.う, く.らう | M: eat, drink, receive (a blow), (kokuji)]
 食 [R: shi2, si4, sig, sa, 식, 사, Thực, Tự, ショク, ジキ, く.う, く.らう, た.べる, は.む | M: eat, food, manger, nourriture, alimento, comida, eclipse, comer, comer, comida, alimento]
+喰 [R: shi2, si4, sig, 식, Thặc, Thực, Tự, く.う, く.らう | M: eat, drink, receive (a blow), (kokuji)]
 ```
 
 See `jamdict_demo.py` and `jamdict/tools.py` for more information.
diff --git a/data/README.md b/data/README.md
@@ -1 +1 @@
-Copy JMDict dictionary file (JMdict_e.xml) here
+Copy dictionary files (JMdict_e.xml, kanjidic2.xml, kradfile, etc.) here
diff --git a/jamdict/__init__.py b/jamdict/__init__.py
@@ -44,26 +44,14 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
-
-__author__ = "Le Tuan Anh"
-__email__ = "[email protected]"
-__copyright__ = "Copyright 2016, jamdict"
-__credits__ = []
-__license__ = "MIT License"
-__description__ = "Python library for manipulating Jim Breen's JMdict & KanjiDic2"
-__url__ = "https://github.com/neocl/jamdict"
-__maintainer__ = "Le Tuan Anh"
-__version_major__ = "0.1"
-__version__ = "{}a1".format(__version_major__)
-__version_long__ = "{} - Alpha".format(__version_major__)
-__status__ = "Prototype"
-
 ########################################################################
 
+from .__version__ import __author__, __email__, __copyright__, __maintainer__
+from .__version__ import __credits__, __license__, __description__, __url__
+from .__version__ import __version_major__, __version_long__, __version__, __status__
+
 from .jmdict_sqlite import JMDictSQLite
 from .kanjidic2_sqlite import KanjiDic2SQLite
 from .util import Jamdict, JMDictXML, KanjiDic2XML
-
-########################################################################
-
-__all__ = ['Jamdict', 'JMDictSQLite', 'JMDictXML', 'KanjiDic2SQLite', 'KanjiDic2XML']
+__all__ = ['Jamdict', 'JMDictSQLite', 'JMDictXML', 'KanjiDic2SQLite', 'KanjiDic2XML',
+           "__version__", "__author__", "__description__", "__copyright__"]
diff --git a/jamdict/__version__.py b/jamdict/__version__.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+
+# jamdict's package version information
+__author__ = "Le Tuan Anh"
+__email__ = "[email protected]"
+__copyright__ = "Copyright (c) 2016, Le Tuan Anh"
+__credits__ = []
+__license__ = "MIT License"
+__description__ = "Python library for manipulating Jim Breen's JMdict & KanjiDic2"
+__url__ = "https://github.com/neocl/jamdict"
+__maintainer__ = "Le Tuan Anh"
+__version_major__ = "0.1"
+__version__ = "{}a3".format(__version_major__)
+__version_long__ = "{} - Alpha".format(__version_major__)
+__status__ = "Prototype"
diff --git a/jamdict/config.py b/jamdict/config.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+'''
+Jamdict configuration management
+
+Latest version can be found at https://github.com/neocl/jamdict
+
+@author: Le Tuan Anh <[email protected]>
+@license: MIT
+'''
+
+# Copyright (c) 2016, Le Tuan Anh <[email protected]>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+########################################################################
+
+import os
+import logging
+
+from chirptext import AppConfig
+from chirptext.io import read_file, write_file
+
+# ----------------------------------------------------------------------
+# Configuration
+# ----------------------------------------------------------------------
+
+MY_DIR = os.path.dirname(__file__)
+CONFIG_TEMPLATE = os.path.join(MY_DIR, 'data', 'config_template.json')
+__jamdict_home = os.environ.get('JAMDICT_HOME', MY_DIR)
+__app_config = AppConfig('jamdict', mode=AppConfig.JSON, working_dir=__jamdict_home)
+
+
+def getLogger():
+    return logging.getLogger(__name__)
+
+
+def _get_config_manager():
+    ''' Internal function for retrieving application config manager object
+    Don't use this directly, use read_config() method instead
+    '''
+    return __app_config
+
+
+def read_config():
+    if not __app_config.config and not __app_config.locate_config():
+        # need to create a config
+        config_dir = os.path.expanduser('~/.jamdict/')
+        if not os.path.exists(config_dir):
+            os.makedirs(config_dir)
+        cfg_loc = os.path.join(config_dir, 'config.json')
+        default_config = read_file(CONFIG_TEMPLATE)
+        getLogger().warning("Jamdict configuration file could not be found. A new configuration file will be generated at {}".format(cfg_loc))
+        getLogger().debug("Default config: {}".format(default_config))
+        write_file(cfg_loc, default_config)
+    # read config
+    config = __app_config.config
+    return config
+
+
+def home_dir():
+    _config = read_config()
+    return _config.get('JAMDICT_HOME', '.')
+
+
+def data_dir():
+    _config = read_config()
+    _data_dir = _config.get('JAMDICT_DATA', '{JAMDICT_HOME}/data').format(JAMDICT_HOME=home_dir())
+    return _data_dir
+
+
+def get_file(file_key):
+    _config = read_config()
+    _data_dir = data_dir()
+    return _config.get(file_key).format(JAMDICT_DATA=_data_dir)
diff --git a/jamdict/data/config_template.json b/jamdict/data/config_template.json
@@ -0,0 +1,8 @@
+{
+  "JAMDICT_HOME": "~/.jamdict",
+  "JAMDICT_DATA": "{JAMDICT_HOME}/data",
+  "JAMDICT_DB": "{JAMDICT_DATA}/jamdict.db",
+  "JMDICT_XML": "{JAMDICT_DATA}/JMdict_e.gz",
+  "KD2_XML": "{JAMDICT_DATA}/kanjidic2.xml.gz",
+  "KRADFILE": "{JAMDICT_DATA}/kradfile-u.gz"
+}
diff --git a/jamdict/scripts/setup_jmdict.sql → jamdict/data/setup_jmdict.sql b/jamdict/scripts/setup_jmdict.sql → jamdict/data/setup_jmdict.sql
diff --git a/jamdict/scripts/setup_kanjidic2.sql → jamdict/data/setup_kanjidic2.sql b/jamdict/scripts/setup_kanjidic2.sql → jamdict/data/setup_kanjidic2.sql
diff --git a/jamdict/jmdict.py b/jamdict/jmdict.py
@@ -50,6 +50,8 @@
 import logging
 from lxml import etree
 
+from chirptext import io as chio
+
 logger = logging.getLogger(__name__)
 
 ########################################################################
@@ -64,8 +66,8 @@ class JMDEntry(object):
     def __init__(self, idseq=''):
         # A unique numeric sequence number for each entry
         self.idseq = idseq     # ent_seq
-        self.kanji_forms = []  # k_ele*  => KanjiReading[]
-        self.kana_forms = []   # r_ele+  => KanaReading[]
+        self.kanji_forms = []  # k_ele*  => KanjiForm[]
+        self.kana_forms = []   # r_ele+  => KanaForm[]
         self.info = None       # info?   => EntryInfo
         self.senses = []       # sense+
 
@@ -80,18 +82,28 @@ def set_info(self, info):
             logging.warning("WARNING: multiple info tag")
         self.info = info
 
-    def __repr__(self):
-        tmp = ['ID:%s' % self.idseq]
+    def text(self, compact=True, separator=' '):
+        tmp = []
+        if not compact:
+            tmp.append('[id#%s]' % self.idseq)
         if self.kana_forms:
             tmp.append(self.kana_forms[0].text)
         if self.kanji_forms:
-            tmp.append(self.kanji_forms[0].text)
-        for sense, idx in zip(self.senses, range(len(self.senses))):
-            tmp.append('{i}. {s}'.format(i=idx + 1, s=sense))
-        return '|'.join(tmp)
+            tmp.append("({})".format(self.kanji_forms[0].text))
+        if self.senses:
+            tmp.append(':')
+            if len(self.senses) == 1:
+                tmp.append(self.senses[0].text(compact=compact))
+            else:
+                for sense, idx in zip(self.senses, range(len(self.senses))):
+                    tmp.append('{i}. {s}'.format(i=idx + 1, s=sense.text(compact=compact)))
+        return separator.join(tmp)
+
+    def __repr__(self):
+        return self.text(compact=True)
 
     def __str__(self):
-        return repr(self)
+        return self.text(compact=False)
 
     def to_json(self):
         ed = {'idseq': self.idseq,
@@ -103,7 +115,7 @@ def to_json(self):
         return ed
 
 
-class KanjiReading(object):
+class KanjiForm(object):
     ''' The kanji element, or in its absence, the reading element, is
     the defining component of each entry.
     The overwhelming majority of entries will have a single kanji
@@ -174,8 +186,14 @@ def to_json(self):
             kjd['pri'] = self.pri
         return kjd
 
+    def __repr__(self):
+        return str(self)
+
+    def __str__(self):
+        return self.text
 
-class KanaReading(object):
+
+class KanaForm(object):
     '''<!ELEMENT r_ele (reb, re_nokanji?, re_restr*, re_inf*, re_pri*)>
     The reading element typically contains the valid readings
     of the word(s) in the kanji element using modern kanadzukai.
@@ -228,6 +246,12 @@ def to_json(self):
             knd['pri'] = self.pri
         return knd
 
+    def __repr__(self):
+        return str(self)
+
+    def __str__(self):
+        return self.text
+
 
 class EntryInfo(object):
     '''General coded information relating to the entry as a whole.
@@ -375,8 +399,11 @@ def __repr__(self):
         return str(self)
 
     def __str__(self):
+        return self.text(compact=False)
+
+    def text(self, compact=True):
         tmp = [str(x) for x in self.gloss]
-        if self.pos:
+        if not compact and self.pos:
             return '{gloss} ({pos})'.format(gloss='/'.join(tmp), pos=('(%s)' % '|'.join(self.pos)))
         else:
             return '/'.join(tmp)
@@ -513,16 +540,18 @@ def __init__(self):
     def parse_file(self, jmdict_file_path):
         ''' Parse JMDict_e.xml file and return a list of JMDEntry objects
         '''
-        logger.debug('Loading data from file: %s' % (os.path.abspath(jmdict_file_path)))
-
-        tree = etree.iterparse(jmdict_file_path)
-        entries = []
-        for event, element in tree:
-            if event == 'end' and element.tag == 'entry':
-                entries.append(self.parse_entry_tag(element))
-                # and then we can clear the element to save memory
-                element.clear()
-        return entries
+        actual_path = os.path.abspath(os.path.expanduser(jmdict_file_path))
+        logger.debug('Loading data from file: {}'.format(actual_path))
+
+        with chio.open(actual_path, mode='rb') as jmfile:
+            tree = etree.iterparse(jmfile)
+            entries = []
+            for event, element in tree:
+                if event == 'end' and element.tag == 'entry':
+                    entries.append(self.parse_entry_tag(element))
+                    # and then we can clear the element to save memory
+                    element.clear()
+            return entries
 
     def parse_entry_tag(self, etag):
         '''Parse a lxml XML Node and generate a JMDEntry entry'''
@@ -559,7 +588,7 @@ def get_single(self, tag_name, a_tag):
             return children[0]
 
     def parse_k_ele(self, k_ele, entry):
-        kr = KanjiReading()
+        kr = KanjiForm()
         for child in k_ele:
             if child.tag == 'keb':
                 kr.set_text(child.text)
@@ -574,7 +603,7 @@ def parse_k_ele(self, k_ele, entry):
         return kr
 
     def parse_r_ele(self, r_ele, entry):
-        kr = KanaReading()
+        kr = KanaForm()
         for child in r_ele:
             if child.tag == 'reb':
                 kr.set_text(child.text)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		Copy JMDict dictionary file (JMdict_e.xml) here
		Copy dictionary files (JMdict_e.xml, kanjidic2.xml, kradfile, etc.) here