diff --git a/MANIFEST.in b/MANIFEST.in index 4853fac..6870176 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include README.rst include CHANGES.md include LICENSE +include requirements*.txt recursive-include jamdict/data/ *.sql recursive-include jamdict/data/ *.json diff --git a/README.md b/README.md index b34a7cd..08d64fa 100644 --- a/README.md +++ b/README.md @@ -49,9 +49,10 @@ I have mirrored these files to Google Drive so you can download there too: [https://drive.google.com/drive/folders/1z4zF9ImZlNeTZZplflvvnpZfJp3WVLPk](https://drive.google.com/drive/folders/1z4zF9ImZlNeTZZplflvvnpZfJp3WVLPk) Official website -- JMdict: [http://edrdg.org/jmdict/edict_doc.html](http://edrdg.org/jmdict/edict_doc.html) -- kanjidic2: [http://www.edrdg.org/kanjidic/kanjd2index.html](http://www.edrdg.org/kanjidic/kanjd2index.html) -- KRADFILE: [http://www.edrdg.org/krad/kradinf.html](http://www.edrdg.org/krad/kradinf.html) + +* JMdict: [http://edrdg.org/jmdict/edict_doc.html](http://edrdg.org/jmdict/edict_doc.html) +* kanjidic2: [http://www.edrdg.org/kanjidic/kanjd2index.html](http://www.edrdg.org/kanjidic/kanjd2index.html) +* KRADFILE: [http://www.edrdg.org/krad/kradinf.html](http://www.edrdg.org/krad/kradinf.html) # Sample codes diff --git a/jamdict/__init__.py b/jamdict/__init__.py index 23377be..aed1b60 100644 --- a/jamdict/__init__.py +++ b/jamdict/__init__.py @@ -46,6 +46,7 @@ ######################################################################## +from . import __version__ as version_info from .__version__ import __author__, __email__, __copyright__, __maintainer__ from .__version__ import __credits__, __license__, __description__, __url__ from .__version__ import __version_major__, __version_long__, __version__, __status__ @@ -54,4 +55,4 @@ from .kanjidic2_sqlite import KanjiDic2SQLite from .util import Jamdict, JMDictXML, KanjiDic2XML __all__ = ['Jamdict', 'JMDictSQLite', 'JMDictXML', 'KanjiDic2SQLite', 'KanjiDic2XML', - "__version__", "__author__", "__description__", "__copyright__"] + "__version__", "__author__", "__description__", "__copyright__", "version_info"] diff --git a/jamdict/__main__.py b/jamdict/__main__.py new file mode 100644 index 0000000..ceb1b26 --- /dev/null +++ b/jamdict/__main__.py @@ -0,0 +1,2 @@ +from . import tools +tools.main() diff --git a/jamdict/__version__.py b/jamdict/__version__.py index e6b8558..4f87ec4 100644 --- a/jamdict/__version__.py +++ b/jamdict/__version__.py @@ -10,6 +10,6 @@ __url__ = "https://github.com/neocl/jamdict" __maintainer__ = "Le Tuan Anh" __version_major__ = "0.1" -__version__ = "{}a3".format(__version_major__) +__version__ = "{}a4".format(__version_major__) __version_long__ = "{} - Alpha".format(__version_major__) __status__ = "Prototype" diff --git a/jamdict/config.py b/jamdict/config.py index ae16b61..dd11d19 100644 --- a/jamdict/config.py +++ b/jamdict/config.py @@ -35,7 +35,7 @@ import logging from chirptext import AppConfig -from chirptext.io import read_file, write_file +from chirptext.chio import read_file, write_file # ---------------------------------------------------------------------- # Configuration diff --git a/jamdict/data/setup_jmdict.sql b/jamdict/data/setup_jmdict.sql index 0757238..94c4427 100644 --- a/jamdict/data/setup_jmdict.sql +++ b/jamdict/data/setup_jmdict.sql @@ -1,6 +1,6 @@ /* Add meta info */ CREATE TABLE IF NOT EXISTS meta ( - key TEXT UNIQUE, + key TEXT PRIMARY KEY NOT NULL, value TEXT NOT NULL ); diff --git a/jamdict/jmdict.py b/jamdict/jmdict.py index 20a50b4..9c8d819 100644 --- a/jamdict/jmdict.py +++ b/jamdict/jmdict.py @@ -50,7 +50,7 @@ import logging from lxml import etree -from chirptext import io as chio +from chirptext import chio logger = logging.getLogger(__name__) @@ -82,9 +82,9 @@ def set_info(self, info): logging.warning("WARNING: multiple info tag") self.info = info - def text(self, compact=True, separator=' '): + def text(self, compact=True, separator=' ', no_id=False): tmp = [] - if not compact: + if not compact and not no_id: tmp.append('[id#%s]' % self.idseq) if self.kana_forms: tmp.append(self.kana_forms[0].text) diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py index 38a510d..20b75d4 100644 --- a/jamdict/jmdict_sqlite.py +++ b/jamdict/jmdict_sqlite.py @@ -43,7 +43,7 @@ from puchikarui import Schema from . import __version__ as JAMDICT_VERSION, __url__ as JAMDICT_URL -from .jmdict import JMDEntry, EntryInfo, Link, BibInfo, Audit, KanjiForm, KanaForm, Sense, SenseGloss, LSource +from .jmdict import Meta, JMDEntry, EntryInfo, Link, BibInfo, Audit, KanjiForm, KanaForm, Sense, SenseGloss, LSource # ------------------------------------------------------------------------------- @@ -85,7 +85,7 @@ def __init__(self, data_source=":memory:", setup_script=None, setup_file=None, * self.add_script(SETUP_SCRIPT) self.add_file(JMDICT_SETUP_FILE) # Meta - self.add_table('meta', ['jmdict_version', 'jmdict_url', 'generator', 'generator_version', 'generator_url']) + self.add_table('meta', ['key', 'value'], proto=Meta).set_id('key') self.add_table('Entry', ['idseq']) self.add_table('Link', ['ID', 'idseq', 'tag', 'desc', 'uri']) self.add_table('Bib', ['ID', 'idseq', 'tag', 'text']) @@ -146,8 +146,8 @@ def search(self, query, ctx=None): if ctx is None: with self.ctx() as ctx: return self.search(query, ctx=ctx) - where = "idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?)" - params = [query, query] + where = "idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text like ?)" + params = [query, query, query] try: if query.startswith('id#'): query_int = int(query[3:]) diff --git a/jamdict/kanjidic2.py b/jamdict/kanjidic2.py index b4b9f63..0f100be 100644 --- a/jamdict/kanjidic2.py +++ b/jamdict/kanjidic2.py @@ -51,7 +51,7 @@ import logging from lxml import etree -from chirptext import io as chio +from chirptext import chio # ------------------------------------------------------------------------------ @@ -366,7 +366,7 @@ def __init__(self, qc_type='', value='', skip_misclass=""): - stroke_count - a mistake in the number of strokes - stroke_and_posn - mistakes in both division and strokes - stroke_diff - ambiguous stroke counts depending on glyph - --> """ +S --> """ self.cid = None self.qc_type = qc_type self.value = value diff --git a/jamdict/tools.py b/jamdict/tools.py index bf73033..831f424 100755 --- a/jamdict/tools.py +++ b/jamdict/tools.py @@ -39,11 +39,13 @@ import os + from chirptext import confirm, TextReport, Timer from chirptext.cli import CLIApp, setup_logging from jamdict import Jamdict from jamdict import config +from jamdict import version_info # ------------------------------------------------------------------------------- # Configuration @@ -53,7 +55,11 @@ JMD_XML = config.get_file('JMDICT_XML') KD2_XML = config.get_file('KD2_XML') JMD_DB = config.get_file('JAMDICT_DB') -setup_logging('logging.json', 'logs') + +if os.path.isfile('logging.json'): + setup_logging('logging.json', 'logs') +else: + setup_logging(os.path.join(config.home_dir(), 'logging.json'), 'logs') # ------------------------------------------------------------------------------- @@ -99,39 +105,41 @@ def import_data(cli, args): print("Database paths were not provided. Process aborted.") -def dump_result(results): +def dump_result(results, report=None): + if report is None: + report = TextReport() if results.entries: - print("=" * 40) - print("Found entries") - print("=" * 40) + report.print("=" * 40) + report.print("Found entries") + report.print("=" * 40) for e in results.entries: kj = ', '.join([k.text for k in e.kanji_forms]) kn = ', '.join([k.text for k in e.kana_forms]) - print("Entry: {} | Kj: {} | Kn: {}".format(e.idseq, kj, kn)) - print("-" * 20) + report.print("Entry: {} | Kj: {} | Kn: {}".format(e.idseq, kj, kn)) + report.print("-" * 20) for idx, s in enumerate(e.senses): - print("{idx}. {s}".format(idx=idx + 1, s=s)) - print('') + report.print("{idx}. {s}".format(idx=idx + 1, s=s)) + report.print('') else: - print("No dictionary entry was found.") + report.print("No dictionary entry was found.") if results.chars: - print("=" * 40) - print("Found characters") - print("=" * 40) + report.print("=" * 40) + report.print("Found characters") + report.print("=" * 40) for c in results.chars: - print("Char: {} | Strokes: {}".format(c, c.stroke_count)) - print("-" * 20) + report.print("Char: {} | Strokes: {}".format(c, c.stroke_count)) + report.print("-" * 20) for rmg in c.rm_groups: - print("Readings:", ", ".join([r.value for r in rmg.readings])) - print("Meanings:", ", ".join([m.value for m in rmg.meanings if not m.m_lang or m.m_lang == 'en'])) + report.print("Readings:", ", ".join([r.value for r in rmg.readings])) + report.print("Meanings:", ", ".join([m.value for m in rmg.meanings if not m.m_lang or m.m_lang == 'en'])) else: - print("No character was found.") + report.print("No character was found.") def lookup(cli, args): '''Lookup words by kanji/kana''' jam = get_jam(cli, args) - results = jam.lookup(args.query) + results = jam.lookup(args.query, strict_lookup=args.strict) if args.format == 'json': print(results.to_json()) else: @@ -148,11 +156,15 @@ def file_status(file_path): def show_info(cli, args): ''' Show jamdict configuration (data folder, configuration file location, etc.) ''' - print("Configuration location: {}".format(config._get_config_manager().locate_config())) - print("-" * 40) - print("Jamdict DB location : {} - {}".format(args.jdb, file_status(args.jdb))) - print("JMDict XML file : {} - {}".format(args.jmdxml, file_status(args.jmdxml))) - print("KanjiDic2 XML file : {} - {}".format(args.kd2xml, file_status(args.kd2xml))) + output = TextReport(args.output) if 'output' in args else TextReport() + output.header("Jamdict | {} - Version: {}".format(version_info.__description__, version_info.__version__), level='h0') + output.header("Basic configuration") + output.print("JAMDICT_HOME: {}".format(config.home_dir())) + output.print("Configuration location: {}".format(config._get_config_manager().locate_config())) + output.header("Data files") + output.print("Jamdict DB location: {} - {}".format(args.jdb, file_status(args.jdb))) + output.print("JMDict XML file : {} - {}".format(args.jmdxml, file_status(args.jmdxml))) + output.print("KanjiDic2 XML file : {} - {}".format(args.kd2xml, file_status(args.kd2xml))) # ------------------------------------------------------------------------------- @@ -178,6 +190,7 @@ def main(): # show info info_task = app.add_task('info', func=show_info) + info_task.add_argument('-o', '--output', help='Write information to a text file') add_data_config(info_task) # look up task @@ -185,6 +198,7 @@ def main(): lookup_task.add_argument('query', help='kanji/kana') lookup_task.add_argument('-f', '--format', help='json or text') lookup_task.add_argument('--compact', action='store_true') + lookup_task.add_argument('-s', '--strict', action='store_true') lookup_task.set_defaults(func=lookup) add_data_config(lookup_task) diff --git a/jamdict/util.py b/jamdict/util.py index e09089f..38273d9 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -51,6 +51,8 @@ import logging import threading from collections import defaultdict as dd +from collections import OrderedDict +from chirptext.deko import HIRAGANA, KATAKANA from . import config from .jmdict import JMDictXMLParser @@ -73,13 +75,13 @@ def __init__(self, entries, chars): self.entries = entries if entries else [] self.chars = chars if chars else [] - def text(self, compact=True, entry_sep='。', separator=' | '): + def text(self, compact=True, entry_sep='。', separator=' | ', no_id=False, with_chars=True): output = [] if self.entries: - entries_txt = str(entry_sep.join(e.text(compact=compact, separator='') for e in self.entries)) + entries_txt = str(entry_sep.join(e.text(compact=compact, separator='', no_id=no_id) for e in self.entries)) output.append("Entries: ") output.append(entries_txt) - if self.entries: + if self.chars and with_chars: if compact: chars_txt = ', '.join(str(c) for c in self.chars) else: @@ -190,9 +192,9 @@ def import_data(self): getLogger().info("Importing KanjiDic2 data") self.kd2.insert_chars(self.kd2_xml) - def get_char(self, literal): + def get_char(self, literal, ctx=None): if self.kd2 is not None: - return self.kd2.get_char(literal) + return self.kd2.get_char(literal, ctx=ctx) elif self.kd2_xml: return self.kd2_xml.lookup(literal) else: @@ -206,7 +208,7 @@ def get_entry(self, idseq): else: raise LookupError("There is no backend data available") - def lookup(self, query): + def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None): if not self.is_available(): raise LookupError("There is no backend data available") elif not query: @@ -215,18 +217,21 @@ def lookup(self, query): entries = [] chars = [] if self.jmdict is not None: - entries = self.jmdict.search(query) + entries = self.jmdict.search(query, ctx=ctx) elif self.jmdict_xml: entries = self.jmdict_xml.lookup(query) - if self.has_kd2(): + if lookup_chars and self.has_kd2(): # lookup each character in query and kanji readings of each found entries - chars_to_search = set(query) - if entries: + chars_to_search = OrderedDict({c: c for c in query}) + if not strict_lookup and entries: + # auto add characters from entries for e in entries: for k in e.kanji_forms: - chars_to_search.update(k.text) + for c in k.text: + if c not in HIRAGANA and c not in KATAKANA: + chars_to_search[c] = c for c in chars_to_search: - result = self.get_char(c) + result = self.get_char(c, ctx=ctx) if result is not None: chars.append(result) return LookupResult(entries, chars) diff --git a/jamdol-flask.py b/jamdol-flask.py index af39b9f..6b3de2c 100755 --- a/jamdol-flask.py +++ b/jamdol-flask.py @@ -52,22 +52,22 @@ from flask import Flask, Response from functools import wraps from flask import request + +from chirptext.cli import setup_logging + from jamdict import Jamdict # --------------------------------------------------------------------- # CONFIGURATION # --------------------------------------------------------------------- +setup_logging('logging.json', 'logs') app = Flask(__name__, static_url_path="") -# Prefer to use jmdict.en -DB_FILE = os.path.abspath('./data/jamdict.en.db') -if not os.path.isfile(DB_FILE): - DB_FILE = os.path.abspath('./data/jamdict.db') -jmd = Jamdict(db_file=DB_FILE) +jmd = Jamdict() -def get_logger(): - logging.getLogger(__name__) +def getLogger(): + return logging.getLogger(__name__) # --------------------------------------------------------------------- @@ -100,9 +100,11 @@ def get_entry(idseq): @app.route('/jamdol/search/', methods=['GET']) +@app.route('/jamdol/search//', methods=['GET']) @jsonp -def search(query): - results = jmd.lookup(query) +def search(query, strict=None): + getLogger().info("Query = {}".format(query)) + results = jmd.lookup(query, strict_lookup=strict) return results.to_json() diff --git a/logging.json b/logging.json new file mode 100644 index 0000000..939719e --- /dev/null +++ b/logging.json @@ -0,0 +1,57 @@ +{ + "version": 1, + "disable_existing_loggers": false, + "formatters": { + "simple": { + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + } + }, + + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": "DEBUG", + "formatter": "simple", + "stream": "ext://sys.stdout" + }, + + "file_handler_important": { + "class": "logging.handlers.RotatingFileHandler", + "level": "WARNING", + "formatter": "simple", + "filename": "logs/logging_important.log", + "maxBytes": 1000000, + "backupCount": 20, + "encoding": "utf8" + }, + + "file_handler_verbose": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "simple", + "filename": "logs/logging_details.log", + "maxBytes": 1000000, + "backupCount": 20, + "encoding": "utf8" + } + + }, + + "loggers": { + "__main__": { + "level": "INFO", + "handlers": ["file_handler_verbose"], + "propagate": "no" + }, + "jamdol-flask": { "level": "INFO" } + , + "chirptext.dekomecab": { + "level": "ERROR" + } + }, + + "root": { + "level": "WARNING", + "handlers": ["console", "file_handler_important"] + } +} diff --git a/release.sh b/release.sh index f52303c..fd8b8e5 100755 --- a/release.sh +++ b/release.sh @@ -1,4 +1,4 @@ #!/bin/bash -pandoc --from=markdown --to=rst README.md -o README.rst +# pandoc --from=markdown --to=rst README.md -o README.rst python3 setup.py sdist diff --git a/requirements.txt b/requirements.txt index 3134372..8a52c56 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ lxml -chirptext +chirptext >= 0.1a18 puchikarui diff --git a/setup.py b/setup.py index 4ed5192..dce68ef 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,6 @@ ''' import io -import os from setuptools import setup @@ -26,12 +25,18 @@ def read(*filenames, **kwargs): return sep.join(buf) -readme_file = 'README.rst' if os.path.isfile('README.rst') else 'README.md' +# readme_file = 'README.rst' if os.path.isfile('README.rst') else 'README.md' +readme_file = 'README.md' long_description = read(readme_file) pkg_info = {} exec(read('jamdict/__version__.py'), pkg_info) +with open('requirements.txt', 'r') as infile: + requirements = infile.read().splitlines() + print(requirements) + + setup( name='jamdict', # package file name (-version.tar.gz) version=pkg_info['__version__'], @@ -43,8 +48,8 @@ def read(*filenames, **kwargs): keywords="nlp", license=pkg_info['__license__'], author=pkg_info['__author__'], - tests_require=['lxml', 'chirptext', 'puchikarui'], - install_requires=['lxml', 'chirptext', 'puchikarui'], + tests_require=requirements, + install_requires=requirements, author_email=pkg_info['__email__'], description=pkg_info['__description__'], long_description=long_description, diff --git a/test/test_jmdict_sqlite.py b/test/test_jmdict_sqlite.py index 5eabbdd..c73d615 100644 --- a/test/test_jmdict_sqlite.py +++ b/test/test_jmdict_sqlite.py @@ -128,6 +128,10 @@ def test_search(self): es = self.db.search('%子%', ctx) self.assertEqual(len(es), 4) getLogger().info('%子%: {}'.format('|'.join([str(x) for x in es]))) + # search by meaning + es = self.db.search('%confections%', ctx) + self.assertTrue(es) + getLogger().info('%confections%: {}'.format('|'.join([str(x) for x in es]))) # -------------------------------------------------------------------------------