Skip to content

Commit

Permalink
remove jsonl output support from WikidataJsonDump (#20)
Browse files Browse the repository at this point in the history
  • Loading branch information
galtay authored Mar 31, 2019
1 parent 9e7b29b commit a696da4
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 21 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@
qwikidata Change Log
====================

v0.2.0
======

**Removed**:

* Jsonl output support from the `WikidataJsonDump` class, so that every chunk the class writes can be read back by the same class.

v0.1.5
======

Expand Down
2 changes: 1 addition & 1 deletion qwikidata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
"""Metadata for this package."""

__package_name__ = "qwikidata"
__version__ = "0.1.5"
__version__ = "0.2.0"
28 changes: 8 additions & 20 deletions qwikidata/json_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,19 +77,16 @@ def __iter__(self) -> Iterator[Dict]:
yield json.loads(line_str)

def _write_chunk(
self, out_fbase: str, ichunk: int, out_format: str, out_lines: List[str]
self, out_fbase: str, ichunk: int, out_lines: List[str]
) -> Tuple[List[str], int, str]:
"""Write a single chunk to disk."""
out_fname = f"{out_fbase}-ichunk_{ichunk}.{out_format}"
out_fname = f"{out_fbase}-ichunk_{ichunk}.json"
self.logger.debug(f"writing {out_fname}")
out_lines = [out_line.rstrip(",\n") for out_line in out_lines]
with open(out_fname, "w") as fp:
if out_format == "json":
fp.write("[\n")
fp.write(",\n".join(out_lines))
fp.write("\n]\n")
elif out_format == "jsonl":
fp.write("\n".join(out_lines))
fp.write("[\n")
fp.write(",\n".join(out_lines))
fp.write("\n]\n")

if self.compression == "bz2":
args = ["bzip2", out_fname]
Expand All @@ -107,7 +104,6 @@ def _write_chunk(
def create_chunks(
self,
out_fbase: Optional[str] = None,
out_format: str = "json",
num_lines_per_chunk: int = 100,
max_chunks: int = 10 ** 10,
) -> List[str]:
Expand All @@ -116,11 +112,7 @@ def create_chunks(
Parameters
----------
out_fbase: str
Each output file will have the form `{out_fbase}_ichunk_{ichunk}.(json|jsonl)[.bz2|.gz]`
out_format: str
One of ["json", "jsonl"]. If `json`, then each file is a valid json array
(as in the original dump file). If `jsonl`, then each file is in the
"JSON Lines" format (http://jsonlines.org/).
Each output file will have the form `{out_fbase}-ichunk_{ichunk}.json[.bz2|.gz]`
num_lines_per_chunk: int
Number of lines per chunk file
max_chunks: int
Expand All @@ -139,18 +131,14 @@ def create_chunks(
continue
out_lines.append(line)
if len(out_lines) >= num_lines_per_chunk:
out_lines, ichunk, out_fname = self._write_chunk(
out_fbase, ichunk, out_format, out_lines
)
out_lines, ichunk, out_fname = self._write_chunk(out_fbase, ichunk, out_lines)
out_fnames.append(out_fname)

if ichunk >= max_chunks:
return out_fnames

if len(out_lines) > 0:
out_lines, ichunk, out_fname = self._write_chunk(
out_fbase, ichunk, out_format, out_lines
)
out_lines, ichunk, out_fname = self._write_chunk(out_fbase, ichunk, out_lines)
out_fnames.append(out_fname)

return out_fnames
Expand Down

0 comments on commit a696da4

Please sign in to comment.