diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cfe2758..45063d6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,13 @@ qwikidata Change Log ==================== +v0.2.0 +====== + +**Removed**: + +* Jsonl output support from `WikidataJsonDump` class so that chunks produced by the class can always be read by the class. + v0.1.5 ====== diff --git a/qwikidata/__init__.py b/qwikidata/__init__.py index 25a2257..639386e 100644 --- a/qwikidata/__init__.py +++ b/qwikidata/__init__.py @@ -2,4 +2,4 @@ """Metadata for this package.""" __package_name__ = "qwikidata" -__version__ = "0.1.5" +__version__ = "0.2.0" diff --git a/qwikidata/json_dump.py b/qwikidata/json_dump.py index ff21f5d..e437049 100644 --- a/qwikidata/json_dump.py +++ b/qwikidata/json_dump.py @@ -77,19 +77,16 @@ def __iter__(self) -> Iterator[Dict]: yield json.loads(line_str) def _write_chunk( - self, out_fbase: str, ichunk: int, out_format: str, out_lines: List[str] + self, out_fbase: str, ichunk: int, out_lines: List[str] ) -> Tuple[List[str], int, str]: """Write a single chunk to disk.""" - out_fname = f"{out_fbase}-ichunk_{ichunk}.{out_format}" + out_fname = f"{out_fbase}-ichunk_{ichunk}.json" self.logger.debug(f"writing {out_fname}") out_lines = [out_line.rstrip(",\n") for out_line in out_lines] with open(out_fname, "w") as fp: - if out_format == "json": - fp.write("[\n") - fp.write(",\n".join(out_lines)) - fp.write("\n]\n") - elif out_format == "jsonl": - fp.write("\n".join(out_lines)) + fp.write("[\n") + fp.write(",\n".join(out_lines)) + fp.write("\n]\n") if self.compression == "bz2": args = ["bzip2", out_fname] @@ -107,7 +104,6 @@ def _write_chunk( def create_chunks( self, out_fbase: Optional[str] = None, - out_format: str = "json", num_lines_per_chunk: int = 100, max_chunks: int = 10 ** 10, ) -> List[str]: @@ -116,11 +112,7 @@ def create_chunks( Parameters ---------- out_fbase: str - Each output file will have the form `{out_fbase}_ichunk_{ichunk}.(json|jsonl)[.bz2|.gz]` - out_format: str - 
One of ["json", "jsonl"]. If `json`, then each file is a valid json array - (as in the original dump file). If `jsonl`, then each file is in the - "JSON Lines" format (http://jsonlines.org/). + Each output file will have the form `{out_fbase}-ichunk_{ichunk}.json[.bz2|.gz]` num_lines_per_chunk: int Number of lines per chunk file max_chunks: int @@ -139,18 +131,14 @@ def create_chunks( continue out_lines.append(line) if len(out_lines) >= num_lines_per_chunk: - out_lines, ichunk, out_fname = self._write_chunk( - out_fbase, ichunk, out_format, out_lines - ) + out_lines, ichunk, out_fname = self._write_chunk(out_fbase, ichunk, out_lines) out_fnames.append(out_fname) if ichunk >= max_chunks: return out_fnames if len(out_lines) > 0: - out_lines, ichunk, out_fname = self._write_chunk( - out_fbase, ichunk, out_format, out_lines - ) + out_lines, ichunk, out_fname = self._write_chunk(out_fbase, ichunk, out_lines) out_fnames.append(out_fname) return out_fnames