remove jsonl output support from WikidataJsonDump (#20)

kensho-technologies · Mar 31, 2019 · a696da4 · a696da4
1 parent 9e7b29b
commit a696da4
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 21 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,13 @@
 qwikidata Change Log
 ====================
 
+v0.2.0
+======
+
+**Removed**:
+
+* Jsonl output support from `WikidataJsonDump` class so that chunks produced by the class can always be read by the class.
+
 v0.1.5
 ======
 

diff --git a/qwikidata/__init__.py b/qwikidata/__init__.py
@@ -2,4 +2,4 @@
 """Metadata for this package."""
 
 __package_name__ = "qwikidata"
-__version__ = "0.1.5"
+__version__ = "0.2.0"
diff --git a/qwikidata/json_dump.py b/qwikidata/json_dump.py
@@ -77,19 +77,16 @@ def __iter__(self) -> Iterator[Dict]:
             yield json.loads(line_str)
 
     def _write_chunk(
-        self, out_fbase: str, ichunk: int, out_format: str, out_lines: List[str]
+        self, out_fbase: str, ichunk: int, out_lines: List[str]
     ) -> Tuple[List[str], int, str]:
         """Write a single chunk to disk."""
-        out_fname = f"{out_fbase}-ichunk_{ichunk}.{out_format}"
+        out_fname = f"{out_fbase}-ichunk_{ichunk}.json"
         self.logger.debug(f"writing {out_fname}")
         out_lines = [out_line.rstrip(",\n") for out_line in out_lines]
         with open(out_fname, "w") as fp:
-            if out_format == "json":
-                fp.write("[\n")
-                fp.write(",\n".join(out_lines))
-                fp.write("\n]\n")
-            elif out_format == "jsonl":
-                fp.write("\n".join(out_lines))
+            fp.write("[\n")
+            fp.write(",\n".join(out_lines))
+            fp.write("\n]\n")
 
         if self.compression == "bz2":
             args = ["bzip2", out_fname]
@@ -107,7 +104,6 @@ def _write_chunk(
     def create_chunks(
         self,
         out_fbase: Optional[str] = None,
-        out_format: str = "json",
         num_lines_per_chunk: int = 100,
         max_chunks: int = 10 ** 10,
     ) -> List[str]:
@@ -116,11 +112,7 @@ def create_chunks(
         Parameters
         ----------
         out_fbase: str
-          Each output file will have the form `{out_fbase}_ichunk_{ichunk}.(json|jsonl)[.bz2|.gz]`
-        out_format: str
-          One of ["json", "jsonl"].  If `json`, then each file is a valid json array
-          (as in the original dump file).  If `jsonl`, then each file is in the
-          "JSON Lines" format (http://jsonlines.org/).
+          Each output file will have the form `{out_fbase}-ichunk_{ichunk}.json[.bz2|.gz]`
         num_lines_per_chunk: int
           Number of lines per chunk file
         max_chunks: int
@@ -139,18 +131,14 @@ def create_chunks(
                 continue
             out_lines.append(line)
             if len(out_lines) >= num_lines_per_chunk:
-                out_lines, ichunk, out_fname = self._write_chunk(
-                    out_fbase, ichunk, out_format, out_lines
-                )
+                out_lines, ichunk, out_fname = self._write_chunk(out_fbase, ichunk, out_lines)
                 out_fnames.append(out_fname)
 
             if ichunk >= max_chunks:
                 return out_fnames
 
         if len(out_lines) > 0:
-            out_lines, ichunk, out_fname = self._write_chunk(
-                out_fbase, ichunk, out_format, out_lines
-            )
+            out_lines, ichunk, out_fname = self._write_chunk(out_fbase, ichunk, out_lines)
             out_fnames.append(out_fname)
 
         return out_fnames