# Reconstructed from a two-commit patch series against
# src/markitdown/_markitdown.py ("Added Json converter", "added comment to json
# converter"); this is the class in its final form. The same series also adds
# `self.register_page_converter(JsonConverter())` directly after the
# `ZipConverter` registration.
class JsonConverter(DocumentConverter):
    """Convert generic JSON files to Markdown.

    Rendering rules:

    - keys are prefixed with their full path in the document, starting with
      the file name (without its extension);
    - scalar values are rendered with ``repr()``; ``None`` (JSON ``null``) is
      rendered as a bare ``null`` without its key path;
    - each rendered key/value pair is wrapped in backticks;
    - lists become Markdown ordered lists numbered from 0, enclosed between
      two ``---`` lines.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the JSON file at ``local_path`` to Markdown.

        Args:
            local_path: Path of the file to read.
            **kwargs: Only ``file_extension`` is read; it must equal ``.json``
                (case-insensitive) for this converter to accept the file.

        Returns:
            ``None`` when the extension is not ``.json`` or the content cannot
            be decoded as UTF-8 / parsed as JSON; otherwise a
            ``DocumentConverterResult`` with ``title=None`` and the generated
            Markdown as ``text_content``.

        Raises:
            OSError: If ``local_path`` cannot be opened (propagated from
                ``open``).
        """
        # Bail if not a JSON file. `or ""` also covers the key being present
        # with a None value, which `.get(key, "")` would pass through.
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".json":
            return None

        # TODO: also accept similar extensions and/or check the MIME type.

        # JSON interchange text is UTF-8; decode explicitly instead of relying
        # on the locale default. "utf-8-sig" additionally tolerates a leading
        # BOM, which json.load would otherwise reject.
        with open(local_path, encoding="utf-8-sig") as json_file:
            try:
                json_data = json.load(json_file)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Not usable as JSON: decline rather than fail.
                return None

        # The file's base name (extension stripped) is the root of every key path.
        root_prefix, _ = os.path.splitext(os.path.basename(local_path))
        md_content = self._json_traversal(json_data, level=1, prefix=root_prefix)

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )

    def _json_traversal(self, d: Union[dict, list], level: int, prefix: str) -> str:
        """Recursively render a decoded JSON value as Markdown.

        Args:
            d: The decoded value. Dicts and lists are recursed into; anything
                else is treated as a scalar leaf.
            level: Number of ``#`` characters used for headings emitted at
                this depth.
            prefix: Path of ``d`` within the document (e.g. ``file.key[0]``).

        Returns:
            The Markdown text for ``d``.
        """
        parts = []

        if isinstance(d, dict):
            for key, value in d.items():
                # Only container values get a heading; scalars are rendered
                # as a single `path:value` line by the recursive call.
                if isinstance(value, (dict, list)):
                    parts.append("%s %s.%s\n" % ("#" * level, prefix, str(key)))
                parts.append(
                    self._json_traversal(value, level + 1, prefix + "." + str(key))
                )

        elif isinstance(d, list):
            parts.append("---\n")
            for index, item in enumerate(d):
                # NOTE(review): this literal was split across a line break in
                # the patch text as received; ". " is the Markdown
                # ordered-list form - confirm against the upstream file.
                parts.append(str(index) + ". ")
                # Items restart at level 0, so a heading emitted directly
                # inside a list item has an empty "#" run.
                parts.append(
                    self._json_traversal(
                        item, level=0, prefix=prefix + "[" + str(index) + "]"
                    )
                )
            parts.append("---\n")

        else:
            # NOTE(review): the patch text shows one trailing space before the
            # newline; a Markdown hard line break needs two - confirm intent.
            if d is None:
                parts.append("`null` \n")
            else:
                parts.append("`%s:%s` \n" % (prefix, repr(d)))

        return "".join(parts)