From 259c4d94a2a117199ae585a78aaa41ccae8d5566 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 23:42:17 +0000 Subject: [PATCH] formatting --- Cargo.toml | 2 +- pyproject.toml | 2 +- python/dolma/warc/processor.py | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 517035db..5d6fdadc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dolma" -version = "1.0.15" +version = "1.1.0" edition = "2021" license = "Apache-2.0" diff --git a/pyproject.toml b/pyproject.toml index f75f1da9..dc89ab78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "1.0.15" +version = "1.1.0" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index c59f6f51..474c6ca9 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -134,9 +134,10 @@ def process_single( extension = extension.replace(".gz", "").replace(".warc", "") + ".jsonl.gz" destination_path = join_path(prot, *base_dst[:-1], base_dst[-1] + extension) - with smart_open.open(source_path, "rb") as warc_file, smart_open.open( - destination_path, "wb" - ) as output_file: + with ( + smart_open.open(source_path, "rb") as warc_file, + smart_open.open(destination_path, "wb") as output_file, + ): it = ArchiveIterator(warc_file, record_types=WarcRecordType.response | WarcRecordType.warcinfo) for record in it: if record.record_type == WarcRecordType.warcinfo: