From 57c4b212102d3c1aebe89ffb6ad496a7d0399e86 Mon Sep 17 00:00:00 2001 From: Jan Jurgen Griesfeller Date: Tue, 19 Dec 2023 10:38:55 +0100 Subject: [PATCH] added tar file reading --- .../AeronetSdaTimeseriesReader.py | 60 +++++++++++++------ tests/test_AERONETSDATimeSeriesReader.py | 4 +- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py b/src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py index 5eb84db..c4c4d96 100644 --- a/src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py +++ b/src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py @@ -11,6 +11,7 @@ import numpy as np import requests import tarfile +import gzip from pyaro.timeseries import ( AutoFilterReaderEngine, Data, @@ -55,6 +56,8 @@ FILL_COUNTRY_FLAG = False +FILE_MASK = ".ONEILL_lev20" + class AeronetSdaTimeseriesReader(AutoFilterReaderEngine.AutoFilterReader): def __init__( @@ -108,25 +111,44 @@ def __init__( # the general format of the data is the same though. # so we just keep the header lines of the 1st station, and add all data lines of all stations # That way we get to the same file format as the zip file - r = requests.get(self._filename) - with tarfile.open(fileobj=BytesIO(r.raw.read()), mode="r") as tf: - lines = [] - for _midx, member in enumerate(tf.getmembers()): - f = tf.extractfile(member) - if _midx == 0: - lines.extend( - [line.decode("utf-8") for line in f.readlines()] - ) - else: - # skip the header lines - for _hidx in range(HEADER_LINE_NO): - dummy = f.readline() - - lines.extend([line.decode("utf-8") for line in f.readlines()]) - except tarfile.TarError: - # read as text file - response = urlopen(self._filename) - lines = [line.decode("utf-8") for line in response.readlines()] + r.close() + try: + r = requests.get(self._filename) + with tarfile.open(fileobj=BytesIO(r.content), mode="r") as tf: + lines = [] + _fidx = 0 + members = tf.getmembers() + bar = tqdm(desc="extracting tar file...", total=len(members)) + for _midx, member in enumerate(members): + if member.name.endswith(FILE_MASK): + bar.update(1) + f = tf.extractfile(member) + if _fidx == 0: + lines.extend( + [line.decode("utf-8") for line in f.readlines()] + ) + _fidx += 1 + else: + # skip the header lines + for _hidx in range(HEADER_LINE_NO): + dummy = f.readline() + + lines.extend( + [line.decode("utf-8") for line in f.readlines()] + ) + else: + continue + + # too many possible exceptions due to different tar possible tar file + # compressions. Just try to read as text if everything fails + except: + # read as text file + r.close() + try: + response = urlopen(self._filename) + lines = [line.decode("utf-8") for line in response.readlines()] + except Exception as e: + print(e) else: with open(self._filename, newline="") as csvfile: diff --git a/tests/test_AERONETSDATimeSeriesReader.py b/tests/test_AERONETSDATimeSeriesReader.py index 9a67acb..cfeca38 100644 --- a/tests/test_AERONETSDATimeSeriesReader.py +++ b/tests/test_AERONETSDATimeSeriesReader.py @@ -45,8 +45,8 @@ def test_dl_data_tared(self): count = 0 for var in ts.variables(): count += len(ts.data(var)) - self.assertEqual(count, 79944) - self.assertEqual(len(ts.stations()), 4) + self.assertEqual(count, 421984) + self.assertEqual(len(ts.stations()), 94) def test_dl_data_unzipped(self): if not self.external_resource_available(TEST_URL):