Skip to content

Commit

Permalink
added tar file reading
Browse files Browse the repository at this point in the history
  • Loading branch information
Jan Jurgen Griesfeller committed Dec 19, 2023
1 parent f318d40 commit 57c4b21
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 21 deletions.
60 changes: 41 additions & 19 deletions src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import requests
import tarfile
import gzip
from pyaro.timeseries import (
AutoFilterReaderEngine,
Data,
Expand Down Expand Up @@ -55,6 +56,8 @@

FILL_COUNTRY_FLAG = False

FILE_MASK = ".ONEILL_lev20"


class AeronetSdaTimeseriesReader(AutoFilterReaderEngine.AutoFilterReader):
def __init__(
Expand Down Expand Up @@ -108,25 +111,44 @@ def __init__(
# the general format of the data is the same though.
# so we just keep the header lines of the 1st station, and add all data lines of all stations
# That way we get to the same file format as the zip file
r = requests.get(self._filename)
with tarfile.open(fileobj=BytesIO(r.raw.read()), mode="r") as tf:
lines = []
for _midx, member in enumerate(tf.getmembers()):
f = tf.extractfile(member)
if _midx == 0:
lines.extend(
[line.decode("utf-8") for line in f.readlines()]
)
else:
# skip the header lines
for _hidx in range(HEADER_LINE_NO):
dummy = f.readline()

lines.extend([line.decode("utf-8") for line in f.readlines()])
except tarfile.TarError:
# read as text file
response = urlopen(self._filename)
lines = [line.decode("utf-8") for line in response.readlines()]
r.close()
try:
r = requests.get(self._filename)
with tarfile.open(fileobj=BytesIO(r.content), mode="r") as tf:
lines = []
_fidx = 0
members = tf.getmembers()
bar = tqdm(desc="extracting tar file...", total=len(members))
for _midx, member in enumerate(members):
if member.name.endswith(FILE_MASK):
bar.update(1)
f = tf.extractfile(member)
if _fidx == 0:
lines.extend(
[line.decode("utf-8") for line in f.readlines()]
)
_fidx += 1
else:
# skip the header lines
for _hidx in range(HEADER_LINE_NO):
dummy = f.readline()

lines.extend(
[line.decode("utf-8") for line in f.readlines()]
)
else:
continue

# too many possible exceptions due to the different possible tar file
# compressions. Just try to read as text if everything else fails
except:
# read as text file
r.close()
try:
response = urlopen(self._filename)
lines = [line.decode("utf-8") for line in response.readlines()]
except Exception as e:
print(e)

else:
with open(self._filename, newline="") as csvfile:
Expand Down
4 changes: 2 additions & 2 deletions tests/test_AERONETSDATimeSeriesReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ def test_dl_data_tared(self):
count = 0
for var in ts.variables():
count += len(ts.data(var))
self.assertEqual(count, 79944)
self.assertEqual(len(ts.stations()), 4)
self.assertEqual(count, 421984)
self.assertEqual(len(ts.stations()), 94)

def test_dl_data_unzipped(self):
if not self.external_resource_available(TEST_URL):
Expand Down

0 comments on commit 57c4b21

Please sign in to comment.