From 82839b545b0d03df775f270662c37b5e94ed52bc Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Fri, 9 Apr 2021 17:49:00 +0200 Subject: [PATCH 01/30] Added column 'data_names' to datasource --- metacatalog/models/datasource.py | 183 ++++++++++++++++--------------- 1 file changed, 95 insertions(+), 88 deletions(-) diff --git a/metacatalog/models/datasource.py b/metacatalog/models/datasource.py index 99bc62dc..9ad8db22 100644 --- a/metacatalog/models/datasource.py +++ b/metacatalog/models/datasource.py @@ -17,15 +17,15 @@ class DataSourceType(Base): r"""Data Source Type - Model to represent a type of datasource. - + Model to represent a type of datasource. + Note ---- - While it is possible to add more records to the table, - this is the only Class that needs actual Python functions to - handle the database input. Usually, each type of datasource - relies on a specific :mod:`importer ` - and reader :mod:`reader ` that can use + While it is possible to add more records to the table, + this is the only Class that needs actual Python functions to + handle the database input. Usually, each type of datasource + relies on a specific :mod:`importer ` + and reader :mod:`reader ` that can use the information saved in a :class:`DataSource` to perform I/O operations. Attributes @@ -59,7 +59,7 @@ def to_dict(self, deep=False) -> dict: Parameters ---------- deep : bool - If True, all related objects will be included as + If True, all related objects will be included as dictionary as well and deep will be passed down. Defaults to False @@ -79,11 +79,11 @@ def to_dict(self, deep=False) -> dict: # set optionals if self.description is not None: d['description'] = self.description - + # deep loading if deep: d['sources'] = [s.to_dict(deep=True) for s in self.sources] - + return d def __str__(self): @@ -92,11 +92,11 @@ def __str__(self): class DataType(Base): """ - DataType is describing the type of the actual data. - The metacatalog documentation includes several default abstract - types. Each combination of - :class:`DataType ` and - :class:`DataSourceType ` can be + DataType is describing the type of the actual data. + The metacatalog documentation includes several default abstract + types. Each combination of + :class:`DataType ` and + :class:`DataSourceType ` can be assigned with custom reader and writer functions. Attributes @@ -132,7 +132,7 @@ def to_dict(self, deep=False) -> dict: Parameters ---------- deep : bool - If True, all related objects will be included as + If True, all related objects will be included as dictionary as well and deep will be passed down. Defaults to False @@ -154,7 +154,7 @@ def to_dict(self, deep=False) -> dict: d['description'] = self.description if self.parent_id is not None: d['parent_id'] = self.parent_id - + # deep loading if deep: d['sources'] = [s.to_dict(deep=True) for s in self.sources] @@ -165,15 +165,15 @@ def to_dict(self, deep=False) -> dict: else: d['parents'] = [dt.to_dict(deep=False) for dt in self.parent_list()] d['children'] = [dt.to_dict(deep=False) for dt in self.children_list()] - + return d def parent_list(self): """ Returns an inheritance tree for the current datatype. - If the list is empty, the current datatype is a - top-level datatype. - Otherwise, the list contains all parent datatypes + If the list is empty, the current datatype is a + top-level datatype. + Otherwise, the list contains all parent datatypes that the current one inherits from. 
""" @@ -182,15 +182,15 @@ def parent_list(self): current_parent = self.parent while current_parent is not None: parents.append(current_parent) - + return parents def children_list(self): """ Returns an dependency tree for the current datatype. - If the list is empty, there are no child (inheriting) + If the list is empty, there are no child (inheriting) datatypes for the current datatype. - Otherwise, the list contains all child datatypes that + Otherwise, the list contains all child datatypes that are inheriting the current datatype. """ children = [] @@ -204,8 +204,8 @@ def children_list(self): class TemporalScale(Base): """ - The TemporalScale is used to commonly describe the temporal scale at which - the data described is valid. metacatalog uses the scale triplet + The TemporalScale is used to commonly describe the temporal scale at which + the data described is valid. metacatalog uses the scale triplet (spacing, extent, support), but renames ``'spacing'`` to ``'resolution'``. Attributes @@ -213,8 +213,8 @@ class TemporalScale(Base): id : int Unique id of the record. If not specified, the database will assign it. resolution : str - Temporal resolution. The resolution has to be given as an ISO 8601 - Duration, or a fraction of it. You can substitute standalone minutes can + Temporal resolution. The resolution has to be given as an ISO 8601 + Duration, or a fraction of it. You can substitute standalone minutes can be identified by non-ISO ``'min'``. .. code-block:: python resolution = '15min' @@ -222,21 +222,21 @@ class TemporalScale(Base): .. code-block:: 'P[n]Y[n]M[n]DT[n]H[n]M[n]S' observation_start : datetime.datetime - Point in time, when the first observation was made. + Point in time, when the first observation was made. Forms the temporal extent toghether with `observation_end`. observation_end : datetime.datetime Point in time, when the last available observation was made. Forms the temporal extent toghether with `observation_start`. support : float The support gives the temporal validity for a single observation. - It specifies the time before and after observation, that is still - represented by the observation. - It is given as a fraction of resolution. - I.e. if ``support=0.5`` at ``resolution='10min'``, the observation - supports ``5min`` (2.5min before and after the timestamp) and the - resulting dataset would **not** be exhaustive. - Defaults to ``support=1.0``, which would make a temporal exhaustive - dataset, but may not apply to each dataset. + It specifies the time before and after observation, that is still + represented by the observation. + It is given as a fraction of resolution. + I.e. if ``support=0.5`` at ``resolution='10min'``, the observation + supports ``5min`` (2.5min before and after the timestamp) and the + resulting dataset would **not** be exhaustive. + Defaults to ``support=1.0``, which would make a temporal exhaustive + dataset, but may not apply to each dataset. 
""" __tablename__ = 'temporal_scales' @@ -253,7 +253,7 @@ class TemporalScale(Base): sources = relationship("DataSource", back_populates='temporal_scale') def __init__(self, *args, **kwargs): - # handle resoultion + # handle resoultion if 'resolution_timedelta' in kwargs: kwargs['resolution'] = pd.to_timedelta(kwargs['resolution_timedelta']).isoformat() del kwargs['resolution_timedelta'] @@ -264,8 +264,8 @@ def __init__(self, *args, **kwargs): if 'resolution' in kwargs: kwargs['resolution'] = pd.to_timedelta(kwargs['resolution']).isoformat() super(TemporalScale, self).__init__(*args, **kwargs) - - @property + + @property def resolution_timedelta(self): return pd.to_timedelta(self.resolution) @@ -284,7 +284,7 @@ def support_timedelta(self, delta): @property def extent(self): return [self.observation_start, self.observation_end] - + @extent.setter def extent(self, extent): self.observation_start, self.observation_end = extent @@ -297,7 +297,7 @@ def to_dict(self, deep=False) -> dict: Parameters ---------- deep : bool - If True, all related objects will be included as + If True, all related objects will be included as dictionary. Defaults to False Returns @@ -317,14 +317,14 @@ def to_dict(self, deep=False) -> dict: if deep: d['datasources'] = [s.to_dict(deep=False) for s in self.sources] - + return d class SpatialScale(Base): """ - The SpatialScale is used to commonly describe the spatial scale at which - the data described is valid. metacatalog uses the scale triplet + The SpatialScale is used to commonly describe the spatial scale at which + the data described is valid. metacatalog uses the scale triplet (spacing, extent, support), but renames ``'spacing'`` to ``'resolution'``. Attributes @@ -333,22 +333,22 @@ class SpatialScale(Base): Unique id of the record. If not specified, the database will assign it. resolution : int Spatial resoultion in meter. The resolution usually describes a grid - cell size, which only applies to gridded datasets. Use the + cell size, which only applies to gridded datasets. Use the :attr:`resolution_str` property for a string representation extent : geoalchemy2.Geometry - The spatial extent of the dataset is given as a ``'POLYGON'``. While - metacatalog is capable of storing any kind of valid POLYGON as extent, + The spatial extent of the dataset is given as a ``'POLYGON'``. While + metacatalog is capable of storing any kind of valid POLYGON as extent, it is best practice to allow only Bounding Boxes on upload. support : float The support gives the spatial validity for a single observation. It specifies the spatial extent at which an observed value is valid. - It is given as a fraction of resolution. For gridded datasets, it is - common to set support to 1, as the observations are validated to - represent the whole grid cell. In case ground truthing data is - available, the actual footprint fraction of observations can be - given here. - Defaults to ``support=1.0``. - + It is given as a fraction of resolution. For gridded datasets, it is + common to set support to 1, as the observations are validated to + represent the whole grid cell. In case ground truthing data is + available, the actual footprint fraction of observations can be + given here. + Defaults to ``support=1.0``. 
+ """ __tablename__ = 'spatial_scales' @@ -375,7 +375,7 @@ def resolution_str(self): if self.resolution / 1000 > 1: return '%d km' % (int(self.resolution / 1000)) return '%.1f m' % self.resolution - + @property def support_str(self): if (self.support * self.resultion) / 1000 > 1: @@ -390,7 +390,7 @@ def to_dict(self, deep=False) -> dict: Parameters ---------- deep : bool - If True, all related objects will be included as + If True, all related objects will be included as dictionary. Defaults to False Returns @@ -411,15 +411,15 @@ def to_dict(self, deep=False) -> dict: if deep: d['datasources'] = [s.to_dict(deep=False) for s in self.sources] - + return d class DataSource(Base): r"""DataSource - Model to represent a datasource of a specific - :class:`Entry `. The datasource further specifies + Model to represent a datasource of a specific + :class:`Entry `. The datasource further specifies an :class:`DataSourceType` by setting a ``path`` and ``args``. Attributes @@ -427,28 +427,32 @@ class DataSource(Base): id : int Unique id of the record. If not specified, the database will assign it. path : str - Path to the actual data. Depending on type, this can be a filepath, SQL + Path to the actual data. Depending on type, this can be a filepath, SQL tablename or URL. encoding : str - The encoding of the file or database representation of the actual + The encoding of the file or database representation of the actual data. Defaults to ``'utf-8'``. Do only change if necessary. args : str - Optional. If the I/O classes need further arguments, these can be stored + Optional. If the I/O classes need further arguments, these can be stored as a JSON-serializable str. Will be parsed into a dict and passed to the I/O functions as **kwargs. type_id : int - Foreign key referencing the :class:`DataSourceType`. + Foreign key referencing the :class:`DataSourceType`. type : metacatalog.models.DataSourceType - The referenced :class:`DataSourceType`. Can be used instead of setting + The referenced :class:`DataSourceType`. Can be used instead of setting ``type_id``. + data_names : list + .. versionadded:: 0.2.12 + List of column names that will be displayed when exporting the data. + The columns are named in the same order as they appear in the list. Example ------- - There is a :class:`DataSourceType` of ``name='internal'``, which handles - I/O operations on tables in the same database. The datasource itself - will then store the tablename as ``path``. It can be linked to - :class:`Entry ` in a 1:n relationship. - This way, the admin has the full control over data-tables, while still using + There is a :class:`DataSourceType` of ``name='internal'``, which handles + I/O operations on tables in the same database. The datasource itself + will then store the tablename as ``path``. It can be linked to + :class:`Entry ` in a 1:n relationship. + This way, the admin has the full control over data-tables, while still using common I/O classes. """ @@ -460,6 +464,7 @@ class DataSource(Base): datatype_id = Column(Integer, ForeignKey('datatypes.id'), nullable=False) encoding = Column(String(64), default='utf-8') path = Column(String, nullable=False) + data_names = Column(String) args = Column(String) # scales @@ -488,7 +493,7 @@ def to_dict(self, deep=False) -> dict: Parameters ---------- deep : bool - If True, all related objects will be included as + If True, all related objects will be included as dictionary. 
Defaults to False Returns @@ -507,6 +512,8 @@ def to_dict(self, deep=False) -> dict: ) # set optionals + if self.data_names is not None: + d['data_names'] = self.data_names if self.args is not None: d['args'] = self.parse_args() if self.encoding is not None: @@ -515,12 +522,12 @@ def to_dict(self, deep=False) -> dict: d['temporal_scale'] = self.temporal_scale.to_dict(deep=False) if self.spatial_scale is not None: d['spatial_scale'] = self.spatial_scale.to_dict(deep=False) - + # deep loading if deep: d['entries'] = [e.to_dict() for e in self.entries] - + return d def parse_args(self): @@ -529,10 +536,10 @@ def parse_args(self): use load_args instead Note - ---- + ---- Load the contents of the args column as assumed JSON string. This will be passed to the importer/adder function as **kwargs. - Therefore this is only useful for a DB admin and should not be + Therefore this is only useful for a DB admin and should not be exposed to the end-user. """ @@ -543,33 +550,33 @@ def parse_args(self): # parse and return else: return json.loads(self.args) - + def load_args(self) -> dict: """ .. versionadded:: 0.1.11 - + Load the stored arguments from the ``'args'`` column. - It was filled by a JSON string and will be converted as - dict before. - This dict is usually used for I/O operations and passed + It was filled by a JSON string and will be converted as + dict before. + This dict is usually used for I/O operations and passed as keyword arguments. - Therefore this is only useful for a DB admin and should not be + Therefore this is only useful for a DB admin and should not be exposed to the end-user. """ return self.parse_args() - + def save_args_from_dict(self, args_dict, commit=False): """Save to args - Save all given keyword arguments to the database. + Save all given keyword arguments to the database. These are passed to the importer/adder functions as **kwargs. Parameters ---------- args_dict : dict - Dictionary of JSON-serializable keyword arguments that - will be stored as a JSON string in the database. + Dictionary of JSON-serializable keyword arguments that + will be stored as a JSON string in the database. Note ---- @@ -603,7 +610,7 @@ def create_scale(self, resolution, extent, support, scale_dimension): Cls = SpatialScale else: raise AttributeError("scale_dimension has to be in ['temporal', 'spatial']") - + # build the scale and append scale = Cls(resolution=resolution, extent=extent, support=support) setattr(self, '%s_scale' % scale_dimension.lower(), scale) @@ -613,8 +620,8 @@ def get_source_importer(self): .. deprecated:: 0.1.12 Will be removed with version 0.2 - This function is usually called by a - :class:`Entry ` object. It returns a function + This function is usually called by a + :class:`Entry ` object. It returns a function that will import the data into the correct source. """ From c0e579ff0e13588e1fc569096ab4c05ff4a36e13 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Fri, 9 Apr 2021 18:06:18 +0200 Subject: [PATCH 02/30] Added the Class timeseries_array --- metacatalog/models/__init__.py | 9 +++++---- metacatalog/models/timeseries_array.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 metacatalog/models/timeseries_array.py diff --git a/metacatalog/models/__init__.py b/metacatalog/models/__init__.py index a1d6a502..234f4331 100644 --- a/metacatalog/models/__init__.py +++ b/metacatalog/models/__init__.py @@ -1,13 +1,13 @@ """ -The metacatalog meta data model is split up into different -entities, each representend by a Python class. 
-Metacatalog uses sqlalchemy to model relationships between the +The metacatalog meta data model is split up into different +entities, each representend by a Python class. +Metacatalog uses sqlalchemy to model relationships between the classes and create and populate an appropriate database instance to store Records of these entities. Note ---- -Due to usage of the geoalchemy2 extension, which can currently only +Due to usage of the geoalchemy2 extension, which can currently only be stored in a PostGIS enabled PostgreSQL database, only PostgreSQL is supported. This may change in a future version. @@ -21,6 +21,7 @@ from .variable import Variable, Unit from .datasource import DataSource, DataSourceType, DataType, SpatialScale, TemporalScale from .timeseries import TimeseriesPoint, TimeseriesPoint2D +from .timeseries_array import TimeseriesArray from .generic_data import DataPoint, DataPoint2D from .geometry_data import GeometryTimeseries, GenericGeometryData from .config import Log, LogCodes diff --git a/metacatalog/models/timeseries_array.py b/metacatalog/models/timeseries_array.py new file mode 100644 index 00000000..d32ca075 --- /dev/null +++ b/metacatalog/models/timeseries_array.py @@ -0,0 +1,19 @@ +import pandas as pd +from sqlalchemy import Column, ForeignKey +from sqlalchemy import Integer, DateTime, Numeric, ARRAY + +from metacatalog.db.base import Base + + +class TimeseriesArray(Base): + __tablename__ = 'timeseries_array' + + # columns + entry_id = Column(Integer, ForeignKey('entries.id'), primary_key=True) + tstamp = Column(DateTime, primary_key=True) + data = Column(ARRAY(Numeric), nullable=False) + precision = Column(ARRAY(Numeric), nullable=True) + + @classmethod + def is_valid_timeseries(cls, data): + return isinstance(data, (pd.DataFrame, pd.Series)) and isinstance(data.index, pd.DatetimeIndex) From a6e957dab6292f51c23038740913ac26de9ab588 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Fri, 9 Apr 2021 18:38:49 +0200 Subject: [PATCH 03/30] Using sqlalchemy.dialects.postgresql ARRAY now --- metacatalog/models/timeseries_array.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metacatalog/models/timeseries_array.py b/metacatalog/models/timeseries_array.py index d32ca075..12ea3b9b 100644 --- a/metacatalog/models/timeseries_array.py +++ b/metacatalog/models/timeseries_array.py @@ -1,6 +1,7 @@ import pandas as pd from sqlalchemy import Column, ForeignKey -from sqlalchemy import Integer, DateTime, Numeric, ARRAY +from sqlalchemy import Integer, DateTime, Numeric +from sqlalchemy.dialects.postgresql import ARRAY from metacatalog.db.base import Base From 21677e7986ceef3dc07415e4a176f1fe3c4bc2a8 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Mon, 12 Apr 2021 16:37:30 +0200 Subject: [PATCH 04/30] added column_names to Variable and modified read.py --- metacatalog/ext/io/reader.py | 16 +++++++++---- metacatalog/models/variable.py | 42 ++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/metacatalog/ext/io/reader.py b/metacatalog/ext/io/reader.py index 4bea1ed9..765764fb 100644 --- a/metacatalog/ext/io/reader.py +++ b/metacatalog/ext/io/reader.py @@ -23,9 +23,15 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): sql += " AND tstamp <= '%s'" % (dt.strftime(end, '%Y-%m-%d %H:%M:%S')) # infer table column names order - col_sql = 'select * from %s limit 0' % tablename - col_names = list(pd.read_sql_query(col_sql, session.bind).columns.values) - col_names.remove('entry_id') + if datasource.data_name 
is not None: + col_names = data_name + elif entry.variable.column_names is not None: + col_names = variable.column_names + else: + col_sql = 'select * from %s limit 0' % tablename + col_names = list(pd.read_sql_query(col_sql, session.bind).columns.values) + col_names.remove('entry_id') + if 'index' in col_names: index_col = ['index'] col_names.remove('index') @@ -57,8 +63,8 @@ def read_from_local_csv_file(entry, datasource, **kwargs): data.set_index('tstamp', inplace=True) elif 'index' in data: data.set_index('index', inplace=True) - + # map column names df.columns = [entry.variable.name if _col== 'value' else _col for _col in df.columns] - return data \ No newline at end of file + return data diff --git a/metacatalog/models/variable.py b/metacatalog/models/variable.py index a9dc447c..a6bc298a 100644 --- a/metacatalog/models/variable.py +++ b/metacatalog/models/variable.py @@ -1,6 +1,7 @@ from sqlalchemy import Column, ForeignKey from sqlalchemy import Integer, String from sqlalchemy.orm import relationship +from sqlalchemy.dialects.postgresql import ARRAY from metacatalog.db.base import Base @@ -16,9 +17,9 @@ class Unit(Base): name : str Full name of the Unit symbol : str - A max. 12 letter symbol that is **commonly** used to represent the + A max. 12 letter symbol that is **commonly** used to represent the unit - si : str + si : str Optional. If applicable, the conversion if the unit into SI units. If the unit is i.e. m/km the si would be m*1000^-1*m^-1 variables : list @@ -44,7 +45,7 @@ def to_dict(self, deep=False) -> dict: Parameters ---------- deep : bool - If True, all related objects will be included as + If True, all related objects will be included as dictionary. Defaults to False Returns @@ -77,19 +78,19 @@ def __str__(self) -> str: class Variable(Base): r""" - Model to represent variables. The variable is any kind of oberservation, - that can be represented by one data type. metacatalog does not take the + Model to represent variables. The variable is any kind of oberservation, + that can be represented by one data type. metacatalog does not take the definition of variables too strict. It is however common to keep variables - as atomic as possbile. - - However, technically, you can also create a new variable that describes a - combined data type and reference a newly created table via - `DataSource `. This can make sense if in the - scope and context of the metacatalog installation a sensor like a Decagon - 5TE always records three parameters at a time like Temperature, Moisture - and Conductance. That can be implemented as a new '5TE' variable and the + as atomic as possbile. + + However, technically, you can also create a new variable that describes a + combined data type and reference a newly created table via + `DataSource `. This can make sense if in the + scope and context of the metacatalog installation a sensor like a Decagon + 5TE always records three parameters at a time like Temperature, Moisture + and Conductance. That can be implemented as a new '5TE' variable and the datasource would point to a table containing all three measurements. - **Note that this should not be common practice and will make your + **Note that this should not be common practice and will make your metadata unusable in other contexts**. Attributes @@ -99,13 +100,17 @@ class Variable(Base): name : str Full name of the Unit symbol : str - A max. 12 letter symbol that is **commonly** used to represent the + A max. 
12 letter symbol that is **commonly** used to represent the unit - si : str + si : str Optional. If applicable, the conversion if the unit into SI units. If the unit is i.e. m/km the si would be m*1000^-1*m^-1 variables : list Lazy loaded list of Variables that use the current unit + column_names : list + .. versionadded:: 0.2.12 + List of default column names that will be displayed when exporting the data. + The columns are named in the same order as they appear in the list. """ __tablename__ = 'variables' @@ -114,6 +119,7 @@ class Variable(Base): id = Column(Integer, primary_key=True) name = Column(String(64), nullable=False) symbol = Column(String(12), nullable=False) + column_names = Column(ARRAY(String)) unit_id = Column(Integer, ForeignKey('units.id'), nullable=False) keyword_id = Column(Integer, ForeignKey('keywords.id')) @@ -130,7 +136,7 @@ def to_dict(self, deep=False) -> dict: Parameters ---------- deep : bool - If True, all related objects will be included as + If True, all related objects will be included as dictionary. Defaults to False Returns @@ -148,6 +154,8 @@ def to_dict(self, deep=False) -> dict: ) # set optionals + if self.column_names is not None: + d['column_names'] = self.column_names for attr in ('keyword'): if hasattr(self, attr) and getattr(self, attr) is not None: d[attr] = getattr(self, attr).to_dict(deep=False) From 468038b09529e5324e7369dafbce70ff0111e96d Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Tue, 20 Apr 2021 08:15:08 +0200 Subject: [PATCH 05/30] importer save data_names to the datasource, fill variable.column_names if None. --- metacatalog/ext/io/importer.py | 45 ++++++++++++++++++++-------------- metacatalog/models/variable.py | 11 ++++++--- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index bd2fb409..4ba324d9 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -7,26 +7,24 @@ from metacatalog.models.timeseries import TimeseriesPoint -def import_to_internal_table(entry, datasource, data, mapping=None, **kwargs): +def import_to_internal_table(entry, datasource, data, force_data_names=False, **kwargs): """Import to internal DB - The given data is imported into the table - as specified in the datasource. The data column names need to - fit the names as implemented in the database. The mapping - keyword can be used to rename + The given data is imported into the table + as specified in the datasource. + If force_data_names=True the column names of the imported data are saved in + the datasource, otherwise the standard column names in + entry.variable.column_names are used. The column names in + datasource.data_names are used when exporting the data. 
""" # check that entry is valid assert Entry.is_valid(entry) - + if isinstance(data, pd.Series): data = pd.DataFrame(data) # reset the index imp = data.reset_index(level=0, inplace=False) - - # rename if mapping is given - if mapping is not None and isinstance(mapping, dict): - imp.rename(columns=mapping, inplace=True) # set entry_id if 'entry_id' not in imp.columns: @@ -38,6 +36,15 @@ def import_to_internal_table(entry, datasource, data, mapping=None, **kwargs): else: session = object_session(entry) + # save column names in datasource.data_names + if len(imp.columns) != len(entry.variable.column_names): + force_data_names = True + if force_data_names: + datasource.data_names = imp.columns + else: + datasource.data_names = entry.variable.column_names + + # get the path / table name into which the data should be imported if datasource.path is None: tablename = 'data_entry_%d' % entry.id datasource.path = tablename @@ -51,8 +58,8 @@ def import_to_internal_table(entry, datasource, data, mapping=None, **kwargs): if not all([col in col_names for col in imp.columns.values]): raise ValueError('The input data has columns, that are not present in the database.\n %s' % ', '.join(col_names)) - - # else import + + # else import if_exists = kwargs.get('if_exists', 'append') imp.to_sql(tablename, session.bind, index=None, if_exists=if_exists) @@ -61,8 +68,8 @@ def import_to_local_csv_file(entry, datasource, data, **kwargs): """Import to CSV Saves timeseries data to a local CSV file. - Any existing file will be overwritten. - The default location can be overwritten using the path keyword. + Any existing file will be overwritten. + The default location can be overwritten using the path keyword. """ assert Entry.is_valid(entry) @@ -70,12 +77,12 @@ def import_to_local_csv_file(entry, datasource, data, **kwargs): # get the path if datasource.path is None: path = os.path.join(os.path.expanduser('~')) - + # check for filename if not path.endswith('.csv'): path = os.path.join(path, 'entry_%d.csv' % entry.id) datasource.path = path - + # save new path __update_datasource(datasource) @@ -87,7 +94,7 @@ def import_to_local_csv_file(entry, datasource, data, **kwargs): # save the data if if_exists == 'replace': imp.to_csv(path, index=None) - + elif if_exists == 'append': df = pd.read_csv(path, index=None) new_df = df.append(imp, ignore_index=True) @@ -98,7 +105,7 @@ def import_to_local_csv_file(entry, datasource, data, **kwargs): raise ValueError('%s already exists.' 
% path) else: data.to_csv(path, index=None) - + else: raise ValueError("if_exists has to be one of ['fail', 'append', 'replace']") @@ -112,4 +119,4 @@ def __update_datasource(datasource): session.rollback() raise e - return datasource \ No newline at end of file + return datasource diff --git a/metacatalog/models/variable.py b/metacatalog/models/variable.py index a6bc298a..97905a49 100644 --- a/metacatalog/models/variable.py +++ b/metacatalog/models/variable.py @@ -123,6 +123,12 @@ class Variable(Base): unit_id = Column(Integer, ForeignKey('units.id'), nullable=False) keyword_id = Column(Integer, ForeignKey('keywords.id')) + # fill column_names if not specified + if column_names is None: + # this wouldn´t work for 3D wind data, as there must be 3 column_names + # but it would work for the (standard 1D variables) + variable.column_names = variable.name + # relationships entries = relationship("Entry", back_populates='variable') unit = relationship("Unit", back_populates='variables') @@ -150,12 +156,11 @@ def to_dict(self, deep=False) -> dict: id=self.id, name=self.name, symbol=self.symbol, - unit=self.unit.to_dict(deep=False) + unit=self.unit.to_dict(deep=False), + column_names=self.column_names ) # set optionals - if self.column_names is not None: - d['column_names'] = self.column_names for attr in ('keyword'): if hasattr(self, attr) and getattr(self, attr) is not None: d[attr] = getattr(self, attr).to_dict(deep=False) From e17e6467be2877259b828e393c8776105ed1b986 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Mon, 26 Apr 2021 10:45:45 +0200 Subject: [PATCH 06/30] Added unstacking of multi-dim data --- metacatalog/ext/io/reader.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/metacatalog/ext/io/reader.py b/metacatalog/ext/io/reader.py index 765764fb..d18bda62 100644 --- a/metacatalog/ext/io/reader.py +++ b/metacatalog/ext/io/reader.py @@ -24,13 +24,13 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): # infer table column names order if datasource.data_name is not None: - col_names = data_name + col_names = datasource.data_name elif entry.variable.column_names is not None: col_names = variable.column_names else: col_sql = 'select * from %s limit 0' % tablename col_names = list(pd.read_sql_query(col_sql, session.bind).columns.values) - col_names.remove('entry_id') + col_names.remove('entry_id') if 'index' in col_names: index_col = ['index'] @@ -42,8 +42,14 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): # load data df = pd.read_sql(sql, session.bind, index_col=index_col, columns=col_names) + # unstack multi-dimensional data into the single columns + rawvalues = np.vstack(df['data'].values) + + df = pd.DataFrame(data=rawvalues, columns=col_names) + # map column names - df.columns = [entry.variable.name if _col== 'value' else _col for _col in df.columns] + # deprecated (?) 
+ #df.columns = [entry.variable.name if _col== 'value' else _col for _col in df.columns] return df From 97465b0db1912d467f9e6af0624e9f0d8d36379f Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Mon, 3 May 2021 16:02:30 +0200 Subject: [PATCH 07/30] column_names in variable.csv, importer.py, reader.py --- metacatalog/data/variables.csv | 38 ++++++++++++++++---------------- metacatalog/ext/io/importer.py | 34 ++++++++++++++++++++++------- metacatalog/ext/io/reader.py | 40 ++++++++++++++++++++++------------ metacatalog/models/variable.py | 8 +------ 4 files changed, 72 insertions(+), 48 deletions(-) diff --git a/metacatalog/data/variables.csv b/metacatalog/data/variables.csv index 2da48b72..28662e0e 100644 --- a/metacatalog/data/variables.csv +++ b/metacatalog/data/variables.csv @@ -1,19 +1,19 @@ -id,name,symbol,unit_id,keyword_id -1,air temperature,Ta,101,111 -2,soil temperature,Ts,101,5736 -3,water temperature,Tw,101,7402 -4,discharge,Q,108,7327 -5,air pressure,p,104,109 -6,relative humidity,RH,112,6308 -7,daily rainfall sum,P,103,6434 -8,rainfall intensity,Pi,105,6436 -9,solar irradiance,SI,115,5236 -10,net radiation,Rn,115,5227 -11,gravimetric water content,u,114,5727 -12,volumetric water content,theta,113,5727 -13,precision,sigma,21, -14,sap flow,Fm,22,7424 -15,matric potential,phi,24, -16,bulk electrical conductivity,bEC,25,5111 -17,specific electrical conductivity,sEC,25,5111 -18,river water level,L,2, +id,name,symbol,column_names,unit_id,keyword_id +1,air temperature,Ta,air_temperature,101,111 +2,soil temperature,Ts,soil_temperature,101,5736 +3,water temperature,Tw,water_temperature,101,7402 +4,discharge,Q,discharge,108,7327 +5,air pressure,p,air_pressure,104,109 +6,relative humidity,RH,relative_humidity,112,6308 +7,daily rainfall sum,P,daily_rainfall_sum,103,6434 +8,rainfall intensity,Pi,rainfall_intensity,105,6436 +9,solar irradiance,SI,solar_irradiance,115,5236 +10,net radiation,Rn,net_radiation,115,5227 +11,gravimetric water content,u,gravimetric_water_content,114,5727 +12,volumetric water content,theta,volumetric_water_content,113,5727 +13,precision,sigma,precision,21, +14,sap flow,Fm,sap_flow,22,7424 +15,matric potential,phi,matric_potential,24, +16,bulk electrical conductivity,bEC,bulk_electrical_conductivity,25,5111 +17,specific electrical conductivity,sEC,specific_electrical_conductivity,25,5111 +18,river water level,L,river_water_level,2, diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index 4ba324d9..ffed049e 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -1,7 +1,9 @@ import os import pandas as pd +import sqlalchemy as sa from sqlalchemy.orm import object_session +from sqlalchemy.dialects.postgresql import ARRAY from metacatalog.models.entry import Entry from metacatalog.models.timeseries import TimeseriesPoint @@ -36,11 +38,20 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** else: session = object_session(entry) - # save column names in datasource.data_names - if len(imp.columns) != len(entry.variable.column_names): + # get the column names - exclude everthing that stores precisions + data_columns = [col for col in imp.columns.tolist() if not col.startswith('precision')] + + # get the precision columns + precision_columns = [col for col in imp.columns.tolist() if col.startswith('precision')] + + # get index + index = imp.index + + # save column names in datasource.data_names (excluding ) + if len(columns) != len(entry.variable.column_names): force_data_names = True if 
force_data_names: - datasource.data_names = imp.columns + datasource.data_names = columns else: datasource.data_names = entry.variable.column_names @@ -52,12 +63,19 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** else: tablename = datasource.path - # get the available column names from the database - sql = 'select * from %s limit 0' % tablename - col_names = pd.read_sql_query(sql, session.bind).columns.values + # transform the data into a list of arrays + values = [row for row in imp[data_columns].values] + precision = [row for row in imp[precision_columns].values] + + # explicitly map the column types + dtypes = { + 'tstamp': sa.TIMESTAMP, + 'values': ARRAY(sa.REAL), + 'precision': ARRAY(sa.REAL) + } - if not all([col in col_names for col in imp.columns.values]): - raise ValueError('The input data has columns, that are not present in the database.\n %s' % ', '.join(col_names)) + imp_data = pd.DataFrame(data={'tstamp': index, 'values': values, 'precision': precision}) + imp_data['entry_id'] = entry.id # else import if_exists = kwargs.get('if_exists', 'append') diff --git a/metacatalog/ext/io/reader.py b/metacatalog/ext/io/reader.py index d18bda62..ff22d50c 100644 --- a/metacatalog/ext/io/reader.py +++ b/metacatalog/ext/io/reader.py @@ -2,6 +2,7 @@ import pandas as pd from sqlalchemy.orm import object_session +from sqlalchemy.dialects.postgresql import ARRAY from metacatalog.models.entry import Entry @@ -23,14 +24,9 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): sql += " AND tstamp <= '%s'" % (dt.strftime(end, '%Y-%m-%d %H:%M:%S')) # infer table column names order - if datasource.data_name is not None: - col_names = datasource.data_name - elif entry.variable.column_names is not None: - col_names = variable.column_names - else: - col_sql = 'select * from %s limit 0' % tablename - col_names = list(pd.read_sql_query(col_sql, session.bind).columns.values) - col_names.remove('entry_id') + col_sql = 'select * from %s limit 0' % tablename + col_names = list(pd.read_sql_query(col_sql, session.bind).columns.values) + col_names.remove('entry_id') if 'index' in col_names: index_col = ['index'] @@ -42,14 +38,30 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): # load data df = pd.read_sql(sql, session.bind, index_col=index_col, columns=col_names) - # unstack multi-dimensional data into the single columns - rawvalues = np.vstack(df['data'].values) + # always use data_name from datasource as column names when exporting the data + col_names = datasource.data_name - df = pd.DataFrame(data=rawvalues, columns=col_names) + # if the column 'data' exists, the new routine is used + if 'data' in df.columns: + # unstack multi-dimensional data into the single columns + rawvalues = np.vstack(df['data'].values) - # map column names - # deprecated (?) - #df.columns = [entry.variable.name if _col== 'value' else _col for _col in df.columns] + # unstack precision (precision1, precision2, ...) 
+ rawprecision = np.vstack(df['precision'].values) + + # add precision column names to the col_names + for i in range(1, len(rawprecission[0])+1): + precision_col = 'precision%s' % i + col_names.append(precision_col) + + # horizontally stack data and precission + raw = np.hstack([rawvalues, rawprecision]) + + df = pd.DataFrame(data=raw, columns=col_names) + # if 'data' does not appear in the column names, the old routine is used + else: + # map column names + df.columns = [datasource.data_name if _col== 'value' else _col for _col in df.columns] return df diff --git a/metacatalog/models/variable.py b/metacatalog/models/variable.py index 97905a49..30e83018 100644 --- a/metacatalog/models/variable.py +++ b/metacatalog/models/variable.py @@ -119,16 +119,10 @@ class Variable(Base): id = Column(Integer, primary_key=True) name = Column(String(64), nullable=False) symbol = Column(String(12), nullable=False) - column_names = Column(ARRAY(String)) + column_names = Column(ARRAY(String), nullable=False) unit_id = Column(Integer, ForeignKey('units.id'), nullable=False) keyword_id = Column(Integer, ForeignKey('keywords.id')) - # fill column_names if not specified - if column_names is None: - # this wouldn´t work for 3D wind data, as there must be 3 column_names - # but it would work for the (standard 1D variables) - variable.column_names = variable.name - # relationships entries = relationship("Entry", back_populates='variable') unit = relationship("Unit", back_populates='variables') From 0580fbd94b259e73c5f5d36b2d5103cf2d486efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 6 May 2021 09:44:03 +0200 Subject: [PATCH 08/30] handle array imports --- metacatalog/api/db.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/metacatalog/api/db.py b/metacatalog/api/db.py index 9ced4caf..114f5cdd 100644 --- a/metacatalog/api/db.py +++ b/metacatalog/api/db.py @@ -121,7 +121,7 @@ def _remove_nan_from_dict(d): return out_d -def import_table_data(fname, InstanceClass): +def import_table_data(fname, InstanceClass, array_col_name=None): try: df = pd.read_csv(os.path.join(DATAPATH, fname)) except ParserError as e: @@ -131,6 +131,10 @@ def import_table_data(fname, InstanceClass): # replace nan with None df = df.where(df.notnull(), None) + # handle arrays + if array_col_name is not None: + df[array_col_name] = [[cell] for cell in df[array_col_name].values] + # build an instance for each line and return return [InstanceClass(**_remove_nan_from_dict(d)) for d in df.to_dict(orient='record')] @@ -226,8 +230,11 @@ def populate_defaults(session, ignore_tables=[], bump_sequences=10000): print('Finished %s' % table) continue - # get the classes - instances = import_table_data('%s.csv' % table, InstanceClass) + elif table == 'variables': + instances = import_table_data('variables.csv', InstanceClass, array_col_name='column_names') + else: + # get the classes + instances = import_table_data('%s.csv' % table, InstanceClass) # add try: From 59b2ca8e45dfd57d0257d6c8cc1ded446b753854 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Thu, 6 May 2021 10:59:46 +0200 Subject: [PATCH 09/30] added array test --- metacatalog/api/add.py | 8 +- metacatalog/data/datatypes.csv | 3 +- metacatalog/models/datasource.py | 2 +- metacatalog/models/variable.py | 2 +- metacatalog/test/test_api_install.py | 16 ++-- metacatalog/test/test_array_type_data.py | 107 +++++++++++++++++++++++ 6 files changed, 125 insertions(+), 13 deletions(-) create mode 100644 metacatalog/test/test_array_type_data.py diff 
--git a/metacatalog/api/add.py b/metacatalog/api/add.py index 83164d2d..f7104095 100644 --- a/metacatalog/api/add.py +++ b/metacatalog/api/add.py @@ -116,7 +116,7 @@ def add_unit(session, name, symbol, si=None): return add_record(session=session, tablename='units', **attrs) -def add_variable(session, name, symbol, unit): +def add_variable(session, name, symbol, column_names, unit): r"""Add variable record Add a new variable to the database. @@ -130,6 +130,10 @@ def add_variable(session, name, symbol, unit): symbol : str The variable symbol. Try to use the correct physical variable symbols and avoid dublicates. + column_names : list + .. versionadded:: 0.2.12 + List of default column names that will be displayed when exporting the data. + The columns are named in the same order as they appear in the list. unit : int, str Either the id or **full** name of the unit to be linked to this variable. @@ -141,7 +145,7 @@ def add_variable(session, name, symbol, unit): """ #create the attribute dict - attrs = dict(name=name, symbol=symbol) + attrs = dict(name=name, symbol=symbol, column_names=column_names) # get the unit if isinstance(unit, int): diff --git a/metacatalog/data/datatypes.csv b/metacatalog/data/datatypes.csv index e4dca346..d7f93ca9 100644 --- a/metacatalog/data/datatypes.csv +++ b/metacatalog/data/datatypes.csv @@ -12,4 +12,5 @@ id,parent_id,name,title,description 20,16,idataframe,indexed table,"NDArray with any index except datetime information." 21,20,vdataframe,"named, indexed table","idataframe with additional name property of any valid metacatalog Variable." 22,16,time-dataframe,timeseries table,"NDArray indexed by datetime information. The datetimes need to be of increasing order." -23,22,vtime-dataframe,named timeseries table,"Timeseries table that holds an additional Variable name to describe the content." \ No newline at end of file +23,22,vtime-dataframe,named timeseries table,"Timeseries table that holds an additional Variable name to describe the content." +24,,timeseries-array,"timeseries,ARRAY data","sqlalchemy.dialects.postgresql.ARRAY data indexed by datetime information. The datetimes need to be of increasing order." 
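
Taken together, the pieces introduced so far (Variable.column_names, DataSource.data_names and the 'timeseries-array' datatype added to datatypes.csv above) are meant to be used roughly as sketched below. The sketch mirrors the eddy-wind test added further down in this patch; the connection call, unit id, license id and author id are illustrative placeholders, and at this point in the series the array import path is still being wired up.

    import pandas as pd
    from metacatalog import api

    session = api.connect_database()   # placeholder; the tests use a connect() helper

    # 3D eddy wind data: one column per dimension
    df = pd.DataFrame(
        {'u': [1.12, 0.21], 'v': [0.0, 0.0], 'w': [0.0, 0.0]},
        index=pd.to_datetime(['2018-01-01 00:30:00', '2018-01-01 01:00:00'])
    )

    # variable with one column name per array dimension (new add_variable signature above)
    uvw = api.add_variable(session, name='3D-wind', symbol='uvw',
                           column_names=['u', 'v', 'w'], unit=107)

    entry = api.add_entry(session, title='3-dimensional windspeed data',
                          abstract='3D windspeed data from the Fendt data set',
                          location=(8, 52), variable=uvw.id, license=6,
                          author=1, embargo=False, is_partial=False)

    # datasource backed by the array table; data_names are the labels used
    # when the data is exported again
    entry.create_datasource(type=1, path='timeseries_array',
                            datatype='timeseries_array',
                            data_names=['u', 'v', 'w'])
    entry.datasource.create_scale(resolution='30min',
                                  extent=(df.index[0], df.index[-1]),
                                  support=1.0, scale_dimension='temporal')

    entry.import_data(df)              # write
    data = entry.get_data()            # read back, columns labelled from data_names

Whether the stored labels come from the variable's column_names or from the columns of the data actually passed in is controlled by the force_data_names flag introduced in the importer (PATCH 05).
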
diff --git a/metacatalog/models/datasource.py b/metacatalog/models/datasource.py index 9ad8db22..92c3ef16 100644 --- a/metacatalog/models/datasource.py +++ b/metacatalog/models/datasource.py @@ -464,7 +464,7 @@ class DataSource(Base): datatype_id = Column(Integer, ForeignKey('datatypes.id'), nullable=False) encoding = Column(String(64), default='utf-8') path = Column(String, nullable=False) - data_names = Column(String) + data_names = Column(String, nullable=False) args = Column(String) # scales diff --git a/metacatalog/models/variable.py b/metacatalog/models/variable.py index 30e83018..d98874e0 100644 --- a/metacatalog/models/variable.py +++ b/metacatalog/models/variable.py @@ -119,7 +119,7 @@ class Variable(Base): id = Column(Integer, primary_key=True) name = Column(String(64), nullable=False) symbol = Column(String(12), nullable=False) - column_names = Column(ARRAY(String), nullable=False) + column_names = Column(ARRAY(String(128)), nullable=False) unit_id = Column(Integer, ForeignKey('units.id'), nullable=False) keyword_id = Column(Integer, ForeignKey('keywords.id')) diff --git a/metacatalog/test/test_api_install.py b/metacatalog/test/test_api_install.py index 6b42c9f3..d1f944fd 100644 --- a/metacatalog/test/test_api_install.py +++ b/metacatalog/test/test_api_install.py @@ -9,7 +9,7 @@ def create_tables(session): """ - Install all tables + Install all tables """ # test @@ -28,7 +28,7 @@ def populate_defaults(session): def check_defaults(session, capsys): """ - Load data files from metacatalog and check against + Load data files from metacatalog and check against the populated database """ @@ -37,15 +37,15 @@ def check_defaults(session, capsys): for fname in files: tablename = os.path.basename(fname).split('.')[0] - if tablename == 'keywords': + if tablename in ['keywords', 'variables']: continue # something is going wrong here! TODO fix with capsys.disabled(): print('Testing table: %s' % tablename) - + # load datafile datafile = pd.read_csv(fname, sep=',') datafile = datafile.where(datafile.notnull(), None) # replace NaN by None - + # load table from db table = pd.read_sql_table(tablename, session.bind) @@ -62,13 +62,13 @@ def check_defaults(session, capsys): def test_metacatalog_install(capsys): """ Depends on Postgis install. 
- Runs tests on creating tables and populating defaults + Runs tests on creating tables and populating defaults using the Python api """ # connect to db session = connect(mode='session') - + # run single tests assert create_tables(session) assert populate_defaults(session) - assert check_defaults(session, capsys) \ No newline at end of file + assert check_defaults(session, capsys) diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py new file mode 100644 index 00000000..7ea3722b --- /dev/null +++ b/metacatalog/test/test_array_type_data.py @@ -0,0 +1,107 @@ +import pytest + +import pandas as pd +from metacatalog import api + + + +### EDDY data chunk erstellen +# using 3D eddy wind speed data (u, v, w) +tstamp = "2018-01-01 00:30:00", "2018-01-01 01:00:00", "2018-01-01 01:30:00", "2018-01-01 02:00:00", "2018-01-01 02:30:00", "2018-01-01 03:00:00", "2018-01-01 03:30:00", "2018-01-01 04:00:00", "2018-01-01 04:30:00", "2018-01-01 05:00:00" +u = 1.123902, 0.214753, 0.446611, 0.962977, 2.915902, 4.048897, 5.368552, 6.046246, 5.405221, 4.172279 +v = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 +w = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 + +df = pd.DataFrame(data={"tstamp": tstamp, "u": u, "v": v, "w": w}) +df['tstamp'] = pd.to_datetime(df['tstamp'], format='%Y-%m-%d %H:%M:%S') +df.set_index('tstamp', inplace=True) + + +def add_eddy_entry(session): + """ + Add an entry for the eddy wind data. + """ + # add the variable + var_3D_wind = api.add_variable(session, name='3D-wind', symbol='uvw', column_names=['u', 'v', 'w'], unit=107) + + # add an author + kit = api.add_person(session, first_name=None, last_name=None, + organisation_name='Karlsruhe Institute of Technology (KIT)', + organisation_abbrev='KIT' + ) + + # add the entry + eddy_wind = api.add_entry(session, title='3-dimensional windspeed data', + abstract='3-dimensional windspeed data from the Fendt data set', + location=(8, 52), + variable=var_3D_wind.id, + comment='after double rotation', + license=6, + author=kit.id, + embargo=False, + is_partial=False) + + # assert + assert var_3D_wind.column_names == ['u', 'v', 'w'] + + return True + + +def create_eddy_datasource(session): + """ + Add a datasource to the eddy entry. + """ + wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries_array', data_names=['u', 'v', 'w']) + + wind.datasource.create_scale( + resolution='30min', + extent=(df.index[0], df.index[-1]), + support=1.0, + scale_dimension='temporal' + ) + + session.commit() + + # assert + assert wind.datasource.data_names == ['u', 'v', 'w'] + + return True + + +def add_eddy_data(session): + """ + Add the previously generated 3D windspeed data to the eddy entry. + """ + wind.import_data(df) + + return True + + +def read_eddy_data(session): + """ + Read the 3D windspeed data and check column names. + """ + eddy = api.find_entry(session, title='3-dimensional windspeed data') + + dat = eddy.get_data() + + print(dat.columns) + + return True + + + + +@pytest.mark.depends(on=['db_init'], name='array_type_data') +def test_array_type_data(): + """ + A simple workflow of 3 persons who contributed to two entries. 
+ The content of some related content is tested randomly + """ + # get a session + session = connect(mode='session') + + # run single tests + assert add_eddy_entry(session) + assert create_eddy_datasource(session) + assert read_eddy_data(session) From acfa0d6adab49de2400fb257356269422f15421a Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Thu, 6 May 2021 13:29:02 +0200 Subject: [PATCH 10/30] array type data test: import_data() not working --- metacatalog/ext/io/interface.py | 89 +++--- metacatalog/models/datasource.py | 3 +- metacatalog/models/entry.py | 340 +++++++++++------------ metacatalog/test/test_array_type_data.py | 21 +- 4 files changed, 226 insertions(+), 227 deletions(-) diff --git a/metacatalog/ext/io/interface.py b/metacatalog/ext/io/interface.py index 39ea9ac9..fef8236e 100644 --- a/metacatalog/ext/io/interface.py +++ b/metacatalog/ext/io/interface.py @@ -6,23 +6,23 @@ from .appender import append_to_internal_table, append_to_local_csv_file from metacatalog.util.exceptions import IOOperationNotFoundError -from metacatalog.models import DataSource, Entry +from metacatalog.models import DataSource, Entry from metacatalog.ext import MetacatalogExtensionInterface class IOExtensionInterface(MetacatalogExtensionInterface): """ - Absctract Base Class for any kind of input / output - activity on all supported data sources. The Interface + Absctract Base Class for any kind of input / output + activity on all supported data sources. The Interface can be used as an extension storage for read and write functions. For this, no interface has to be defined, as new functions - can be registered and loaded by classmethods only. + can be registered and loaded by classmethods only. To actually execute a read, import, delete or append operation, - the interface needs to implement a calling function. That - means, you need to specify, how the function is called. Then, - an interface class can be defined for each Entry and data-source, - data-type and metadata-specific operations can be executed from + the interface needs to implement a calling function. That + means, you need to specify, how the function is called. Then, + an interface class can be defined for each Entry and data-source, + data-type and metadata-specific operations can be executed from a common interface. """ READER = dict( @@ -88,27 +88,27 @@ def init_new_entry(self, entry: Entry): def read(self, **kwargs): """ Execute a read operation on the datasource. - To load the registered function and run the after_read converter, - you can simply call the abstractmethod template from the new + To load the registered function and run the after_read converter, + you can simply call the abstractmethod template from the new Interface like: .. code-block:: python class IOInterface(IOExtensionInterface): def read(self, **kwargs): return super(IOInterface, self).read(**kwargs) - + """ # get reader reader = self.get_reader(self.entry.datasource) - + # build arguments args = self.entry.datasource.load_args() args.update(kwargs) - + # read the data data = reader(self.entry, self.entry.datasource, **kwargs) return self.after_read(data) - + def after_read(self, data): return data @@ -116,19 +116,19 @@ def after_read(self, data): def import_(self, data, **kwargs): """ Execute an import operation on the datasource. 
- To load the registered function and run the after_import converter, - you can simply call the abstractmethod template from the new + To load the registered function and run the after_import converter, + you can simply call the abstractmethod template from the new Interface like: .. code-block:: python class IOInterface(IOExtensionInterface): def import_(self, data, **kwargs): return super(IOInterface, self).import_(data, **kwargs) - + """ # get importer importer = self.get_importer(self.entry.datasource) - + # build arguments args = self.entry.datasource.load_args() args.update(kwargs) @@ -139,13 +139,13 @@ def import_(self, data, **kwargs): def after_import(self): pass - + @abstractmethod def append(self, data, **kwargs): """ Execute an append operation on the datasource. - To load the registered function and run the after_append converter, - you can simply call the abstractmethod template from the new + To load the registered function and run the after_append converter, + you can simply call the abstractmethod template from the new Interface like: .. code-block:: python @@ -164,7 +164,7 @@ def append(self, data, **kwargs): # append the data appender(self.entry, self.entry.datasource, data, **kwargs) return self.after_append() - + def after_append(self): pass @@ -172,8 +172,8 @@ def after_append(self): def delete(self, **kwargs): """ Execute a delete operation on the datasource. - To load the registered function and run the after_delete converter, - you can simply call the abstractmethod template from the new + To load the registered function and run the after_delete converter, + you can simply call the abstractmethod template from the new Interface like: .. code-block:: python @@ -188,7 +188,7 @@ def delete(self, **kwargs): # build arguments args = self.entry.datasource.load_args() args.update(kwargs) - + # delte the datasource deleter(self.entry, self.entry.datasource, **kwargs) return self.after_delete() @@ -201,7 +201,7 @@ def register(cls, func: callable, operation: str, name: str, datatypes, overwrit # make datatypes iterable if not isinstance(datatypes, (list, tuple)): datatypes = [datatypes] - + name = name.lower() op = operation.upper() @@ -209,7 +209,7 @@ def register(cls, func: callable, operation: str, name: str, datatypes, overwrit D = getattr(cls, op) except AttributeError: raise AttributeError("'%s' is not a valid I/O operation" % op) - + if name not in D.keys(): D[name] = dict() for datatype in datatypes: @@ -217,7 +217,7 @@ def register(cls, func: callable, operation: str, name: str, datatypes, overwrit if datatype in D[name].keys() and not overwrite: raise Warning("A '%s' %s for '%s' data-types already exists. 
use 'overwrite=True' to overwrite" % (name, operation, datatype)) D[name][datatype] = func - + # set the new mapping setattr(cls, op, D) @@ -227,8 +227,8 @@ def _get_types(cls, operation: str, name: str) -> dict: D = getattr(cls, operation) except AttributeError: raise AttributeError("'%s' is not a valid I/O operation" % operation.upper()) - - try: + + try: return D[name.lower()] except KeyError: raise AttributeError("The type '%s' is not registered for %s I/O operations" % (name.lower(), operation.upper())) @@ -239,7 +239,7 @@ def _get_func(cls, operation: str, name: str, datatype: str) -> callable: if not datatype.lower() in types: raise IOOperationNotFoundError("No registered function for datatype '%s'\nOperation:[%s]->[%s]" % (datatype.lower(), operation.upper(), name.lower())) - + return types[datatype.lower()] @classmethod @@ -252,14 +252,14 @@ def get_func_for_datasource(cls, operation: str, datasource: DataSource) -> call else: datatype = datasource.datatype name = datasource.type.name - + # wrap the loader def load(): try: return cls._get_func(operation, name, datatype.name) except IOOperationNotFoundError: return False - + # go for the function check_for_func = True while check_for_func: @@ -272,22 +272,22 @@ def load(): func = None else: check_for_func = False - + if func is None: - raise IOOperationNotFoundError("No registered function for datatype '%s'\nOperation:[%s]->[%s]" % (datatypename, operation.upper(), name)) + raise IOOperationNotFoundError("No registered function for datatype '%s'\nOperation:[%s]->[%s]" % (name.lower(), operation.upper(), name)) return func @classmethod def get_reader(cls, datasource: DataSource): """ - Return the reader function of :class:`DataSource `, + Return the reader function of :class:`DataSource `, do not use it directly. Parameters ---------- datasource : DataSource - The datasource instance that should use this function to + The datasource instance that should use this function to read data as specified. Returns @@ -300,13 +300,13 @@ def get_reader(cls, datasource: DataSource): @classmethod def get_importer(cls, datasource: DataSource): """ - Return the importer function of :class:`DataSource `, + Return the importer function of :class:`DataSource `, do not use it directly. Parameters ---------- datasource : DataSource - The datasource instance that should use this function to + The datasource instance that should use this function to import data as specified. Returns @@ -319,13 +319,13 @@ def get_importer(cls, datasource: DataSource): @classmethod def get_appender(cls, datasource: DataSource): """ - Return the appender function of :class:`DataSource `, + Return the appender function of :class:`DataSource `, do not use it directly. Parameters ---------- datasource : DataSource - The datasource instance that should use this function to + The datasource instance that should use this function to append data as specified. Returns @@ -334,17 +334,17 @@ def get_appender(cls, datasource: DataSource): The appender function for the requested datasource type """ return cls.get_func_for_datasource('APPENDER', datasource) - + @classmethod def get_deleter(cls, datasource: DataSource): """ - Return the deleter function of :class:`DataSource `, + Return the deleter function of :class:`DataSource `, do not use it directly. Parameters ---------- datasource : DataSource - The datasource instance that should use this function to + The datasource instance that should use this function to delete data as specified. 
Returns @@ -353,4 +353,3 @@ def get_deleter(cls, datasource: DataSource): The deleter function for the requested datasource type """ return cls.get_func_for_datasource('DELETER', datasource) - diff --git a/metacatalog/models/datasource.py b/metacatalog/models/datasource.py index 92c3ef16..ad5737cc 100644 --- a/metacatalog/models/datasource.py +++ b/metacatalog/models/datasource.py @@ -7,6 +7,7 @@ from sqlalchemy.orm import relationship, object_session, backref from geoalchemy2 import Geometry from geoalchemy2.shape import to_shape, from_shape +from sqlalchemy.dialects.postgresql import ARRAY import pandas as pd @@ -464,7 +465,7 @@ class DataSource(Base): datatype_id = Column(Integer, ForeignKey('datatypes.id'), nullable=False) encoding = Column(String(64), default='utf-8') path = Column(String, nullable=False) - data_names = Column(String, nullable=False) + data_names = Column(ARRAY(String(128)), nullable=False) args = Column(String) # scales diff --git a/metacatalog/models/entry.py b/metacatalog/models/entry.py index 5b790572..9be445db 100644 --- a/metacatalog/models/entry.py +++ b/metacatalog/models/entry.py @@ -1,6 +1,6 @@ """ The Entry is the core class of metacatalog. It represents the core logical unit of the meta data model. -In principle, an Entry needs a first Author, a title, position and a license to describe +In principle, an Entry needs a first Author, a title, position and a license to describe one type of environmental variable. It can hold a reference and interface to the actual data. If a supported data format is used, Entry can load the data. @@ -36,12 +36,12 @@ def get_embargo_end(datetime=None): class Entry(Base): r"""Entry - The Entry is the main entity in metacatalog. An object instance models a - set of metadata needed to store and manage a datasource. The Entry is not - the actual data. - The Entry is designed to store all necessary information to be exportable - in ISO19115 in the scope of metacatalog. That means, Properties which are - always the same across metacatalog, or can be derived from the actual + The Entry is the main entity in metacatalog. An object instance models a + set of metadata needed to store and manage a datasource. The Entry is not + the actual data. + The Entry is designed to store all necessary information to be exportable + in ISO19115 in the scope of metacatalog. That means, Properties which are + always the same across metacatalog, or can be derived from the actual implementation, are not part of an Entry. Attributes @@ -51,103 +51,103 @@ class Entry(Base): uuid : str .. versionadded:: 0.1.9 - Version 4 UUID string to identify the Entry across installations. - This field is read-only and will be assigned on creation. It is primarily + Version 4 UUID string to identify the Entry across installations. + This field is read-only and will be assigned on creation. It is primarily used to export Entry into ISO19115 metadata. title : str A full title (512) to describe the datasource as well as possible. - The truncated title (first 25 signs) is usually used to print an + The truncated title (first 25 signs) is usually used to print an Entry object to the console. abstract : str - Full abstract of the datasource. The abstract should include all - necessary information that is needed to fully understand the data. + Full abstract of the datasource. The abstract should include all + necessary information that is needed to fully understand the data. external_id : str - Any kind of OID that was used to identify the data in the first place. 
- Usually an unque ID field of other data-storage solutions. The - exernal_id is only stored for reference reasons. + Any kind of OID that was used to identify the data in the first place. + Usually an unque ID field of other data-storage solutions. The + exernal_id is only stored for reference reasons. location : str, tuple The location as a POINT Geometry in unprojected WGS84 (EPSG: 4326). - The location is primarily used to show all Entry objects on a map, or - perform geo-searches. If the data-source needs to store more complex + The location is primarily used to show all Entry objects on a map, or + perform geo-searches. If the data-source needs to store more complex Geometries, you can use the ``geom`` argument. The location can be passed as WKT or a tuple of (x, y) coordinates. - Note that it will be returned and stored as WKB. The output value will + Note that it will be returned and stored as WKB. The output value will be reworked in a future release geom : str .. deprecated:: 0.1.11 The geom attribute will be reomved with version 0.2 .. warning:: - The geom attribute is completely untested so far and might be + The geom attribute is completely untested so far and might be reworked or removed in a future release - It takes a WKT of any kind of OGC-conform Geometry. The return value + It takes a WKT of any kind of OGC-conform Geometry. The return value will be the same Geometry as WKB. creation : datetime.datetime - Following the ISO19115 the *creation* date is referring to the creation - date of the **data resource** described by the Entry, not the Entry - itself. If creation date is not set, it is assumed, that yet no data + Following the ISO19115 the *creation* date is referring to the creation + date of the **data resource** described by the Entry, not the Entry + itself. If creation date is not set, it is assumed, that yet no data resource is connected to the Entry. end : datetime.datimetime - The last date the data source described by this Entry has data for. + The last date the data source described by this Entry has data for. The end date is **not** ISO19115-compliant and will be reworked. version : int - The version of this Entry. Usually metacatalog will handle the version + The version of this Entry. Usually metacatalog will handle the version itself and there is not need to set the version manually. latest_version_id : int - Foreign key to `Entry.id`. This key is self-referencing the another - Entry. This has to be set if the current Entry is not the latest one. - If latest_version_id is None, the Entry is the most recent one and - database operations that find multiple entries will in a future release + Foreign key to `Entry.id`. This key is self-referencing the another + Entry. This has to be set if the current Entry is not the latest one. + If latest_version_id is None, the Entry is the most recent one and + database operations that find multiple entries will in a future release filter to 'version duplicates'. is_partial : bool .. versionadded:: 0.1.10 - If an Entry is partial, it is not self-contained and **has** to be part - of a :class:`EntryGroup ` of type - composite. + If an Entry is partial, it is not self-contained and **has** to be part + of a :class:`EntryGroup ` of type + composite. .. note:: - To make it possbile to add partial Entrys via the models submodule, - The Entry class itself will **not** check integrity. This has to + To make it possbile to add partial Entrys via the models submodule, + The Entry class itself will **not** check integrity. 
This has to be done on adding partial Entry records, or by checking the database comment : str Arbitrary free-text comment to the Entry citation : str .. versionadded:: 0.1.13 - Citation informatio for this Entry. Note, that metacatalog does not - assign DOIs and thus a citation is only useful if the associated - data has a DOI and the bibliographic information applies to the Entry - as well. + Citation informatio for this Entry. Note, that metacatalog does not + assign DOIs and thus a citation is only useful if the associated + data has a DOI and the bibliographic information applies to the Entry + as well. .. note:: - Metacatalog does not manage bibliography. Thus it is highly - recommended to use thrid party software for management and only + Metacatalog does not manage bibliography. Thus it is highly + recommended to use thrid party software for management and only export the reference to the resource in a common citation style. license : metacatalog.models.License - Data License associated to the data and the metadata. You can pass - the `License `_ itself, or use the + Data License associated to the data and the metadata. You can pass + the `License `_ itself, or use the license_id attribute. license_id : int - Foreign key to the data license. + Foreign key to the data license. author : metacatalog.models.Person - :class:`Person ` that acts as first author - for the given entry. Only one first author is possible, co-authors can - be requested from either the contributors list or the - :py:attr:`authors` property. `author` is a property and setting a + :class:`Person ` that acts as first author + for the given entry. Only one first author is possible, co-authors can + be requested from either the contributors list or the + :py:attr:`authors` property. `author` is a property and setting a new author using this property is not supported. authors : list - List of :class:`Person `. The first element - is the first author, see :py:attr:`~author`. The others are - :class:`Person `s associated with the - :class:`Role ` of ``'coAuthor' ``. + List of :class:`Person `. The first element + is the first author, see :py:attr:`~author`. The others are + :class:`Person `s associated with the + :class:`Role ` of ``'coAuthor' ``. The list of authors is sorted by the `order` attribute. - `authors` is a property and setting a new list of authors using this + `authors` is a property and setting a new list of authors using this property is not supported. Note ---- - One Entry object instance is always described by exactly one variable. - If a datasource is a composite of many datasources, there are two - strategies. Either a new table can be implemented and an abstract - :class:`Variable ` be added. This is done with - Eddy-Covariance data. Secondly, Each variable of the datasource can be - represented by its own Entry, which get then grouped by an + One Entry object instance is always described by exactly one variable. + If a datasource is a composite of many datasources, there are two + strategies. Either a new table can be implemented and an abstract + :class:`Variable ` be added. This is done with + Eddy-Covariance data. Secondly, Each variable of the datasource can be + represented by its own Entry, which get then grouped by an :class:`EntryGroup` of :class:`EntryGroupType` ``'composite'``. 
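    A minimal sketch of the second strategy, assuming two already persisted
    entries (``entry_u`` and ``entry_v`` are hypothetical names) and using the
    :func:`make_composite` method defined further down in this class:

    .. code-block:: python

        # group two per-variable entries into one composite record
        composite = entry_u.make_composite(
            others=[entry_v],
            title='2D windspeed record',
            description='u- and v-component stored as separate entries',
            commit=True
        )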
See Also @@ -170,7 +170,7 @@ class Entry(Base): is_partial = Column(Boolean, default=False, nullable=False) comment = Column(String, nullable=True) citation = Column(String(2048), nullable=True) - + license_id = Column(Integer, ForeignKey('licenses.id')) variable_id = Column(Integer, ForeignKey('variables.id'), nullable=False) datasource_id = Column(Integer, ForeignKey('datasources.id')) @@ -203,10 +203,10 @@ def to_dict(self, deep=False, stringify=False) -> dict: Parameters ---------- deep : bool - If True, all related objects will be included as + If True, all related objects will be included as dictionary. Defaults to False stringify : bool - If True, all values will be turned into a string, + If True, all values will be turned into a string, to make the object serializable. Returns ------- @@ -239,7 +239,7 @@ def to_dict(self, deep=False, stringify=False) -> dict: if self.details is not None: d['details'] = self.details_dict(full=True) - + # set optional attributes for attr in ('abstract', 'external_id','comment', 'citation'): if hasattr(self, attr) and getattr(self, attr) is not None: @@ -269,11 +269,11 @@ def is_latest_version(self): @property def latest_version(self): versions = [e.version for e in self.other_versions] - + # no other versions, then self is the only if len(versions): return self - + # if more versions exist, find the highest number latest_index = version.index(max(versions)) return self.other_versions[latest_index] @@ -281,7 +281,7 @@ def latest_version(self): @property def author(self): return [c.person for c in self.contributors if c.role.name == 'author'][0] - + @author.setter def author(self, new_author): self.set_new_author(new_author) @@ -296,15 +296,15 @@ def set_new_author(self, new_author, commit=False): The new first author. As of now the new author has to be passed as a model instance. Passing the ID or query parameter is not yet supported. commit : boolean - If True, the whole :class:`Entry ` will commit - and persist itself to the database. + If True, the whole :class:`Entry ` will commit + and persist itself to the database. .. note:: This will also affect other uncommited edits to the Entry. """ if not isinstance(new_author, models.Person): raise AttributeError('The new author has to be of type metatacatalog.models.Person') - + # find the association assoc_idx = [i for i, c in enumerate(self.contributors) if c.role.name == 'author'][0] self.contributors[assoc_idx].person = new_author @@ -317,13 +317,13 @@ def set_new_author(self, new_author, commit=False): except Exception as e: session.rollback() raise e - - + + @property def authors(self): # get all coAuthors = [c for c in self.contributors if c.role.name == 'coAuthor'] - + # order idx = np.argsort([c.order for c in coAuthors]) @@ -341,7 +341,7 @@ def projects(self): @property def composite_entries(self): return [group for group in self.associated_groups if group.type.name.lower() == 'composite'] - + @property def location_shape(self): return to_shape(self.location) @@ -349,25 +349,25 @@ def location_shape(self): @location_shape.setter def location_shape(self, shape): self.location = from_shape(shape) - + def plain_keywords_list(self): """Metadata Keyword list - Returns list of controlled keywords associated with this - instance of meta data. - If there are any associated values or alias of the given + Returns list of controlled keywords associated with this + instance of meta data. 
+ If there are any associated values or alias of the given keywords, use the keywords_dict function """ return [kw.keyword.path() for kw in self.keywords] - + def plain_keywords_dict(self): return [kw.keyword.as_dict() for kw in self.keywords] - + def keywords_dict(self): return [ dict( - path=kw.keyword.full_path, + path=kw.keyword.full_path, alias=kw.alias, value=kw.associated_value ) for kw in self.keywords @@ -376,21 +376,21 @@ def keywords_dict(self): def details_dict(self, full=True): """ Returns the associated details as dictionary. - + Parameters ---------- full : bool - If True (default) the keywords will contain the - full info including key description, ids and + If True (default) the keywords will contain the + full info including key description, ids and stemmed key. If false, it will be truncated to a - plain key:value dict + plain key:value dict """ if full: return {d.stem:d.to_dict() for d in self.details} else: return {d.stem:d.value for d in self.details} - + def details_table(self, fmt='html'): """ Return the associated details as table @@ -399,7 +399,7 @@ def details_table(self, fmt='html'): ---------- fmt : string Can be one of: - + * `html` to return a HTML table * `latex` to return LaTeX table * `markdown` to return Markdown table @@ -416,7 +416,7 @@ def details_table(self, fmt='html'): return df.to_markdown() else: raise ValueError("fmt has to be in ['html', 'latex', 'markdown']") - + def add_details(self, details=None, commit=False, **kwargs): """ Adds arbitrary key-value pairs to this entry. @@ -432,29 +432,29 @@ def add_details(self, details=None, commit=False, **kwargs): 'value': '', 'description': '' }] - where the ``description`` is optional and can be omitted. + where the ``description`` is optional and can be omitted. If no descriptions are passed at all, you can also use `**kwargs` to pass ``key=value`` pairs. commit : bool - If True, the Entry session will be added to the + If True, the Entry session will be added to the current session and the transaction is commited. Can have side-effects. Defaults to False. - + """ ps = nltk.PorterStemmer() - + # build entries here detail_list = [] # parse kwargs for k, v in kwargs.items(): detail_list.append({ - 'entry_id': self.id, - 'key': str(k), - 'stem': ps.stem(k), + 'entry_id': self.id, + 'key': str(k), + 'stem': ps.stem(k), 'value': str(v) }) - + # parse details if details is not None: for detail in details: @@ -467,11 +467,11 @@ def add_details(self, details=None, commit=False, **kwargs): if 'description' in detail.keys(): d['description'] = detail['description'] detail_list.append(d) - + # build the models for detail in detail_list: self.details.append(models.Detail(**detail)) - + if commit: session = object_session(self) try: @@ -483,30 +483,30 @@ def add_details(self, details=None, commit=False, **kwargs): def make_composite(self, others=[], title=None, description=None, commit=False): """ - Create a composite EntryGroup from this Entry. A composite marks + Create a composite EntryGroup from this Entry. A composite marks stand-alone (:attr:`is_partial` ``= False``) entries as inseparable. - A composite can also contain a partial Entry - (:attr:`is_partial` ``= True``), whichs data only makes sense in the + A composite can also contain a partial Entry + (:attr:`is_partial` ``= True``), whichs data only makes sense in the context of the composite group. Parameters ---------- others : list of Entry - The other :class:`Entries ` that + The other :class:`Entries ` that should be part of the composite. 
title : str Optional title of the composite, if applicable description : str Optional description of the composite if applicable commit : bool - If True, the newly created Group will be persisted in the + If True, the newly created Group will be persisted in the database. Defaults to False. Returns ------- composite : metacatalog.models.EntryGroup The newly created EntryGroup of EntryGroupType.name == 'Composite' - + """ # check type of others if isinstance(others, Entry): @@ -527,14 +527,14 @@ def make_composite(self, others=[], title=None, description=None, commit=False): except Exception as e: session.rollback() raise e - + # return return composite def neighbors(self, distance, unit='meter', buffer_epsg=3857, as_sql=False, **kwargs): """ - Find neighboring :class:`Entries ` around the - location of this instance. You can return the result, or the sqlalchemy + Find neighboring :class:`Entries ` around the + location of this instance. You can return the result, or the sqlalchemy Query object, which can be printed as plain SQL. Parameters @@ -543,23 +543,23 @@ def neighbors(self, distance, unit='meter', buffer_epsg=3857, as_sql=False, **kw The maximum distance at which another Entry is still considered to be a neighbor. unit : str Has to be one of ['meter', 'km', 'mile', 'nautic'] to specify the unit - of the given distance. Note that the distance will always be transformed + of the given distance. Note that the distance will always be transformed into meter. buffer_epsg : int - The EPSG identification number of any projected cartesian coordinate - reference system that uses meter as unit. This CRS will be used to + The EPSG identification number of any projected cartesian coordinate + reference system that uses meter as unit. This CRS will be used to apply the search distance (in meter). - .. note:: - The default system is the transversal Mercartor projection, which is - a global system. Thus, it can always be applied, but may introduce + .. note:: + The default system is the transversal Mercartor projection, which is + a global system. Thus, it can always be applied, but may introduce large uncertainties in small areas. Replace this attribute by a local CRS wherever possible. as_sql : bool - If False (default) the SQL query for neighbors will be executed and + If False (default) the SQL query for neighbors will be executed and the result is returned. Else, the SQL query itself will be returned. kwargs : keyword arguments - Any passed keyword argument will be passed down to the - :func:`api.find_entry ` function to further + Any passed keyword argument will be passed down to the + :func:`api.find_entry ` function to further filter the results. See Also @@ -574,19 +574,19 @@ def neighbors(self, distance, unit='meter', buffer_epsg=3857, as_sql=False, **kw # get the base filter query kwargs['return_iterator'] = True query = api.find_entry(session, **kwargs) - + # get the area filter_query = around(self, distance=distance, unit=unit, query=query, buffer_use_epsg=buffer_epsg) - + if as_sql: return filter_query else: return filter_query.all() - def create_datasource(self, path: str, type, datatype, commit=False, **args): + def create_datasource(self, path: str, type, datatype, data_names, commit=False, **args): """ """ - # + # if self.datasource is not None: raise MetadataMissingError('Datasource already exists. 
You can edit that one.') @@ -600,12 +600,12 @@ def create_datasource(self, path: str, type, datatype, commit=False, **args): ds_type = api.find_datasource_type(session=session, name=type, return_iterator=True).first() else: raise AttributeError('type has to be of type int or str') - + # TODO need the API for DataTypes here!! dtype = session.query(models.DataType).filter(models.DataType.name==datatype).one() - + # build the datasource object - ds = models.DataSource(type=ds_type, datatype=dtype, path=path) + ds = models.DataSource(type=ds_type, datatype=dtype, path=path, data_names=data_names) # add the args ds.save_args_from_dict(args) @@ -620,28 +620,28 @@ def create_datasource(self, path: str, type, datatype, commit=False, **args): except Exception as e: session.rollback() raise e - + # return return ds def get_data(self, **kwargs): """ .. versionchanged:: 0.1.12 - - Read the data. This is only possible if a datasource is specified and - any kind of IOExtension or IOInterface is activated. By default, - the builtin :class:`IOExtension ` + + Read the data. This is only possible if a datasource is specified and + any kind of IOExtension or IOInterface is activated. By default, + the builtin :class:`IOExtension ` is activated since version 0.1.12. """ if self.datasource is None: raise MetadataMissingError('Entry need datasource information') - + try: # check if an io_extension is set if self.io_extension is not None: - return self.io_extension.read(**kwargs) - + return self.io_extension.read(**kwargs) + # if no extension instance, maybe an interface class is set elif self.io_interface is not None: reader = self.io_interface.get_reader(self.datasource) @@ -655,29 +655,29 @@ def get_data(self, **kwargs): def import_data(self, data, **kwargs): """ .. versionchanged:: 0.1.12 - - Import data. This is only possible if a datasource is specified and - any kind of IOExtension or IOInterface is activated. By default, - the builtin :class:`IOExtension ` + + Import data. This is only possible if a datasource is specified and + any kind of IOExtension or IOInterface is activated. By default, + the builtin :class:`IOExtension ` is activated since version 0.1.12. - For the default interface, the datasource type and data type determine - where the data will be stored and how the data has to look like. - You can easily inherit from the - :class:`IOExtension ` to - customize read and write behaviour. If you import i.e. a timeseries to - the same database as metacatalog, you will need to prepared data to + For the default interface, the datasource type and data type determine + where the data will be stored and how the data has to look like. + You can easily inherit from the + :class:`IOExtension ` to + customize read and write behaviour. If you import i.e. a timeseries to + the same database as metacatalog, you will need to prepared data to to only hold an datetime index and the data to be stored. 
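        A usage sketch for the default internal interface (``entry`` stands for an
        already persisted Entry with an ``internal`` timeseries datasource; the
        column name and values are illustrative):

        .. code-block:: python

            import pandas as pd

            # the data only holds a datetime index and the values to be stored
            df = pd.DataFrame(
                {'value': [0.1, 0.2, 0.3]},
                index=pd.date_range('2018-01-01', periods=3, freq='30min')
            )
            df.index.name = 'tstamp'

            entry.import_data(df)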
""" if self.datasource is None: raise MetadataMissingError('Entry need datasource information') - + try: # check if an io_extension is set if self.io_extension is not None: - return self.io_extension.import_(data, **kwargs) - + return self.io_extension.import_(data, **kwargs) + # if no extension instance, maybe an interface class is set elif self.io_interface is not None: importer = self.io_interface.get_importer(self.datasource) @@ -687,33 +687,33 @@ def import_data(self, data, **kwargs): except IOOperationNotFoundError as e: print('[ERROR]: Operation not possible.\n%s' % str(e)) return None - + def append_data(self, data, **kwargs): """ .. versionadded:: 0.1.12 - Append data. This is only possible if a datasource is specified and - any kind of IOExtension or IOInterface is activated. By default, - the builtin :class:`IOExtension ` + Append data. This is only possible if a datasource is specified and + any kind of IOExtension or IOInterface is activated. By default, + the builtin :class:`IOExtension ` is activated since version 0.1.12. - For the default interface, the datasource type and data type determine - where the data will be stored and how the data has to look like. - You can easily inherit from the - :class:`IOExtension ` to - customize read and write behaviour. If you import i.e. a timeseries to - the same database as metacatalog, you will need to prepared data to + For the default interface, the datasource type and data type determine + where the data will be stored and how the data has to look like. + You can easily inherit from the + :class:`IOExtension ` to + customize read and write behaviour. If you import i.e. a timeseries to + the same database as metacatalog, you will need to prepared data to to only hold an datetime index and the data to be stored. """ if self.datasource is None: raise MetadataMissingError('Entry need datasource information') - + try: # check if an io_extension is set if self.io_extension is not None: - return self.io_extension.append(data, **kwargs) - + return self.io_extension.append(data, **kwargs) + # if no extension instance, maybe an interface class is set elif self.io_interface is not None: appender = self.io_interface.get_appender(self.datasource) @@ -723,38 +723,38 @@ def append_data(self, data, **kwargs): except IOOperationNotFoundError as e: print('[ERROR]: Operation not possible.\n%s' % str(e)) return None - + def delete_data(self, delete_source=False, **kwargs): """ .. versionadded:: 0.1.12 - Delete data. This is only possible if a datasource is specified and - any kind of IOExtension or IOInterface is activated. By default, - the builtin :class:`IOExtension ` + Delete data. This is only possible if a datasource is specified and + any kind of IOExtension or IOInterface is activated. By default, + the builtin :class:`IOExtension ` is activated since version 0.1.12. - For the default interface, the datasource type and data type determine - where the data is stored and how the data will be delted. - You can easily inherit from the - :class:`IOExtension ` to - customize read and write behaviour. + For the default interface, the datasource type and data type determine + where the data is stored and how the data will be delted. + You can easily inherit from the + :class:`IOExtension ` to + customize read and write behaviour. Parameters ---------- delete_source : bool - If True, the DataSource will be deleted as well after the data + If True, the DataSource will be deleted as well after the data has been deleted. 
""" if self.datasource is None: raise MetadataMissingError('Entry need datasource information') - + kwargs['delete_source'] = delete_source try: # check if an io_extension is set if self.io_extension is not None: - return self.io_extension.delete(**kwargs) - + return self.io_extension.delete(**kwargs) + # if no extension instance, maybe an interface class is set elif self.io_interface is not None: deleter = self.io_interface.get_deleter(self.datasource) @@ -774,7 +774,7 @@ def add_data(self): def __str__(self): return "" % ( - self.id, - self.title[:20], + self.id, + self.title[:20], self.variable.name ) diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index 7ea3722b..b958e64e 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -2,6 +2,7 @@ import pandas as pd from metacatalog import api +from ._util import connect @@ -51,19 +52,15 @@ def create_eddy_datasource(session): """ Add a datasource to the eddy entry. """ - wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries_array', data_names=['u', 'v', 'w']) + eddy_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] + eddy_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries-array', data_names=['u', 'v', 'w']) - wind.datasource.create_scale( - resolution='30min', - extent=(df.index[0], df.index[-1]), - support=1.0, - scale_dimension='temporal' - ) + eddy_wind.datasource.create_scale(resolution='30min', extent=(df.index[0], df.index[-1]), support=1.0, scale_dimension='temporal') session.commit() # assert - assert wind.datasource.data_names == ['u', 'v', 'w'] + assert eddy_wind.datasource.data_names == ['u', 'v', 'w'] return True @@ -72,7 +69,8 @@ def add_eddy_data(session): """ Add the previously generated 3D windspeed data to the eddy entry. """ - wind.import_data(df) + eddy_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] + eddy_wind.import_data(df) return True @@ -81,9 +79,9 @@ def read_eddy_data(session): """ Read the 3D windspeed data and check column names. """ - eddy = api.find_entry(session, title='3-dimensional windspeed data') + eddy_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] - dat = eddy.get_data() + dat = eddy_wind.get_data() print(dat.columns) @@ -104,4 +102,5 @@ def test_array_type_data(): # run single tests assert add_eddy_entry(session) assert create_eddy_datasource(session) + assert add_eddy_data(session) assert read_eddy_data(session) From e889296e21697fd500423689da07051aac77a797 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Thu, 6 May 2021 15:44:48 +0200 Subject: [PATCH 11/30] timeseries_array UPLOAD working now --- metacatalog/data/datatypes.csv | 1 - metacatalog/ext/io/importer.py | 25 ++++++++++++------------ metacatalog/ext/io/interface.py | 2 +- metacatalog/test/test_array_type_data.py | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/metacatalog/data/datatypes.csv b/metacatalog/data/datatypes.csv index d7f93ca9..fddbf86d 100644 --- a/metacatalog/data/datatypes.csv +++ b/metacatalog/data/datatypes.csv @@ -13,4 +13,3 @@ id,parent_id,name,title,description 21,20,vdataframe,"named, indexed table","idataframe with additional name property of any valid metacatalog Variable." 22,16,time-dataframe,timeseries table,"NDArray indexed by datetime information. The datetimes need to be of increasing order." 
23,22,vtime-dataframe,named timeseries table,"Timeseries table that holds an additional Variable name to describe the content." -24,,timeseries-array,"timeseries,ARRAY data","sqlalchemy.dialects.postgresql.ARRAY data indexed by datetime information. The datetimes need to be of increasing order." diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index ffed049e..381345c8 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -28,30 +28,31 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** # reset the index imp = data.reset_index(level=0, inplace=False) - # set entry_id - if 'entry_id' not in imp.columns: - imp['entry_id'] = entry.id - # check if a session was passed if 'session' in kwargs.keys(): session = kwargs['session'] else: session = object_session(entry) + # get index, drop it afterwards + index = imp.tstamp + imp.drop('tstamp', axis=1, inplace=True) + # get the column names - exclude everthing that stores precisions data_columns = [col for col in imp.columns.tolist() if not col.startswith('precision')] # get the precision columns precision_columns = [col for col in imp.columns.tolist() if col.startswith('precision')] - # get index - index = imp.index + # set entry_id + if 'entry_id' not in imp.columns: + imp['entry_id'] = entry.id - # save column names in datasource.data_names (excluding ) - if len(columns) != len(entry.variable.column_names): + # save column names in datasource.data_names (excluding precision) + if len(data_columns) != len(entry.variable.column_names): force_data_names = True if force_data_names: - datasource.data_names = columns + datasource.data_names = data_columns else: datasource.data_names = entry.variable.column_names @@ -70,16 +71,16 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** # explicitly map the column types dtypes = { 'tstamp': sa.TIMESTAMP, - 'values': ARRAY(sa.REAL), + 'data': ARRAY(sa.REAL), 'precision': ARRAY(sa.REAL) } - imp_data = pd.DataFrame(data={'tstamp': index, 'values': values, 'precision': precision}) + imp_data = pd.DataFrame(data={'tstamp': index, 'data': values, 'precision': precision}) imp_data['entry_id'] = entry.id # else import if_exists = kwargs.get('if_exists', 'append') - imp.to_sql(tablename, session.bind, index=None, if_exists=if_exists) + imp_data.to_sql(tablename, session.bind, index=None, dtype=dtypes, if_exists=if_exists) def import_to_local_csv_file(entry, datasource, data, **kwargs): diff --git a/metacatalog/ext/io/interface.py b/metacatalog/ext/io/interface.py index fef8236e..d2151463 100644 --- a/metacatalog/ext/io/interface.py +++ b/metacatalog/ext/io/interface.py @@ -274,7 +274,7 @@ def load(): check_for_func = False if func is None: - raise IOOperationNotFoundError("No registered function for datatype '%s'\nOperation:[%s]->[%s]" % (name.lower(), operation.upper(), name)) + raise IOOperationNotFoundError("No registered function for datatype '%s'\nOperation:[%s]->[%s]" % (datatype, operation.upper(), name)) return func diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index b958e64e..8a01b390 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -13,7 +13,7 @@ v = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 w = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 -df = pd.DataFrame(data={"tstamp": tstamp, "u": u, "v": v, "w": w}) +df = pd.DataFrame(data={"tstamp": tstamp, "u_m": u, "v_m": v, 
"w_m": w}) df['tstamp'] = pd.to_datetime(df['tstamp'], format='%Y-%m-%d %H:%M:%S') df.set_index('tstamp', inplace=True) @@ -53,7 +53,7 @@ def create_eddy_datasource(session): Add a datasource to the eddy entry. """ eddy_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] - eddy_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries-array', data_names=['u', 'v', 'w']) + eddy_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries', data_names=['u', 'v', 'w']) eddy_wind.datasource.create_scale(resolution='30min', extent=(df.index[0], df.index[-1]), support=1.0, scale_dimension='temporal') From 884eb6e0d2c1d603ae2d305c556d5ed139c0bb8d Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Tue, 11 May 2021 11:37:56 +0200 Subject: [PATCH 12/30] 3D array test now running --- metacatalog/ext/io/reader.py | 35 +++++----- metacatalog/test/test_array_type_data.py | 89 +++++++++++++++++------- 2 files changed, 83 insertions(+), 41 deletions(-) diff --git a/metacatalog/ext/io/reader.py b/metacatalog/ext/io/reader.py index ff22d50c..8d882765 100644 --- a/metacatalog/ext/io/reader.py +++ b/metacatalog/ext/io/reader.py @@ -1,6 +1,7 @@ from datetime import datetime as dt import pandas as pd +import numpy as np from sqlalchemy.orm import object_session from sqlalchemy.dialects.postgresql import ARRAY @@ -25,43 +26,43 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): # infer table column names order col_sql = 'select * from %s limit 0' % tablename - col_names = list(pd.read_sql_query(col_sql, session.bind).columns.values) - col_names.remove('entry_id') + col_names_sql = list(pd.read_sql_query(col_sql, session.bind).columns.values) + col_names_sql.remove('entry_id') - if 'index' in col_names: - index_col = ['index'] - col_names.remove('index') - elif 'tstamp' in col_names: - index_col = ['tstamp'] - col_names.remove('tstamp') + if 'index' in col_names_sql: + index_col_sql = ['index'] + col_names_sql.remove('index') + elif 'tstamp' in col_names_sql: + index_col_sql = ['tstamp'] + col_names_sql.remove('tstamp') # load data - df = pd.read_sql(sql, session.bind, index_col=index_col, columns=col_names) + df_sql = pd.read_sql(sql, session.bind, index_col=index_col_sql, columns=col_names_sql) - # always use data_name from datasource as column names when exporting the data - col_names = datasource.data_name + # always use data_names from datasource as column names when exporting the data + col_names = datasource.data_names # if the column 'data' exists, the new routine is used - if 'data' in df.columns: + if 'data' in df_sql.columns: # unstack multi-dimensional data into the single columns - rawvalues = np.vstack(df['data'].values) + rawvalues = np.vstack(df_sql['data'].values) # unstack precision (precision1, precision2, ...) 
- rawprecision = np.vstack(df['precision'].values) + rawprecision = np.vstack(df_sql['precision'].values) # add precision column names to the col_names - for i in range(1, len(rawprecission[0])+1): + for i in range(1, len(rawprecision[0])+1): precision_col = 'precision%s' % i col_names.append(precision_col) # horizontally stack data and precission raw = np.hstack([rawvalues, rawprecision]) - df = pd.DataFrame(data=raw, columns=col_names) + df = pd.DataFrame(data=raw, columns=col_names, index=df_sql.index) # if 'data' does not appear in the column names, the old routine is used else: # map column names - df.columns = [datasource.data_name if _col== 'value' else _col for _col in df.columns] + df_sql.columns = [datasource.data_names if _col== 'value' else _col for _col in df.columns] return df diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index 8a01b390..cb14c79b 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -6,19 +6,19 @@ -### EDDY data chunk erstellen -# using 3D eddy wind speed data (u, v, w) +# using eddy wind speed data for the tests (u, v, w) tstamp = "2018-01-01 00:30:00", "2018-01-01 01:00:00", "2018-01-01 01:30:00", "2018-01-01 02:00:00", "2018-01-01 02:30:00", "2018-01-01 03:00:00", "2018-01-01 03:30:00", "2018-01-01 04:00:00", "2018-01-01 04:30:00", "2018-01-01 05:00:00" u = 1.123902, 0.214753, 0.446611, 0.962977, 2.915902, 4.048897, 5.368552, 6.046246, 5.405221, 4.172279 v = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 w = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 -df = pd.DataFrame(data={"tstamp": tstamp, "u_m": u, "v_m": v, "w_m": w}) -df['tstamp'] = pd.to_datetime(df['tstamp'], format='%Y-%m-%d %H:%M:%S') -df.set_index('tstamp', inplace=True) +df_3D_wind = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u, "v_ms": v, "w_ms": w}) # use different column names to test force_data_names=True +df_3D_wind['tstamp'] = pd.to_datetime(df_3D_wind['tstamp'], format='%Y-%m-%d %H:%M:%S') +df_3D_wind.set_index('tstamp', inplace=True) -def add_eddy_entry(session): + +def add_3D_entry(session): """ Add an entry for the eddy wind data. """ @@ -32,7 +32,7 @@ def add_eddy_entry(session): ) # add the entry - eddy_wind = api.add_entry(session, title='3-dimensional windspeed data', + entry_3D_wind = api.add_entry(session, title='3-dimensional windspeed data', abstract='3-dimensional windspeed data from the Fendt data set', location=(8, 52), variable=var_3D_wind.id, @@ -48,47 +48,87 @@ def add_eddy_entry(session): return True -def create_eddy_datasource(session): +def create_3D_datasource(session): """ Add a datasource to the eddy entry. 
""" - eddy_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] - eddy_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries', data_names=['u', 'v', 'w']) + entry_3D_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] + entry_3D_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries', data_names=['u', 'v', 'w']) - eddy_wind.datasource.create_scale(resolution='30min', extent=(df.index[0], df.index[-1]), support=1.0, scale_dimension='temporal') + entry_3D_wind.datasource.create_scale(resolution='30min', extent=(df_3D_wind.index[0], df_3D_wind.index[-1]), support=1.0, scale_dimension='temporal') session.commit() # assert - assert eddy_wind.datasource.data_names == ['u', 'v', 'w'] + assert entry_3D_wind.datasource.data_names == ['u', 'v', 'w'] return True -def add_eddy_data(session): +def add_3D_data(session): """ - Add the previously generated 3D windspeed data to the eddy entry. + Add Eddy 3D windspeed data to the eddy entry. """ - eddy_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] - eddy_wind.import_data(df) + entry_3D_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] + entry_3D_wind.import_data(df_3D_wind) return True -def read_eddy_data(session): +def read_3D_data(session): """ Read the 3D windspeed data and check column names. """ - eddy_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] + entry_3D_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] + + dat = entry_3D_wind.get_data() + + print(dat['u'].mean()) + + # assert + assert dat.columns[1] == 'v' + assert dat.columns.tolist() == ['u', 'v', 'w'] # at the moment, no precision columns will be returned when there is no data, is this the wanted behaviour? + assert dat.index[2] == pd.to_datetime("2018-01-01 01:30:00", format='%Y-%m-%d %H:%M:%S') + assert dat['u'].mean() == 3.070534 + + return True + + +def one_dim_data(session): + """ + Do the same as above, but with one-dimensional data instead. 
+ """ + # generate data + df_1D_wind = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u}) + df_1D['tstamp'] = pd.to_datetime(df_1D['tstamp'], format='%Y-%m-%d %H:%M:%S') + df_1D.set_index('tstamp', inplace=True) + + # add the variable + var_1D_wind = api.add_variable(session, name='1D-wind', symbol='u', column_names=['u'], unit=107) + + # add the entry + entry_1D_wind = api.add_entry(session, title='1-dimensional windspeed data',abstract='1-dimensional windspeed data from the Fendt data set', + location=(8, 52), + variable=var_1D_wind.id, + license=6, + author=kit.id, + embargo=False, + is_partial=False) - dat = eddy_wind.get_data() + # create datasource and scale + entry_1D_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries', data_names=['u']) - print(dat.columns) + entry_1D_wind.datasource.create_scale(resolution='30min', extent=(df_1D.index[0], df_1D.index[-1]), support=1.0, scale_dimension='temporal') return True +#def force_data_names_true(session): +# return True + +#def test_old_timeseries(session): +# return True @pytest.mark.depends(on=['db_init'], name='array_type_data') def test_array_type_data(): @@ -100,7 +140,8 @@ def test_array_type_data(): session = connect(mode='session') # run single tests - assert add_eddy_entry(session) - assert create_eddy_datasource(session) - assert add_eddy_data(session) - assert read_eddy_data(session) + assert add_3D_entry(session) + assert create_3D_datasource(session) + assert add_3D_data(session) + assert read_3D_data(session) + #assert one_dim_data(session) From 1ce2dea81c6e6384981bffaee21f9b96ac398459 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Tue, 11 May 2021 12:10:45 +0200 Subject: [PATCH 13/30] 1D array test now running --- metacatalog/test/test_array_type_data.py | 74 +++++++++++++++++++++--- 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index cb14c79b..bde9a877 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -83,8 +83,6 @@ def read_3D_data(session): dat = entry_3D_wind.get_data() - print(dat['u'].mean()) - # assert assert dat.columns[1] == 'v' assert dat.columns.tolist() == ['u', 'v', 'w'] # at the moment, no precision columns will be returned when there is no data, is this the wanted behaviour? 
@@ -100,12 +98,15 @@ def one_dim_data(session): """ # generate data df_1D_wind = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u}) - df_1D['tstamp'] = pd.to_datetime(df_1D['tstamp'], format='%Y-%m-%d %H:%M:%S') - df_1D.set_index('tstamp', inplace=True) + df_1D_wind['tstamp'] = pd.to_datetime(df_1D_wind['tstamp'], format='%Y-%m-%d %H:%M:%S') + df_1D_wind.set_index('tstamp', inplace=True) # add the variable var_1D_wind = api.add_variable(session, name='1D-wind', symbol='u', column_names=['u'], unit=107) + # find the previously added author + kit = api.find_person(session, organisation_abbrev='KIT')[0] + # add the entry entry_1D_wind = api.add_entry(session, title='1-dimensional windspeed data',abstract='1-dimensional windspeed data from the Fendt data set', location=(8, 52), @@ -118,13 +119,69 @@ def one_dim_data(session): # create datasource and scale entry_1D_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries', data_names=['u']) - entry_1D_wind.datasource.create_scale(resolution='30min', extent=(df_1D.index[0], df_1D.index[-1]), support=1.0, scale_dimension='temporal') + entry_1D_wind.datasource.create_scale(resolution='30min', extent=(df_1D_wind.index[0], df_1D_wind.index[-1]), support=1.0, scale_dimension='temporal') + + # add data + entry_1D_wind.import_data(df_1D_wind) + + # read data + dat = entry_1D_wind.get_data() + + # assert + assert dat.columns == 'u' + assert dat['u'].mean() == 3.070534 return True -#def force_data_names_true(session): -# return True +def force_data_names_true(session): + """ + Test force_data_names=True when loading the data into the database. + In this case, datasource.data_names will be overwritten with the column + names of the imported data, when exporting the data, these column col_names + will be displayed. + We use the 3D eddy wind data for this again. + """ + # find the variable + var_3D_wind = api.find_variable(session, name='3D-wind')[0] + + # find the previously added person + kit = api.find_person(session, organisation_abbrev='KIT')[0] + + # find the previously added author + kit = api.find_person(session, organisation_abbrev='KIT')[0] + + # add the entry + entry_3D_force_data_names = api.add_entry(session, title='3-dimensional windspeed data, force_data_names', + abstract='3-dimensional windspeed data from the Fendt data set', + location=(8, 52), + variable=var_3D_wind.id, + comment='after double rotation', + license=6, + author=kit.id, + embargo=False, + is_partial=False) + + # create datasource and scale + entry_3D_force_data_names.create_datasource(type=1, path='timeseries_array', datatype='timeseries', data_names=['u', 'v', 'w']) + + entry_3D_force_data_names.datasource.create_scale(resolution='30min', extent=(df_3D_wind.index[0], df_3D_wind.index[-1]), support=1.0, scale_dimension='temporal') + + # add data + entry_3D_force_data_names.import_data(df_3D_wind, force_data_names=True) + + #load data + dat = entry_3D_force_data_names.get_data() + + assert dat.columns.tolist() == ['u_ms', 'v_ms', 'w_ms'] + assert dat['u_ms'].mean() == 3.070534 + + return True + + +# TEST len(data_columns) != len(entry.variable.column_names) + + #### a datasource must always be created first, datasource.data_names is not nullable -> WHEN would we use variable.column_names?? 
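# Hypothetical sketch of that case (not implemented in this test file): when the
# imported DataFrame has a different number of columns than
# entry.variable.column_names, the importer switches to force_data_names behaviour
# and stores the DataFrame's own column names, e.g.
#
#     df_extra = df_3D_wind.copy()
#     df_extra['T_degC'] = 21.0            # unexpected fourth column
#     entry_3D_wind.import_data(df_extra)
#     # -> entry_3D_wind.datasource.data_names == ['u_ms', 'v_ms', 'w_ms', 'T_degC']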
#def test_old_timeseries(session): @@ -144,4 +201,5 @@ def test_array_type_data(): assert create_3D_datasource(session) assert add_3D_data(session) assert read_3D_data(session) - #assert one_dim_data(session) + assert one_dim_data(session) + assert force_data_names_true(session) From 190c88bbfe8d51bd2105f06396e266cbdd09178a Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Tue, 11 May 2021 12:19:51 +0200 Subject: [PATCH 14/30] force_data_names test running --- metacatalog/test/test_array_type_data.py | 50 ++++++++++++------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index bde9a877..4bf311aa 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -6,18 +6,6 @@ -# using eddy wind speed data for the tests (u, v, w) -tstamp = "2018-01-01 00:30:00", "2018-01-01 01:00:00", "2018-01-01 01:30:00", "2018-01-01 02:00:00", "2018-01-01 02:30:00", "2018-01-01 03:00:00", "2018-01-01 03:30:00", "2018-01-01 04:00:00", "2018-01-01 04:30:00", "2018-01-01 05:00:00" -u = 1.123902, 0.214753, 0.446611, 0.962977, 2.915902, 4.048897, 5.368552, 6.046246, 5.405221, 4.172279 -v = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 -w = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 - -df_3D_wind = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u, "v_ms": v, "w_ms": w}) # use different column names to test force_data_names=True -df_3D_wind['tstamp'] = pd.to_datetime(df_3D_wind['tstamp'], format='%Y-%m-%d %H:%M:%S') -df_3D_wind.set_index('tstamp', inplace=True) - - - def add_3D_entry(session): """ Add an entry for the eddy wind data. @@ -48,7 +36,7 @@ def add_3D_entry(session): return True -def create_3D_datasource(session): +def create_3D_datasource(session, df_3D_wind): """ Add a datasource to the eddy entry. """ @@ -65,7 +53,7 @@ def create_3D_datasource(session): return True -def add_3D_data(session): +def add_3D_data(session, df_3D_wind): """ Add Eddy 3D windspeed data to the eddy entry. """ @@ -92,15 +80,10 @@ def read_3D_data(session): return True -def one_dim_data(session): +def one_dim_data(session, df_1D_wind): """ Do the same as above, but with one-dimensional data instead. """ - # generate data - df_1D_wind = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u}) - df_1D_wind['tstamp'] = pd.to_datetime(df_1D_wind['tstamp'], format='%Y-%m-%d %H:%M:%S') - df_1D_wind.set_index('tstamp', inplace=True) - # add the variable var_1D_wind = api.add_variable(session, name='1D-wind', symbol='u', column_names=['u'], unit=107) @@ -134,7 +117,7 @@ def one_dim_data(session): return True -def force_data_names_true(session): +def force_data_names_true(session, df_3D_wind): """ Test force_data_names=True when loading the data into the database. 
In this case, datasource.data_names will be overwritten with the column @@ -196,10 +179,27 @@ def test_array_type_data(): # get a session session = connect(mode='session') + # using eddy wind speed data for the tests (u, v, w) + tstamp = "2018-01-01 00:30:00", "2018-01-01 01:00:00", "2018-01-01 01:30:00", "2018-01-01 02:00:00", "2018-01-01 02:30:00", "2018-01-01 03:00:00", "2018-01-01 03:30:00", "2018-01-01 04:00:00", "2018-01-01 04:30:00", "2018-01-01 05:00:00" + u = 1.123902, 0.214753, 0.446611, 0.962977, 2.915902, 4.048897, 5.368552, 6.046246, 5.405221, 4.172279 + v = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 + w = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 + + # generate 3D data + df_3D_wind = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u, "v_ms": v, "w_ms": w}) # use different column names to test force_data_names=True + df_3D_wind['tstamp'] = pd.to_datetime(df_3D_wind['tstamp'], format='%Y-%m-%d %H:%M:%S') + df_3D_wind.set_index('tstamp', inplace=True) + + # generate 1D data + df_1D_wind = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u}) + df_1D_wind['tstamp'] = pd.to_datetime(df_1D_wind['tstamp'], format='%Y-%m-%d %H:%M:%S') + df_1D_wind.set_index('tstamp', inplace=True) + + # run single tests assert add_3D_entry(session) - assert create_3D_datasource(session) - assert add_3D_data(session) + assert create_3D_datasource(session, df_3D_wind) + assert add_3D_data(session, df_3D_wind) assert read_3D_data(session) - assert one_dim_data(session) - assert force_data_names_true(session) + assert one_dim_data(session, df_1D_wind) + assert force_data_names_true(session, df_3D_wind) From 458fc3620e12c8827c336869baf85830b4c903a3 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Tue, 11 May 2021 13:29:10 +0200 Subject: [PATCH 15/30] importer.py now working for the (old) test_models_data.py --- metacatalog/ext/io/importer.py | 58 +++++++++++++++++++++------- metacatalog/test/test_models_data.py | 6 +-- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index 381345c8..75a2ed3b 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -35,8 +35,12 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** session = object_session(entry) # get index, drop it afterwards - index = imp.tstamp - imp.drop('tstamp', axis=1, inplace=True) + if 'index' in imp.columns: + index = imp.index + imp.drop('index', axis=1, inplace=True) + elif 'tstamp' in imp.columns: + index = imp.tstamp + imp.drop('tstamp', axis=1, inplace=True) # get the column names - exclude everthing that stores precisions data_columns = [col for col in imp.columns.tolist() if not col.startswith('precision')] @@ -68,19 +72,45 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** values = [row for row in imp[data_columns].values] precision = [row for row in imp[precision_columns].values] - # explicitly map the column types - dtypes = { - 'tstamp': sa.TIMESTAMP, - 'data': ARRAY(sa.REAL), - 'precision': ARRAY(sa.REAL) - } - - imp_data = pd.DataFrame(data={'tstamp': index, 'data': values, 'precision': precision}) - imp_data['entry_id'] = entry.id + # make importer.py compatible with the (old) 1D timeseries table + if tablename == 'timeseries': + # explicitly map the column types + dtypes = { + 'tstamp': sa.TIMESTAMP, + 'value': sa.NUMERIC, + 'precision': sa.NUMERIC + } + + # convert 1D np.ndarray data and precision to type float + values = [number for array in values for number 
in array] + precision = [number for array in precision for number in array] + + # the list comprehension above creates an empty list if precision is empty: + if not precision: + imp_data = pd.DataFrame(data={'tstamp': index, 'value': values}) + imp_data['entry_id'] = entry.id + else: + imp_data = pd.DataFrame(data={'tstamp': index, 'value': values, 'precision': precision}) + imp_data['entry_id'] = entry.id - # else import - if_exists = kwargs.get('if_exists', 'append') - imp_data.to_sql(tablename, session.bind, index=None, dtype=dtypes, if_exists=if_exists) + # else import + if_exists = kwargs.get('if_exists', 'append') + imp_data.to_sql(tablename, session.bind, index=None, dtype=dtypes, if_exists=if_exists) + else: + # else: the (new) timeseries_array is used + # explicitly map the column types + dtypes = { + 'tstamp': sa.TIMESTAMP, + 'data': ARRAY(sa.REAL), + 'precision': ARRAY(sa.REAL) + } + + imp_data = pd.DataFrame(data={'tstamp': index, 'data': values, 'precision': precision}) + imp_data['entry_id'] = entry.id + + # else import + if_exists = kwargs.get('if_exists', 'append') + imp_data.to_sql(tablename, session.bind, index=None, dtype=dtypes, if_exists=if_exists) def import_to_local_csv_file(entry, datasource, data, **kwargs): diff --git a/metacatalog/test/test_models_data.py b/metacatalog/test/test_models_data.py index fa4dbb42..571c4737 100644 --- a/metacatalog/test/test_models_data.py +++ b/metacatalog/test/test_models_data.py @@ -1,7 +1,7 @@ """ This e2e Test needs the add-find API tests to be finished. -It will use the Entries created in that test to create +It will use the Entries created in that test to create some data samples and upload them to the database. """ @@ -18,7 +18,7 @@ def create_datasource(session, entry: models.Entry, data): # create the datasource - datasource = entry.create_datasource('timeseries', 'internal', 'timeseries', commit=True) + datasource = entry.create_datasource('timeseries', 'internal', 'timeseries', data_names=['data_name'], commit=True) assert datasource is not None # check @@ -102,4 +102,4 @@ def test_data_crud_operations(): read_data(session, entry, timeseries) assert append_data(session, entry, new_chunk) read_data(session, entry, all_data) - assert delete_data(session, entry) \ No newline at end of file + assert delete_data(session, entry) From c13ce39aa42e5bc51fde7c1949ea17e57f114301 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Mon, 17 May 2021 14:03:43 +0200 Subject: [PATCH 16/30] current unittests working --- metacatalog/ext/io/importer.py | 2 +- metacatalog/ext/io/reader.py | 11 +++- metacatalog/models/datasource.py | 2 +- metacatalog/models/entry.py | 4 +- metacatalog/test/test_array_type_data.py | 70 +++++++++++++++++++----- metacatalog/test/test_models_data.py | 4 +- 6 files changed, 71 insertions(+), 22 deletions(-) diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index 75a2ed3b..f8ff12f6 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -42,7 +42,7 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** index = imp.tstamp imp.drop('tstamp', axis=1, inplace=True) - # get the column names - exclude everthing that stores precisions + # get the column names - exclude everthing that stores precision data_columns = [col for col in imp.columns.tolist() if not col.startswith('precision')] # get the precision columns diff --git a/metacatalog/ext/io/reader.py b/metacatalog/ext/io/reader.py index 8d882765..6e31261d 100644 --- 
a/metacatalog/ext/io/reader.py +++ b/metacatalog/ext/io/reader.py @@ -59,10 +59,15 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): raw = np.hstack([rawvalues, rawprecision]) df = pd.DataFrame(data=raw, columns=col_names, index=df_sql.index) - # if 'data' does not appear in the column names, the old routine is used - else: + elif 'value' in df_sql.columns: + # if 'data' does not appear in the column names, the old routine is used + df = df_sql.copy() + df.drop(['entry_id'], axis=1, inplace=True) + # map column names - df_sql.columns = [datasource.data_names if _col== 'value' else _col for _col in df.columns] + df.columns = [datasource.data_names[0] if _col== 'value' else _col for _col in df.columns] + else: + print('Currently only "timeseries" and "timeseries_array" are supported.') return df diff --git a/metacatalog/models/datasource.py b/metacatalog/models/datasource.py index ad5737cc..ad54aa5c 100644 --- a/metacatalog/models/datasource.py +++ b/metacatalog/models/datasource.py @@ -465,7 +465,7 @@ class DataSource(Base): datatype_id = Column(Integer, ForeignKey('datatypes.id'), nullable=False) encoding = Column(String(64), default='utf-8') path = Column(String, nullable=False) - data_names = Column(ARRAY(String(128)), nullable=False) + data_names = Column(ARRAY(String(128)), nullable=True) args = Column(String) # scales diff --git a/metacatalog/models/entry.py b/metacatalog/models/entry.py index 066424e5..23aaf516 100644 --- a/metacatalog/models/entry.py +++ b/metacatalog/models/entry.py @@ -596,7 +596,7 @@ def neighbors(self, distance, unit='meter', buffer_epsg=3857, as_sql=False, **kw else: return filter_query.all() - def create_datasource(self, path: str, type, datatype, data_names, commit=False, **args): + def create_datasource(self, path: str, type, datatype, commit=False, **args): """ """ # @@ -618,7 +618,7 @@ def create_datasource(self, path: str, type, datatype, data_names, commit=False, dtype = session.query(models.DataType).filter(models.DataType.name==datatype).one() # build the datasource object - ds = models.DataSource(type=ds_type, datatype=dtype, path=path, data_names=data_names) + ds = models.DataSource(type=ds_type, datatype=dtype, path=path) # add the args ds.save_args_from_dict(args) diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index 4bf311aa..d2510555 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -1,6 +1,7 @@ import pytest import pandas as pd +import numpy as np from metacatalog import api from ._util import connect @@ -41,14 +42,14 @@ def create_3D_datasource(session, df_3D_wind): Add a datasource to the eddy entry. 
""" entry_3D_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] - entry_3D_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries', data_names=['u', 'v', 'w']) + entry_3D_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries') entry_3D_wind.datasource.create_scale(resolution='30min', extent=(df_3D_wind.index[0], df_3D_wind.index[-1]), support=1.0, scale_dimension='temporal') session.commit() # assert - assert entry_3D_wind.datasource.data_names == ['u', 'v', 'w'] + assert entry_3D_wind.variable.column_names == ['u', 'v', 'w'] return True @@ -100,7 +101,7 @@ def one_dim_data(session, df_1D_wind): is_partial=False) # create datasource and scale - entry_1D_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries', data_names=['u']) + entry_1D_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries') entry_1D_wind.datasource.create_scale(resolution='30min', extent=(df_1D_wind.index[0], df_1D_wind.index[-1]), support=1.0, scale_dimension='temporal') @@ -128,9 +129,6 @@ def force_data_names_true(session, df_3D_wind): # find the variable var_3D_wind = api.find_variable(session, name='3D-wind')[0] - # find the previously added person - kit = api.find_person(session, organisation_abbrev='KIT')[0] - # find the previously added author kit = api.find_person(session, organisation_abbrev='KIT')[0] @@ -146,7 +144,7 @@ def force_data_names_true(session, df_3D_wind): is_partial=False) # create datasource and scale - entry_3D_force_data_names.create_datasource(type=1, path='timeseries_array', datatype='timeseries', data_names=['u', 'v', 'w']) + entry_3D_force_data_names.create_datasource(type=1, path='timeseries_array', datatype='timeseries') entry_3D_force_data_names.datasource.create_scale(resolution='30min', extent=(df_3D_wind.index[0], df_3D_wind.index[-1]), support=1.0, scale_dimension='temporal') @@ -156,25 +154,63 @@ def force_data_names_true(session, df_3D_wind): #load data dat = entry_3D_force_data_names.get_data() + # assert assert dat.columns.tolist() == ['u_ms', 'v_ms', 'w_ms'] assert dat['u_ms'].mean() == 3.070534 return True +def precision_test(session, df_3D_prec): + """ + Test if precision columns are handled correctly. + We use the 3D eddy wind data with 3 precision columns for this. + """ + # find the variable + var_3D_wind = api.find_variable(session, name='3D-wind')[0] + + # find the previously added person + kit = api.find_person(session, organisation_abbrev='KIT')[0] -# TEST len(data_columns) != len(entry.variable.column_names) + # add the entry + entry_3D_precision = api.add_entry(session, title='3-dimensional windspeed data, precision', + abstract='3-dimensional windspeed data from the Fendt data set', + location=(8, 52), + variable=var_3D_wind.id, + comment='after double rotation', + license=6, + author=kit.id, + embargo=False, + is_partial=False) - #### a datasource must always be created first, datasource.data_names is not nullable -> WHEN would we use variable.column_names?? 
+ # create datasource and scale + entry_3D_precision.create_datasource(type=1, path='timeseries_array', datatype='timeseries') + entry_3D_precision.datasource.create_scale(resolution='30min', extent=(df_3D_prec.index[0], df_3D_prec.index[-1]), support=1.0, scale_dimension='temporal') + + # add data + entry_3D_precision.import_data(df_3D_prec, force_data_names=False) + + #load data + dat = entry_3D_precision.get_data() + + # assert + assert dat.columns.tolist() == ['u', 'v', 'w', 'precision1', 'precision2', 'precision3'] # note: input was 'precision_1' + assert dat['u'].mean() == 3.070534 + + return True + +def auto_force_data_names(session): + """ + If + """ +# TEST len(data_columns) != len(entry.variable.column_names) -#def test_old_timeseries(session): -# return True @pytest.mark.depends(on=['db_init'], name='array_type_data') def test_array_type_data(): """ - A simple workflow of 3 persons who contributed to two entries. - The content of some related content is tested randomly + Test if timeseries_array works correctly. + Backward compatibility with the old timeseries path is tested in test_models_data.py """ # get a session session = connect(mode='session') @@ -184,6 +220,9 @@ def test_array_type_data(): u = 1.123902, 0.214753, 0.446611, 0.962977, 2.915902, 4.048897, 5.368552, 6.046246, 5.405221, 4.172279 v = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 w = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 , 0.0 + prec1 = np.random.rand(10) + prec2 = np.random.rand(10) + prec3 = np.random.rand(10) # generate 3D data df_3D_wind = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u, "v_ms": v, "w_ms": w}) # use different column names to test force_data_names=True @@ -195,6 +234,10 @@ def test_array_type_data(): df_1D_wind['tstamp'] = pd.to_datetime(df_1D_wind['tstamp'], format='%Y-%m-%d %H:%M:%S') df_1D_wind.set_index('tstamp', inplace=True) + # generate 3D data with random 3D precision + df_3D_prec = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u, "v_ms": v, "w_ms": w, "precision_1": prec1, "precision_2": prec2, "precision_3": prec3}) + df_3D_prec['tstamp'] = pd.to_datetime(df_3D_prec['tstamp'], format='%Y-%m-%d %H:%M:%S') + df_3D_prec.set_index('tstamp', inplace=True) # run single tests assert add_3D_entry(session) @@ -203,3 +246,4 @@ def test_array_type_data(): assert read_3D_data(session) assert one_dim_data(session, df_1D_wind) assert force_data_names_true(session, df_3D_wind) + #assert precision_test(session, df_3D_prec) diff --git a/metacatalog/test/test_models_data.py b/metacatalog/test/test_models_data.py index 571c4737..8a40d24b 100644 --- a/metacatalog/test/test_models_data.py +++ b/metacatalog/test/test_models_data.py @@ -18,7 +18,7 @@ def create_datasource(session, entry: models.Entry, data): # create the datasource - datasource = entry.create_datasource('timeseries', 'internal', 'timeseries', data_names=['data_name'], commit=True) + datasource = entry.create_datasource('timeseries', 'internal', 'timeseries', commit=True) assert datasource is not None # check @@ -50,7 +50,7 @@ def read_data(session, entry, data): db_data = entry.get_data() return assert_array_almost_equal( - getattr(db_data, entry.variable.name).values, + getattr(db_data, entry.variable.column_names[0]).values, data.value.values, decimal=3 ) From d779efc070b630b56ca51f8b91bfa8c1846b69a2 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Mon, 17 May 2021 16:28:36 +0200 Subject: [PATCH 17/30] assert update --- metacatalog/test/test_array_type_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index d2510555..a3b7469b 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -76,7 +76,7 @@ def read_3D_data(session): assert dat.columns[1] == 'v' assert dat.columns.tolist() == ['u', 'v', 'w'] # at the moment, no precision columns will be returned when there is no data, is this the wanted behaviour? assert dat.index[2] == pd.to_datetime("2018-01-01 01:30:00", format='%Y-%m-%d %H:%M:%S') - assert dat['u'].mean() == 3.070534 + assert dat['u'].mean() == pytest.approx(3.1, 0.05) return True @@ -113,7 +113,7 @@ def one_dim_data(session, df_1D_wind): # assert assert dat.columns == 'u' - assert dat['u'].mean() == 3.070534 + assert dat['u'].mean() == pytest.approx(3.1, 0.05) return True @@ -156,7 +156,7 @@ def force_data_names_true(session, df_3D_wind): # assert assert dat.columns.tolist() == ['u_ms', 'v_ms', 'w_ms'] - assert dat['u_ms'].mean() == 3.070534 + assert dat['u_ms'].mean() == pytest.approx(3.1, 0.05) return True @@ -195,7 +195,7 @@ def precision_test(session, df_3D_prec): # assert assert dat.columns.tolist() == ['u', 'v', 'w', 'precision1', 'precision2', 'precision3'] # note: input was 'precision_1' - assert dat['u'].mean() == 3.070534 + assert dat['u'].mean() == pytest.approx(3.1, 0.05) return True From c3761abbd95b9ee9e3c229065553ec842326f301 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 20 May 2021 10:54:11 +0200 Subject: [PATCH 18/30] adding two more missing varaibles --- metacatalog/data/variables.csv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metacatalog/data/variables.csv b/metacatalog/data/variables.csv index 28662e0e..244827a2 100644 --- a/metacatalog/data/variables.csv +++ b/metacatalog/data/variables.csv @@ -17,3 +17,5 @@ id,name,symbol,column_names,unit_id,keyword_id 16,bulk electrical conductivity,bEC,bulk_electrical_conductivity,25,5111 17,specific electrical conductivity,sEC,specific_electrical_conductivity,25,5111 18,river water level,L,river_water_level,2, +19,evapotranspiration,ET,evapotranspiration,6319 +20,drainage,D,drainage,7328 \ No newline at end of file From b73aaa3d9b6b17ce8d8907cdbc5f77a96c002187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 20 May 2021 10:59:52 +0200 Subject: [PATCH 19/30] add unit to new variables --- metacatalog/data/variables.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metacatalog/data/variables.csv b/metacatalog/data/variables.csv index 244827a2..2fe01f4b 100644 --- a/metacatalog/data/variables.csv +++ b/metacatalog/data/variables.csv @@ -17,5 +17,5 @@ id,name,symbol,column_names,unit_id,keyword_id 16,bulk electrical conductivity,bEC,bulk_electrical_conductivity,25,5111 17,specific electrical conductivity,sEC,specific_electrical_conductivity,25,5111 18,river water level,L,river_water_level,2, -19,evapotranspiration,ET,evapotranspiration,6319 -20,drainage,D,drainage,7328 \ No newline at end of file +19,evapotranspiration,ET,evapotranspiration,103,6319 +20,drainage,D,drainage,103,7328 \ No newline at end of file From cc351900621ccd06d81050c96c4b425d1708d495 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Thu, 20 May 2021 12:06:07 +0200 Subject: [PATCH 20/30] documentation: version 0.3.0 variable.column_names: symbol or name? 
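
(Usage sketch, not part of the diff that follows.) As the docstrings below describe it, `variable.column_names` supplies the default export names for a dataset, while `datasource.data_names` can override them per datasource. Registering a multi-dimensional variable could then look roughly like this; the session helper, the symbol and the unit id are illustrative assumptions, not values taken from this commit:

    from metacatalog import api

    # assumption: a default database connection is configured
    session = api.connect_database()

    # one export column name per data dimension
    wind = api.add_variable(
        session,
        name='3D-wind',
        symbol='uvw',                  # placeholder symbol
        column_names=['u', 'v', 'w'],
        unit=107                       # placeholder unit id
    )
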
--- metacatalog/api/add.py | 2 +- metacatalog/data/variables.csv | 36 +++++++++++------------ metacatalog/models/datasource.py | 2 +- metacatalog/models/details.py | 50 ++++++++++++++++---------------- metacatalog/models/variable.py | 2 +- 5 files changed, 46 insertions(+), 46 deletions(-) diff --git a/metacatalog/api/add.py b/metacatalog/api/add.py index f7104095..2654f1e3 100644 --- a/metacatalog/api/add.py +++ b/metacatalog/api/add.py @@ -131,7 +131,7 @@ def add_variable(session, name, symbol, column_names, unit): The variable symbol. Try to use the correct physical variable symbols and avoid dublicates. column_names : list - .. versionadded:: 0.2.12 + .. versionadded:: 0.3.0 List of default column names that will be displayed when exporting the data. The columns are named in the same order as they appear in the list. unit : int, str diff --git a/metacatalog/data/variables.csv b/metacatalog/data/variables.csv index 28662e0e..a1e14b2a 100644 --- a/metacatalog/data/variables.csv +++ b/metacatalog/data/variables.csv @@ -1,19 +1,19 @@ id,name,symbol,column_names,unit_id,keyword_id -1,air temperature,Ta,air_temperature,101,111 -2,soil temperature,Ts,soil_temperature,101,5736 -3,water temperature,Tw,water_temperature,101,7402 -4,discharge,Q,discharge,108,7327 -5,air pressure,p,air_pressure,104,109 -6,relative humidity,RH,relative_humidity,112,6308 -7,daily rainfall sum,P,daily_rainfall_sum,103,6434 -8,rainfall intensity,Pi,rainfall_intensity,105,6436 -9,solar irradiance,SI,solar_irradiance,115,5236 -10,net radiation,Rn,net_radiation,115,5227 -11,gravimetric water content,u,gravimetric_water_content,114,5727 -12,volumetric water content,theta,volumetric_water_content,113,5727 -13,precision,sigma,precision,21, -14,sap flow,Fm,sap_flow,22,7424 -15,matric potential,phi,matric_potential,24, -16,bulk electrical conductivity,bEC,bulk_electrical_conductivity,25,5111 -17,specific electrical conductivity,sEC,specific_electrical_conductivity,25,5111 -18,river water level,L,river_water_level,2, +1,air temperature,Ta,Ta,101,111 +2,soil temperature,Ts,Ts,101,5736 +3,water temperature,Tw,Tw,101,7402 +4,discharge,Q,Q,108,7327 +5,air pressure,p,p,104,109 +6,relative humidity,RH,RH,112,6308 +7,daily rainfall sum,P,P,103,6434 +8,rainfall intensity,Pi,Pi,105,6436 +9,solar irradiance,SI,SI,115,5236 +10,net radiation,Rn,Rn,115,5227 +11,gravimetric water content,u,u,114,5727 +12,volumetric water content,theta,theta,113,5727 +13,precision,sigma,sigma,21, +14,sap flow,Fm,Fm,22,7424 +15,matric potential,phi,phi,24, +16,bulk electrical conductivity,bEC,bEC,25,5111 +17,specific electrical conductivity,sEC,sEC,25,5111 +18,river water level,L,L,2, diff --git a/metacatalog/models/datasource.py b/metacatalog/models/datasource.py index ad54aa5c..bf966aec 100644 --- a/metacatalog/models/datasource.py +++ b/metacatalog/models/datasource.py @@ -443,7 +443,7 @@ class DataSource(Base): The referenced :class:`DataSourceType`. Can be used instead of setting ``type_id``. data_names : list - .. versionadded:: 0.2.12 + .. versionadded:: 0.3.0 List of column names that will be displayed when exporting the data. The columns are named in the same order as they appear in the list. diff --git a/metacatalog/models/details.py b/metacatalog/models/details.py index 9729b897..46c2e20b 100644 --- a/metacatalog/models/details.py +++ b/metacatalog/models/details.py @@ -13,55 +13,55 @@ class Detail(Base): """Entry detail - `Detail` data are optional key-value pairs that can be linked to - `metacatalo.models.Entry` records by `1:n` relationships. 
This is - vital metadata information that is specific to the `Entry` itself. - E.g. specific to the sensor or variable, but cannot be generalized - to all kinds of `Entry` types. - - Details can be loaded as a python dict or be converted to - text-based tables. A HTML or markdown table can e.g. be appended + `Detail` data are optional key-value pairs that can be linked to + `metacatalo.models.Entry` records by `1:n` relationships. This is + vital metadata information that is specific to the `Entry` itself. + E.g. specific to the sensor or variable, but cannot be generalized + to all kinds of `Entry` types. + + Details can be loaded as a python dict or be converted to + text-based tables. A HTML or markdown table can e.g. be appended to the `Entry.abstract` on export. - Since version 0.1.13, it is possible to link an existing + Since version 0.1.13, it is possible to link an existing :class:`Thesaurus ` to the detail. - This makes the export to ISO 19115 in princile possible as an + This makes the export to ISO 19115 in princile possible as an ``MD_MetadataExtensionInformation`` object. Attributes ---------- id : int - Primary Key. Identifies the record. If left empty the + Primary Key. Identifies the record. If left empty the Database will assign one. entry_id : int - Foreign Key. Identifies the `metacatalog.models.Entry` + Foreign Key. Identifies the `metacatalog.models.Entry` which is decribed by this detail. key : str - The key of the key vaule par. Maximum 20 letters, + The key of the key vaule par. Maximum 20 letters, ideally no whitespaces. stem : str - Stemmed key using a `nltk.PorterStemmer`. The stemmed + Stemmed key using a `nltk.PorterStemmer`. The stemmed key can be used to search for related keys - value : str, dict - .. versionchanged:: 0.2.12 + value : str, list + .. versionchanged:: 0.3.0 The actual value of this detail. This can be a string or a flat dictionary. description : str - Description what the key means in the context of the - :class:`Entry ` or + Description what the key means in the context of the + :class:`Entry ` or :class:`EntryGroup `. Optional, can be omitted, if not applicable. thesaurus : metacatalog.models.Thesaurus .. versionadded:: 0.1.13 Optional. If the detail :attr:`key` is described in a thesaurus or - controlled dictionary list, you can link the thesaurus - to the detail. Details with thesaurus information are - in principle exportable to ISO 19115 using an + controlled dictionary list, you can link the thesaurus + to the detail. Details with thesaurus information are + in principle exportable to ISO 19115 using an ``MD_MetadataExtensionInformation``. thesaurus_id : int .. versionadded:: 0.1.13 - Foreign key of the linked - :class:`Thesaurus `. + Foreign key of the linked + :class:`Thesaurus `. """ __tablename__ = 'details' @@ -87,7 +87,7 @@ def __init__(self, **kwargs): if 'value' in kwargs: value = kwargs['value'] del kwargs['value'] - + # call the main init func super(Detail, self).__init__(**kwargs) @@ -136,4 +136,4 @@ def __str__(self): if self.thesaurus is not None: return '%s = %s <%s>' % (self.key, self.value, self.thesaurus.name) else: - return "%s = %s" % (self.key, self.value) + return "%s = %s" % (self.key, self.value) diff --git a/metacatalog/models/variable.py b/metacatalog/models/variable.py index d98874e0..c95505c5 100644 --- a/metacatalog/models/variable.py +++ b/metacatalog/models/variable.py @@ -108,7 +108,7 @@ class Variable(Base): variables : list Lazy loaded list of Variables that use the current unit column_names : list - .. 
versionadded:: 0.2.12 + .. versionadded:: 0.3.0 List of default column names that will be displayed when exporting the data. The columns are named in the same order as they appear in the list. From fc8434ba68677adc3c67c0b02cd5a8b8bedbc74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 20 May 2021 12:18:18 +0200 Subject: [PATCH 21/30] database revision for PR --- metacatalog/data/entrygroup_types.csv | 1 + metacatalog/db/revisions/__init__.py | 2 + metacatalog/db/revisions/rev6.py | 84 ++++++++++++++++++++++++++ metacatalog/models/timeseries.py | 2 +- metacatalog/models/timeseries_array.py | 2 +- 5 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 metacatalog/db/revisions/rev6.py diff --git a/metacatalog/data/entrygroup_types.csv b/metacatalog/data/entrygroup_types.csv index 48da58c8..0b78bc7f 100644 --- a/metacatalog/data/entrygroup_types.csv +++ b/metacatalog/data/entrygroup_types.csv @@ -2,3 +2,4 @@ id,name,description 1,Project,"A Project groups datasets into a lager collection of datasets that have been collected or used in the same Campaign." 2,Composite,"A composite dataset groups a number of datasets that are inseparable." 3,Split dataset,"A split dataset groups a number of identical datasets that have to be split e.g. in case of different time scale resolution." +4,Label,"A Label groups different datasets into a larger collection of datasets, that are now a composite, but i.e. collected at the same site." diff --git a/metacatalog/db/revisions/__init__.py b/metacatalog/db/revisions/__init__.py index d48fe389..c25ebcf9 100644 --- a/metacatalog/db/revisions/__init__.py +++ b/metacatalog/db/revisions/__init__.py @@ -5,6 +5,7 @@ rev3, rev4, rev5, + rev6, ) revisions = { @@ -14,4 +15,5 @@ 3: rev3, 4: rev4, 5: rev5, + 6: rev6, } diff --git a/metacatalog/db/revisions/rev6.py b/metacatalog/db/revisions/rev6.py new file mode 100644 index 00000000..cdec6096 --- /dev/null +++ b/metacatalog/db/revisions/rev6.py @@ -0,0 +1,84 @@ +""" +Metacatalog database revision +----------------------------- +date: 2021-05-20T11:02:13.319954 + +revision #6 + + +""" +from sqlalchemy.orm import Session +from metacatalog import api, models + + +UPGRADE_SQL = """ +-- add a new Entrygroup type +INSERT INTO entrygroup_types (id, name, description) VALUES +(4,'Label','A Label groups different datasets into a larger collection of datasets, that are now a composite, but i.e. 
collected at the same site.'); + +-- todo, here the new column creation is missing +ALTER TABLE variables ADD COLUMN column_names CHARACTER VARYING(128)[]; + +-- add new variables +INSERT INTO variables (id,name,symbol,column_names,unit_id,keyword_id) VALUES + (19,'evapotranspiration','ET','{"evapotranspiration"}',103,6319), + (20,'drainage','D','{"drainage"}',103,7328); + +-- rename timeseries to timeseries_1d +ALTER TABLE timeseries RENAME TO timeseries_1d; +ALTER TABLE timeseries_1d RENAME CONSTRAINT timeseries_pkey TO timeseries_1d_pkey; +ALTER TABLE timeseries_1d RENAME CONSTRAINT timeseries_entry_id_fkey TO timeseries_1d_entry_id_fkey; + +-- update datasources +UPDATE datasources SET path='timeseries_1d' WHERE path='timeseries'; + +-- create new table +CREATE TABLE timeseries ( + entry_id INTEGER NOT NULL, + tstamp timestamp without time zone NOT NULL, + "data" REAL[], + "precision" REAL[] +); +ALTER TABLE timeseries ADD CONSTRAINT timeseries_pkey PRIMARY KEY (entry_id, tstamp); +ALTER TABLE timeseries ADD CONSTRAINT timeseries_entry_id_fkey FOREIGN KEY (entry_id) REFERENCES entries (id); + +COMMIT; +""" + +DOWNGRADE_SQL = """ +-- delete entrygroups that use the Label type +DELETE FROM nm_entrygroups WHERE group_id in (SELECT id FROM entrygroups WHERE type_id=4); +DELETE FROM entrygroups WHERE type_id=4; + +-- remove the entrygroup type +DELETE FROM entrygroup_types WHERE id=4; + +-- remove the colmap column +ALTER TABLE variables DROP COLUMN column_names; +DELETE FROM VARIABLES WHERE id in (19, 20); + +-- delete timeseries +DROP TABLE timeseries; +COMMIT; + +-- rename the stuff back +ALTER TABLE timeseries_1d RENAME TO timeseries; +ALTER TABLE timeseries RENAME CONSTRAINT timeseries_1d_pkey TO timeseries_pkey; +ALTER TABLE timeseries RENAME CONSTRAINT timeseries_1d_entry_id_fkey TO timeseries_entry_id_fkey; + +-- update datasources +UPDATE datasources SET path='timeseries' WHERE path='timeseries_1d'; +COMMIT; +""" + +# define the upgrade function +def upgrade(session: Session): + # create the new EntryGroup type + with session.bind.connect() as con: + con.execute(UPGRADE_SQL) + + +# define the downgrade function +def downgrade(session: Session): + with session.bind.connect() as con: + con.execute(DOWNGRADE_SQL) \ No newline at end of file diff --git a/metacatalog/models/timeseries.py b/metacatalog/models/timeseries.py index 65e1cffb..843a01c8 100644 --- a/metacatalog/models/timeseries.py +++ b/metacatalog/models/timeseries.py @@ -6,7 +6,7 @@ class TimeseriesPoint(Base): - __tablename__ = 'timeseries' + __tablename__ = 'timeseries_1d' # columns entry_id = Column(Integer, ForeignKey('entries.id'), primary_key=True) diff --git a/metacatalog/models/timeseries_array.py b/metacatalog/models/timeseries_array.py index 12ea3b9b..78cca63c 100644 --- a/metacatalog/models/timeseries_array.py +++ b/metacatalog/models/timeseries_array.py @@ -7,7 +7,7 @@ class TimeseriesArray(Base): - __tablename__ = 'timeseries_array' + __tablename__ = 'timeseries' # columns entry_id = Column(Integer, ForeignKey('entries.id'), primary_key=True) From 5bd651f342104ae3f06b02544da39b817a8ace2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 20 May 2021 12:58:31 +0200 Subject: [PATCH 22/30] renaming timeseries module --- metacatalog/ext/io/importer.py | 1 - metacatalog/models/__init__.py | 3 +-- metacatalog/models/timeseries.py | 25 ++++-------------- metacatalog/models/timeseries_array.py | 20 -------------- metacatalog/models/timeseries_legacy.py | 35 +++++++++++++++++++++++++ 5 files 
changed, 41 insertions(+), 43 deletions(-) delete mode 100644 metacatalog/models/timeseries_array.py create mode 100644 metacatalog/models/timeseries_legacy.py diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index f8ff12f6..7e31f3a9 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -6,7 +6,6 @@ from sqlalchemy.dialects.postgresql import ARRAY from metacatalog.models.entry import Entry -from metacatalog.models.timeseries import TimeseriesPoint def import_to_internal_table(entry, datasource, data, force_data_names=False, **kwargs): diff --git a/metacatalog/models/__init__.py b/metacatalog/models/__init__.py index 234f4331..700a708d 100644 --- a/metacatalog/models/__init__.py +++ b/metacatalog/models/__init__.py @@ -20,8 +20,7 @@ from .license import License from .variable import Variable, Unit from .datasource import DataSource, DataSourceType, DataType, SpatialScale, TemporalScale -from .timeseries import TimeseriesPoint, TimeseriesPoint2D -from .timeseries_array import TimeseriesArray +from .timeseries import Timeseries from .generic_data import DataPoint, DataPoint2D from .geometry_data import GeometryTimeseries, GenericGeometryData from .config import Log, LogCodes diff --git a/metacatalog/models/timeseries.py b/metacatalog/models/timeseries.py index 843a01c8..97ad174d 100644 --- a/metacatalog/models/timeseries.py +++ b/metacatalog/models/timeseries.py @@ -1,35 +1,20 @@ import pandas as pd from sqlalchemy import Column, ForeignKey from sqlalchemy import Integer, DateTime, Numeric +from sqlalchemy.dialects.postgresql import ARRAY from metacatalog.db.base import Base -class TimeseriesPoint(Base): - __tablename__ = 'timeseries_1d' +class Timeseries(Base): + __tablename__ = 'timeseries' # columns entry_id = Column(Integer, ForeignKey('entries.id'), primary_key=True) tstamp = Column(DateTime, primary_key=True) - value = Column(Numeric, nullable=False) - precision = Column(Numeric, nullable=True) + data = Column(ARRAY(Numeric), nullable=False) + precision = Column(ARRAY(Numeric), nullable=True) @classmethod def is_valid_timeseries(cls, data): return isinstance(data, (pd.DataFrame, pd.Series)) and isinstance(data.index, pd.DatetimeIndex) - - -class TimeseriesPoint2D(Base): - __tablename__ = 'timeseries_2d' - - # columns - entry_id = Column(Integer, ForeignKey('entries.id'), primary_key=True) - tstamp = Column(DateTime, primary_key=True) - value1 = Column(Numeric, nullable=False) - value2 = Column(Numeric, nullable=False) - precision1 = Column(Numeric, nullable=True) - precision2 = Column(Numeric, nullable=True) - - @classmethod - def is_valid_timeseries(cls, data): - return isinstance(data, pd.DataFrame) and isinstance(data.index, pd.DatetimeIndex) and len(data.columns) == 2 \ No newline at end of file diff --git a/metacatalog/models/timeseries_array.py b/metacatalog/models/timeseries_array.py deleted file mode 100644 index 78cca63c..00000000 --- a/metacatalog/models/timeseries_array.py +++ /dev/null @@ -1,20 +0,0 @@ -import pandas as pd -from sqlalchemy import Column, ForeignKey -from sqlalchemy import Integer, DateTime, Numeric -from sqlalchemy.dialects.postgresql import ARRAY - -from metacatalog.db.base import Base - - -class TimeseriesArray(Base): - __tablename__ = 'timeseries' - - # columns - entry_id = Column(Integer, ForeignKey('entries.id'), primary_key=True) - tstamp = Column(DateTime, primary_key=True) - data = Column(ARRAY(Numeric), nullable=False) - precision = Column(ARRAY(Numeric), nullable=True) - - @classmethod - def 
is_valid_timeseries(cls, data): - return isinstance(data, (pd.DataFrame, pd.Series)) and isinstance(data.index, pd.DatetimeIndex) diff --git a/metacatalog/models/timeseries_legacy.py b/metacatalog/models/timeseries_legacy.py new file mode 100644 index 00000000..843a01c8 --- /dev/null +++ b/metacatalog/models/timeseries_legacy.py @@ -0,0 +1,35 @@ +import pandas as pd +from sqlalchemy import Column, ForeignKey +from sqlalchemy import Integer, DateTime, Numeric + +from metacatalog.db.base import Base + + +class TimeseriesPoint(Base): + __tablename__ = 'timeseries_1d' + + # columns + entry_id = Column(Integer, ForeignKey('entries.id'), primary_key=True) + tstamp = Column(DateTime, primary_key=True) + value = Column(Numeric, nullable=False) + precision = Column(Numeric, nullable=True) + + @classmethod + def is_valid_timeseries(cls, data): + return isinstance(data, (pd.DataFrame, pd.Series)) and isinstance(data.index, pd.DatetimeIndex) + + +class TimeseriesPoint2D(Base): + __tablename__ = 'timeseries_2d' + + # columns + entry_id = Column(Integer, ForeignKey('entries.id'), primary_key=True) + tstamp = Column(DateTime, primary_key=True) + value1 = Column(Numeric, nullable=False) + value2 = Column(Numeric, nullable=False) + precision1 = Column(Numeric, nullable=True) + precision2 = Column(Numeric, nullable=True) + + @classmethod + def is_valid_timeseries(cls, data): + return isinstance(data, pd.DataFrame) and isinstance(data.index, pd.DatetimeIndex) and len(data.columns) == 2 \ No newline at end of file From 8e8ad7eca80e5a32450d54c651995be09adc1b6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 20 May 2021 13:05:33 +0200 Subject: [PATCH 23/30] fix in importer --- metacatalog/ext/io/importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index 7e31f3a9..33ff4eff 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -72,7 +72,7 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** precision = [row for row in imp[precision_columns].values] # make importer.py compatible with the (old) 1D timeseries table - if tablename == 'timeseries': + if tablename == 'timeseries_1d': # explicitly map the column types dtypes = { 'tstamp': sa.TIMESTAMP, From 06cb308e42445533186ebbf05565d7c6fb47b131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 20 May 2021 13:07:20 +0200 Subject: [PATCH 24/30] update old models test to legacy --- metacatalog/test/test_models_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metacatalog/test/test_models_data.py b/metacatalog/test/test_models_data.py index 8a40d24b..646ed69b 100644 --- a/metacatalog/test/test_models_data.py +++ b/metacatalog/test/test_models_data.py @@ -18,7 +18,7 @@ def create_datasource(session, entry: models.Entry, data): # create the datasource - datasource = entry.create_datasource('timeseries', 'internal', 'timeseries', commit=True) + datasource = entry.create_datasource('timeseries_1d', 'internal', 'timeseries', commit=True) assert datasource is not None # check From ab613d49e7dae5b5fec27a508b7035cfb03c3121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 20 May 2021 13:11:37 +0200 Subject: [PATCH 25/30] test update --- metacatalog/test/test_models_data.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/metacatalog/test/test_models_data.py b/metacatalog/test/test_models_data.py 
index 646ed69b..3dc4179c 100644 --- a/metacatalog/test/test_models_data.py +++ b/metacatalog/test/test_models_data.py @@ -42,31 +42,37 @@ def create_datasource(session, entry: models.Entry, data): def import_data(session, entry: models.Entry, data): entry.import_data(data) - nrecords = session.execute('SELECT count(*) from timeseries where entry_id=%d' % entry.id) - return nrecords.scalar() == 400 + nrecords = session.execute('SELECT count(*) from timeseries_1d where entry_id=%d' % entry.id) + assert nrecords.scalar() == 400 + + return True def read_data(session, entry, data): db_data = entry.get_data() - return assert_array_almost_equal( + assert_array_almost_equal( getattr(db_data, entry.variable.column_names[0]).values, data.value.values, decimal=3 ) + return True + def append_data(session, entry: models.Entry, data): entry.append_data(data) - nrecords = session.execute("SELECT count(*) FROM timeseries WHERE entry_id=%d" % entry.id) - return nrecords.scalar() == 450 + nrecords = session.execute("SELECT count(*) FROM timeseries_1d WHERE entry_id=%d" % entry.id) + assert nrecords.scalar() == 450 + + return True def delete_data(session, entry): entry.delete_data(delete_source=True) - nrecords = session.execute("SELECT count(*) FROM timeseries WHERE entry_id=%d" % entry.id) + nrecords = session.execute("SELECT count(*) FROM timeseries_1d WHERE entry_id=%d" % entry.id) assert nrecords.scalar() == 0 return entry.datasource is None From cd16e79f169d664c0640c740334d801d937729e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Thu, 20 May 2021 14:27:12 +0200 Subject: [PATCH 26/30] another fix for revision 6 --- metacatalog/db/revisions/rev6.py | 39 ++++++++++++++++++++++++++++++-- metacatalog/models/entrygroup.py | 2 +- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/metacatalog/db/revisions/rev6.py b/metacatalog/db/revisions/rev6.py index cdec6096..584484d0 100644 --- a/metacatalog/db/revisions/rev6.py +++ b/metacatalog/db/revisions/rev6.py @@ -18,11 +18,40 @@ -- todo, here the new column creation is missing ALTER TABLE variables ADD COLUMN column_names CHARACTER VARYING(128)[]; +ALTER TABLE datasources ADD COLUMN data_names CHARACTER VARYING(128)[]; -- add new variables INSERT INTO variables (id,name,symbol,column_names,unit_id,keyword_id) VALUES (19,'evapotranspiration','ET','{"evapotranspiration"}',103,6319), - (20,'drainage','D','{"drainage"}',103,7328); + (20,'drainage','D','{"drainage"}',103,7328) +ON CONFLICT ON CONSTRAINT variables_pkey +DO + UPDATE SET column_names=EXCLUDED.column_names; + +-- add column names +UPDATE variables set column_names='{"air_temperature"}' WHERE id=1; +UPDATE variables set column_names='{"soil_temperature"}' WHERE id=2; +UPDATE variables set column_names='{"water_temperature"}' WHERE id=3; +UPDATE variables set column_names='{"discharge"}' WHERE id=4; +UPDATE variables set column_names='{"air_pressure"}' WHERE id=5; +UPDATE variables set column_names='{"relative_humidity"}' WHERE id=6; +UPDATE variables set column_names='{"daily_rainfall_sum"}' WHERE id=7; +UPDATE variables set column_names='{"rainfall_intensity"}' WHERE id=8; +UPDATE variables set column_names='{"solar_irradiance"}' WHERE id=9; +UPDATE variables set column_names='{"net_radiation"}' WHERE id=10; +UPDATE variables set column_names='{"gravimetric_water_content"}' WHERE id=11; +UPDATE variables set column_names='{"volumetric_water_content"}' WHERE id=12; +UPDATE variables set column_names='{"precision"}' WHERE id=13; +UPDATE variables set 
column_names='{"sap_flow"}' WHERE id=14; +UPDATE variables set column_names='{"matric_potential"}' WHERE id=15; +UPDATE variables set column_names='{"bulk_electrical_conductivity"}' WHERE id=16; +UPDATE variables set column_names='{"specific_electrical_conductivity"}' WHERE id=17; +UPDATE variables set column_names='{"river_water_level"}' WHERE id=18; + +-- column names are build therefore the data_names can be filled +UPDATE datasources SET data_names=column_names +FROM entries JOIN variables ON entries.variable_id=variables.id +WHERE datasources.id = entries.datasource_id; -- rename timeseries to timeseries_1d ALTER TABLE timeseries RENAME TO timeseries_1d; @@ -42,6 +71,9 @@ ALTER TABLE timeseries ADD CONSTRAINT timeseries_pkey PRIMARY KEY (entry_id, tstamp); ALTER TABLE timeseries ADD CONSTRAINT timeseries_entry_id_fkey FOREIGN KEY (entry_id) REFERENCES entries (id); +-- make entrygroup titles longer +ALTER TABLE entrygroups ALTER COLUMN title TYPE character varying(250); + COMMIT; """ @@ -55,7 +87,7 @@ -- remove the colmap column ALTER TABLE variables DROP COLUMN column_names; -DELETE FROM VARIABLES WHERE id in (19, 20); +ALTER TABLE datasources DROP COLUMN data_names; -- delete timeseries DROP TABLE timeseries; @@ -68,6 +100,9 @@ -- update datasources UPDATE datasources SET path='timeseries' WHERE path='timeseries_1d'; + +-- change entrygroup title back +ALTER TABLE entrygroups ALTER COLUMN title TYPE character varying(40); COMMIT; """ diff --git a/metacatalog/models/entrygroup.py b/metacatalog/models/entrygroup.py index d19d89bb..9282e5d7 100644 --- a/metacatalog/models/entrygroup.py +++ b/metacatalog/models/entrygroup.py @@ -104,7 +104,7 @@ class EntryGroup(Base): id = Column(Integer, primary_key=True) uuid = Column(String(36), nullable=False, default=lambda: str(uuid4())) type_id = Column(Integer, ForeignKey('entrygroup_types.id'), nullable=False) - title = Column(String(40)) + title = Column(String(250)) description = Column(String) publication = Column(DateTime, default=dt.utcnow) From 66687fa732a1c5a1c2a3832094ad73656fd93543 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Tue, 25 May 2021 16:31:39 +0200 Subject: [PATCH 27/30] precision handling added to array_type_data Precision test passes without error --- metacatalog/ext/io/importer.py | 62 ++++++++++++++++++------ metacatalog/ext/io/reader.py | 11 +++-- metacatalog/test/test_array_type_data.py | 12 ++--- 3 files changed, 61 insertions(+), 24 deletions(-) diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index 33ff4eff..fb9ba9c9 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -8,10 +8,10 @@ from metacatalog.models.entry import Entry -def import_to_internal_table(entry, datasource, data, force_data_names=False, **kwargs): +def import_to_internal_table(entry, datasource, data, precision=None, force_data_names=False, **kwargs): """Import to internal DB - The given data is imported into the table + The given data and (optional) precision is imported into the table as specified in the datasource. 
If force_data_names=True the column names of the imported data are saved in the datasource, otherwise the standard column names in @@ -24,6 +24,35 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** if isinstance(data, pd.Series): data = pd.DataFrame(data) + # handle precision + if precision is not None: + if isinstance(precision, pd.Series): + precision = pd.DataFrame(precision) + + # raise error if data and precision are of different length + if len(precision) != len(data): + raise ValueError('Data and precision must be of same length.') + # raise error if data and precision are of different length + if all(precision.index != data.index): + raise ValueError('Data and precision index are differing.') + + # flag if precision is passed to the function: + handle_precision = True + + # drop index from precision, index of data is used as index + precision.reset_index(level=0, inplace=True) + if 'index' in precision.columns: + precision.drop('index', axis=1, inplace=True) + elif 'tstamp' in precision.columns: + precision.drop('tstamp', axis=1, inplace=True) + + # get the precision columns + precision_columns = [col for col in precision.columns.tolist()] + # transform precision data into a list of arrays + precision = [row for row in precision[precision_columns].values] + else: + handle_precision = False + # reset the index imp = data.reset_index(level=0, inplace=False) @@ -41,11 +70,8 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** index = imp.tstamp imp.drop('tstamp', axis=1, inplace=True) - # get the column names - exclude everthing that stores precision - data_columns = [col for col in imp.columns.tolist() if not col.startswith('precision')] - - # get the precision columns - precision_columns = [col for col in imp.columns.tolist() if col.startswith('precision')] + # get the data column names + data_columns = [col for col in imp.columns.tolist()] # set entry_id if 'entry_id' not in imp.columns: @@ -69,7 +95,6 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** # transform the data into a list of arrays values = [row for row in imp[data_columns].values] - precision = [row for row in imp[precision_columns].values] # make importer.py compatible with the (old) 1D timeseries table if tablename == 'timeseries_1d': @@ -80,16 +105,17 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** 'precision': sa.NUMERIC } - # convert 1D np.ndarray data and precision to type float + # convert 1D np.ndarray data and precision to type numeric values = [number for array in values for number in array] - precision = [number for array in precision for number in array] - # the list comprehension above creates an empty list if precision is empty: - if not precision: - imp_data = pd.DataFrame(data={'tstamp': index, 'value': values}) + # add precision if handle_precision is set to True + if handle_precision is True: + precision = [number for array in precision for number in array] + + imp_data = pd.DataFrame(data={'tstamp': index, 'value': values, 'precision': precision}) imp_data['entry_id'] = entry.id else: - imp_data = pd.DataFrame(data={'tstamp': index, 'value': values, 'precision': precision}) + imp_data = pd.DataFrame(data={'tstamp': index, 'value': values}) imp_data['entry_id'] = entry.id # else import @@ -104,6 +130,14 @@ def import_to_internal_table(entry, datasource, data, force_data_names=False, ** 'precision': ARRAY(sa.REAL) } + # add precision if handle_precision is set to True + 
if handle_precision is True: + imp_data = pd.DataFrame(data={'tstamp': index, 'data': values, 'precision': precision}) + imp_data['entry_id'] = entry.id + else: + imp_data = pd.DataFrame(data={'tstamp': index, 'data': values}) + imp_data['entry_id'] = entry.id + imp_data = pd.DataFrame(data={'tstamp': index, 'data': values, 'precision': precision}) imp_data['entry_id'] = entry.id diff --git a/metacatalog/ext/io/reader.py b/metacatalog/ext/io/reader.py index 6e31261d..b1aa3bd4 100644 --- a/metacatalog/ext/io/reader.py +++ b/metacatalog/ext/io/reader.py @@ -50,10 +50,13 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): # unstack precision (precision1, precision2, ...) rawprecision = np.vstack(df_sql['precision'].values) - # add precision column names to the col_names - for i in range(1, len(rawprecision[0])+1): - precision_col = 'precision%s' % i - col_names.append(precision_col) + if not all(x is None for x in np.hstack(rawprecision)): # check if precision contains any values + # add precision column names to col_names, if data is contained + for i in range(1, len(rawprecision[0])+1): + precision_col = 'precision%s' % i + col_names.append(precision_col) + else: + rawprecision = np.array([], dtype=np.int64).reshape(len(rawprecision),0) # horizontally stack data and precission raw = np.hstack([rawvalues, rawprecision]) diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index a3b7469b..bb4b44c5 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -74,7 +74,7 @@ def read_3D_data(session): # assert assert dat.columns[1] == 'v' - assert dat.columns.tolist() == ['u', 'v', 'w'] # at the moment, no precision columns will be returned when there is no data, is this the wanted behaviour? + assert dat.columns.tolist() == ['u', 'v', 'w'] assert dat.index[2] == pd.to_datetime("2018-01-01 01:30:00", format='%Y-%m-%d %H:%M:%S') assert dat['u'].mean() == pytest.approx(3.1, 0.05) @@ -160,7 +160,7 @@ def force_data_names_true(session, df_3D_wind): return True -def precision_test(session, df_3D_prec): +def precision_test(session, df_3D_wind, df_3D_prec): """ Test if precision columns are handled correctly. We use the 3D eddy wind data with 3 precision columns for this. 
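
As an aside (not part of this hunk): the precision path added above boils down to the call pattern sketched below; `entry` is assumed to be an Entry with an internal array-type datasource (path='timeseries_array' at this point in the series) for the 3D-wind variable.

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2018-01-01 00:30', periods=10, freq='30min')
    data = pd.DataFrame({'u': np.random.rand(10),
                         'v': np.random.rand(10),
                         'w': np.random.rand(10)}, index=idx)
    # one precision column per data column, sharing exactly the same index
    precision = pd.DataFrame({'p_u': np.full(10, 0.05),
                              'p_v': np.full(10, 0.05),
                              'p_w': np.full(10, 0.05)}, index=idx)

    # import_to_internal_table raises a ValueError if the two frames differ
    # in number of rows or in their index; otherwise precision is written
    # to the ARRAY-typed 'precision' column alongside 'data'
    entry.import_data(data=data, precision=precision)

    df = entry.get_data()
    # with the 3D-wind variable this returns the columns
    # ['u', 'v', 'w', 'precision1', 'precision2', 'precision3']
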
@@ -185,10 +185,10 @@ def precision_test(session, df_3D_prec): # create datasource and scale entry_3D_precision.create_datasource(type=1, path='timeseries_array', datatype='timeseries') - entry_3D_precision.datasource.create_scale(resolution='30min', extent=(df_3D_prec.index[0], df_3D_prec.index[-1]), support=1.0, scale_dimension='temporal') + entry_3D_precision.datasource.create_scale(resolution='30min', extent=(df_3D_wind.index[0], df_3D_wind.index[-1]), support=1.0, scale_dimension='temporal') # add data - entry_3D_precision.import_data(df_3D_prec, force_data_names=False) + entry_3D_precision.import_data(data=df_3D_wind, precision=df_3D_prec, force_data_names=False) #load data dat = entry_3D_precision.get_data() @@ -235,7 +235,7 @@ def test_array_type_data(): df_1D_wind.set_index('tstamp', inplace=True) # generate 3D data with random 3D precision - df_3D_prec = pd.DataFrame(data={"tstamp": tstamp, "u_ms": u, "v_ms": v, "w_ms": w, "precision_1": prec1, "precision_2": prec2, "precision_3": prec3}) + df_3D_prec = pd.DataFrame(data={"tstamp": tstamp, "precision_1": prec1, "precision_2": prec2, "precision_3": prec3}) df_3D_prec['tstamp'] = pd.to_datetime(df_3D_prec['tstamp'], format='%Y-%m-%d %H:%M:%S') df_3D_prec.set_index('tstamp', inplace=True) @@ -246,4 +246,4 @@ def test_array_type_data(): assert read_3D_data(session) assert one_dim_data(session, df_1D_wind) assert force_data_names_true(session, df_3D_wind) - #assert precision_test(session, df_3D_prec) + assert precision_test(session, df_3D_wind, df_3D_prec) From cb8d61662d8826a07fd2c5b426aea9587d331fe6 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Wed, 26 May 2021 16:50:56 +0200 Subject: [PATCH 28/30] added auto_force_data_names test + small adjustment to importer.py --- metacatalog/ext/io/importer.py | 4 +- metacatalog/test/test_array_type_data.py | 49 ++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index fb9ba9c9..9df31488 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -32,8 +32,8 @@ def import_to_internal_table(entry, datasource, data, precision=None, force_data # raise error if data and precision are of different length if len(precision) != len(data): raise ValueError('Data and precision must be of same length.') - # raise error if data and precision are of different length - if all(precision.index != data.index): + # raise error if data and precision differing at any position + if any(precision.index != data.index): raise ValueError('Data and precision index are differing.') # flag if precision is passed to the function: diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index bb4b44c5..55f40a59 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -92,7 +92,8 @@ def one_dim_data(session, df_1D_wind): kit = api.find_person(session, organisation_abbrev='KIT')[0] # add the entry - entry_1D_wind = api.add_entry(session, title='1-dimensional windspeed data',abstract='1-dimensional windspeed data from the Fendt data set', + entry_1D_wind = api.add_entry(session, title='1-dimensional windspeed data', + abstract='1-dimensional windspeed data from the Fendt data set', location=(8, 52), variable=var_1D_wind.id, license=6, @@ -199,11 +200,46 @@ def precision_test(session, df_3D_wind, df_3D_prec): return True -def auto_force_data_names(session): +def auto_force_data_names(session, df_1D_wind, df_3D_prec): """ 
- If + If len(data_columns) != len(entry.variable.column_names) force_data_names + should automatically become True and the column names of the imported data + should be saved in datasource.data_names. + To test this, we add 1D wind data (with 3D precision) to the 3D wind + variable with variable.column_names=['u', 'v', 'w']. """ -# TEST len(data_columns) != len(entry.variable.column_names) + # find the variable + var_3D_wind = api.find_variable(session, name='3D-wind')[0] + + # find the previously added person + kit = api.find_person(session, organisation_abbrev='KIT')[0] + + # add the entry + entry_1D_precision = api.add_entry(session, title='1-dimensional windspeed data, precision', + abstract='1-dimensional windspeed data', + location=(8, 52), + variable=var_3D_wind.id, + comment='after double rotation', + license=6, + author=kit.id, + embargo=False, + is_partial=False) + # create datasource and scale + entry_1D_precision.create_datasource(type=1, path='timeseries_array', datatype='timeseries') + + entry_1D_precision.datasource.create_scale(resolution='30min', extent=(df_1D_wind.index[0], df_1D_wind.index[-1]), support=1.0, scale_dimension='temporal') + + # add data + entry_1D_precision.import_data(data=df_1D_wind, precision=df_3D_prec, force_data_names=False) + + #load data + dat = entry_1D_precision.get_data() + + # assert + assert dat.columns.tolist() == ['u_ms', 'precision1', 'precision2', 'precision3'] + assert dat['u_ms'].mean() == pytest.approx(3.1, 0.05) + + return True @pytest.mark.depends(on=['db_init'], name='array_type_data') @@ -239,6 +275,10 @@ def test_array_type_data(): df_3D_prec['tstamp'] = pd.to_datetime(df_3D_prec['tstamp'], format='%Y-%m-%d %H:%M:%S') df_3D_prec.set_index('tstamp', inplace=True) + # use a copy of precision for auto_force_data_names() + df_3D_prec_copy = df_3D_prec.copy() + + # run single tests assert add_3D_entry(session) assert create_3D_datasource(session, df_3D_wind) @@ -247,3 +287,4 @@ def test_array_type_data(): assert one_dim_data(session, df_1D_wind) assert force_data_names_true(session, df_3D_wind) assert precision_test(session, df_3D_wind, df_3D_prec) + assert auto_force_data_names(session, df_1D_wind, df_3D_prec_copy) From 0d00bf3ef72a79c02acac40620c498b210ff643e Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Thu, 27 May 2021 18:24:53 +0200 Subject: [PATCH 29/30] array_test: path 'timeseries_array' to 'timeseries' --- metacatalog/test/test_array_type_data.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/metacatalog/test/test_array_type_data.py b/metacatalog/test/test_array_type_data.py index 55f40a59..0f0dbd7d 100644 --- a/metacatalog/test/test_array_type_data.py +++ b/metacatalog/test/test_array_type_data.py @@ -42,7 +42,7 @@ def create_3D_datasource(session, df_3D_wind): Add a datasource to the eddy entry. 
""" entry_3D_wind = api.find_entry(session, title='3-dimensional windspeed data')[0] - entry_3D_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries') + entry_3D_wind.create_datasource(type=1, path='timeseries', datatype='timeseries') entry_3D_wind.datasource.create_scale(resolution='30min', extent=(df_3D_wind.index[0], df_3D_wind.index[-1]), support=1.0, scale_dimension='temporal') @@ -102,7 +102,7 @@ def one_dim_data(session, df_1D_wind): is_partial=False) # create datasource and scale - entry_1D_wind.create_datasource(type=1, path='timeseries_array', datatype='timeseries') + entry_1D_wind.create_datasource(type=1, path='timeseries', datatype='timeseries') entry_1D_wind.datasource.create_scale(resolution='30min', extent=(df_1D_wind.index[0], df_1D_wind.index[-1]), support=1.0, scale_dimension='temporal') @@ -145,7 +145,7 @@ def force_data_names_true(session, df_3D_wind): is_partial=False) # create datasource and scale - entry_3D_force_data_names.create_datasource(type=1, path='timeseries_array', datatype='timeseries') + entry_3D_force_data_names.create_datasource(type=1, path='timeseries', datatype='timeseries') entry_3D_force_data_names.datasource.create_scale(resolution='30min', extent=(df_3D_wind.index[0], df_3D_wind.index[-1]), support=1.0, scale_dimension='temporal') @@ -184,7 +184,7 @@ def precision_test(session, df_3D_wind, df_3D_prec): is_partial=False) # create datasource and scale - entry_3D_precision.create_datasource(type=1, path='timeseries_array', datatype='timeseries') + entry_3D_precision.create_datasource(type=1, path='timeseries', datatype='timeseries') entry_3D_precision.datasource.create_scale(resolution='30min', extent=(df_3D_wind.index[0], df_3D_wind.index[-1]), support=1.0, scale_dimension='temporal') @@ -225,7 +225,7 @@ def auto_force_data_names(session, df_1D_wind, df_3D_prec): embargo=False, is_partial=False) # create datasource and scale - entry_1D_precision.create_datasource(type=1, path='timeseries_array', datatype='timeseries') + entry_1D_precision.create_datasource(type=1, path='timeseries', datatype='timeseries') entry_1D_precision.datasource.create_scale(resolution='30min', extent=(df_1D_wind.index[0], df_1D_wind.index[-1]), support=1.0, scale_dimension='temporal') @@ -234,7 +234,7 @@ def auto_force_data_names(session, df_1D_wind, df_3D_prec): #load data dat = entry_1D_precision.get_data() - + # assert assert dat.columns.tolist() == ['u_ms', 'precision1', 'precision2', 'precision3'] assert dat['u_ms'].mean() == pytest.approx(3.1, 0.05) @@ -245,7 +245,7 @@ def auto_force_data_names(session, df_1D_wind, df_3D_prec): @pytest.mark.depends(on=['db_init'], name='array_type_data') def test_array_type_data(): """ - Test if timeseries_array works correctly. + Test if timeseries array works correctly. 
Backward compatibility with the old timeseries path is tested in test_models_data.py """ # get a session From 58962cc5ff20b90d8877aff1cd0c4d6c07de3db1 Mon Sep 17 00:00:00 2001 From: AlexDo1 Date: Fri, 28 May 2021 09:24:27 +0200 Subject: [PATCH 30/30] small adjustments to importer.py and reader.py --- metacatalog/ext/io/importer.py | 10 +++++----- metacatalog/ext/io/reader.py | 6 ++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/metacatalog/ext/io/importer.py b/metacatalog/ext/io/importer.py index 9df31488..57654e32 100644 --- a/metacatalog/ext/io/importer.py +++ b/metacatalog/ext/io/importer.py @@ -29,10 +29,10 @@ def import_to_internal_table(entry, datasource, data, precision=None, force_data if isinstance(precision, pd.Series): precision = pd.DataFrame(precision) - # raise error if data and precision are of different length + # raise error if data and precision have different number of rows if len(precision) != len(data): - raise ValueError('Data and precision must be of same length.') - # raise error if data and precision differing at any position + raise ValueError('Data and precision must match in their number of rows.') + # raise error if data and precision index differ at any position if any(precision.index != data.index): raise ValueError('Data and precision index are differing.') @@ -96,8 +96,8 @@ def import_to_internal_table(entry, datasource, data, precision=None, force_data # transform the data into a list of arrays values = [row for row in imp[data_columns].values] - # make importer.py compatible with the (old) 1D timeseries table if tablename == 'timeseries_1d': + # make importer.py compatible with the (old) 1D timeseries table # explicitly map the column types dtypes = { 'tstamp': sa.TIMESTAMP, @@ -122,7 +122,7 @@ def import_to_internal_table(entry, datasource, data, precision=None, force_data if_exists = kwargs.get('if_exists', 'append') imp_data.to_sql(tablename, session.bind, index=None, dtype=dtypes, if_exists=if_exists) else: - # else: the (new) timeseries_array is used + # store n-dimensional data and precision as array type timeseries data # explicitly map the column types dtypes = { 'tstamp': sa.TIMESTAMP, diff --git a/metacatalog/ext/io/reader.py b/metacatalog/ext/io/reader.py index b1aa3bd4..ccad95dd 100644 --- a/metacatalog/ext/io/reader.py +++ b/metacatalog/ext/io/reader.py @@ -42,7 +42,7 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): # always use data_names from datasource as column names when exporting the data col_names = datasource.data_names - # if the column 'data' exists, the new routine is used + # if the column 'data' exists, ND array data must be unstacked if 'data' in df_sql.columns: # unstack multi-dimensional data into the single columns rawvalues = np.vstack(df_sql['data'].values) @@ -63,14 +63,12 @@ def read_from_internal_table(entry, datasource, start=None, end=None, **kwargs): df = pd.DataFrame(data=raw, columns=col_names, index=df_sql.index) elif 'value' in df_sql.columns: - # if 'data' does not appear in the column names, the old routine is used + # if 'value' appears in the column names, the old routine for 1D data is used df = df_sql.copy() df.drop(['entry_id'], axis=1, inplace=True) # map column names df.columns = [datasource.data_names[0] if _col== 'value' else _col for _col in df.columns] - else: - print('Currently only "timeseries" and "timeseries_array" are supported.') return df
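
For reference, a condensed round trip through the final state of this series (a hedged sketch, not part of any patch above): array-type data now lives in the 'timeseries' table, the legacy 1D data in 'timeseries_1d', and export column names come from `variable.column_names` unless `force_data_names=True` stores the incoming names in `datasource.data_names`. The session helper and the entry title are assumptions borrowed from the tests.

    import pandas as pd
    from metacatalog import api

    # assumption: a default database connection is configured
    session = api.connect_database()

    # entry created beforehand for a variable with column_names=['u', 'v', 'w']
    entry = api.find_entry(session, title='3-dimensional windspeed data')[0]

    # internal, array-type datasource: rows end up in the 'timeseries' table
    entry.create_datasource(type=1, path='timeseries', datatype='timeseries')

    idx = pd.date_range('2018-01-01 00:30', periods=10, freq='30min')
    df = pd.DataFrame({'u_ms': [float(i) for i in range(10)],
                       'v_ms': [0.0] * 10,
                       'w_ms': [0.0] * 10}, index=idx)

    entry.datasource.create_scale(resolution='30min',
                                  extent=(df.index[0], df.index[-1]),
                                  support=1.0,
                                  scale_dimension='temporal')
    session.commit()

    entry.import_data(df)      # force_data_names defaults to False
    data = entry.get_data()    # columns: ['u', 'v', 'w'] from variable.column_names
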