From 0f7ec1bba2ffd33090f055073d9d5765f74892fa Mon Sep 17 00:00:00 2001 From: Raymond Wiker Date: Fri, 1 Nov 2024 13:17:39 +0100 Subject: [PATCH] Defer 'expensive' imports until they are needed. Also, do a little cleanup based on flake8 findings. --- examples/table-aggregation.ipynb | 262 ++++++++++++++++++++++ src/fmu/sumo/explorer/objects/polygons.py | 8 +- src/fmu/sumo/explorer/objects/table.py | 23 +- 3 files changed, 279 insertions(+), 14 deletions(-) create mode 100644 examples/table-aggregation.ipynb diff --git a/examples/table-aggregation.ipynb b/examples/table-aggregation.ipynb new file mode 100644 index 00000000..0425b1e8 --- /dev/null +++ b/examples/table-aggregation.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3f2d46a1-b5f0-4c75-bc55-4e768b9de112", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "class Timer:\n", + " def __init__(self):\n", + " return\n", + " def __enter__(self):\n", + " self._t0 = time.perf_counter()\n", + " return\n", + " def __exit__(self, type, value, traceback):\n", + " t1 = time.perf_counter()\n", + " print(f\"Elapsed: {t1-self._t0:0.3f} seconds.\")\n", + " return\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fb6c5b9c-314b-47ec-8157-6b75433f58b5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/RAYW/py-envs/explorer/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from fmu.sumo.explorer import Explorer\n", + "exp=Explorer(env=\"preview\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "501ca50e-272a-4231-8edd-a4dfa4905e68", + "metadata": {}, + "outputs": [], + "source": [ + "def total_blob_size(sc):\n", + " tbs = sc.metrics.sum(\"file.size_bytes\")\n", + " if tbs == 0:\n", + " tbs = sc.metrics.sum(\"_sumo.blob_size\")\n", + " return tbs\n", + "\n", + "def do_aggregate(tagname, rels, columns):\n", + " print(f\"{tagname}: {len(rels)} objects, {len(rels.columns)} columns.\")\n", + " tot_size_bytes = total_blob_size(rels)\n", + " print(f\"Total size of input: {tot_size_bytes / (1024*1024*1024):.3f} GiB\")\n", + " with Timer():\n", + " agg=rels.filter(column=columns).aggregate(columns=columns)\n", + " print(agg.to_pandas().sort_values(by=[\"REAL\", \"DATE\"]))\n", + "\n", + "def run_exp(caseuuid, itername, tagname, columns):\n", + " case = exp.get_case_by_uuid(caseuuid)\n", + " print(f\"{case.asset}: {case.name}: {caseuuid}\")\n", + " rels=case.tables.filter(iteration=itername, realization=True, tagname=tagname, \n", + " complex={\"bool\": {\"must_not\": [{\"term\": {\"_sumo.hidden\": True}}]}})\n", + " do_aggregate(tagname, rels, columns)\n", + " rels=case.tables.filter(iteration=itername, realization=True, tagname=tagname,\n", + " complex={\"term\": {\"_sumo.hidden\": True}})\n", + " do_aggregate(tagname, rels, columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5f3d4e12-2b23-4585-a935-eb0e48951dd6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Troll: 24.0.0-20240828_ix_network_test5: 359e7c72-a4ca-43ee-9203-f09cd0f149a9\n", + "summary: 27 objects, 64996 columns.\n", + "Total size of input: 1.248 GiB\n", + "Elapsed: 15.166 seconds.\n", + " DATE FOPT REAL\n", + "137 2024-07-02 282442208.0 6\n", + "138 2024-07-03 282451072.0 6\n", + "139 2024-08-01 282677120.0 6\n", + "140 2024-09-01 282889760.0 6\n", + "141 2024-10-01 283077440.0 6\n", + ".. ... ... ...\n", + "47 2025-02-15 286229120.0 249\n", + "48 2025-04-01 286425696.0 249\n", + "49 2025-09-01 287060416.0 249\n", + "50 2025-10-01 287176832.0 249\n", + "51 2026-01-01 287523552.0 249\n", + "\n", + "[265 rows x 3 columns]\n", + "summary: 3537 objects, 64996 columns.\n", + "Total size of input: 1.087 GiB\n", + "Elapsed: 1.692 seconds.\n", + " DATE FOPT REAL\n", + "52 2024-07-02 282442208.0 6\n", + "53 2024-07-03 282451072.0 6\n", + "54 2024-08-01 282677120.0 6\n", + "55 2024-09-01 282889760.0 6\n", + "56 2024-10-01 283077440.0 6\n", + ".. ... ... ...\n", + "173 2025-02-15 286229120.0 249\n", + "174 2025-04-01 286425696.0 249\n", + "175 2025-09-01 287060416.0 249\n", + "176 2025-10-01 287176832.0 249\n", + "177 2026-01-01 287523552.0 249\n", + "\n", + "[265 rows x 3 columns]\n" + ] + } + ], + "source": [ + "run_exp(\"359e7c72-a4ca-43ee-9203-f09cd0f149a9\", \"pred-0\", \"summary\", [\"FOPT\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4ba6a7a8-4c32-4015-8767-41b2f7c777e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Troll: 24.0.0-20240820: fc6cc7d3-6162-46a3-9d69-48ad1eaecdfb\n", + "summary: 196 objects, 24568 columns.\n", + "Total size of input: 30.013 GiB\n", + "Elapsed: 32.124 seconds.\n", + " DATE FOPT REAL\n", + "708796 1990-02-01 0.000000e+00 1\n", + "708797 1990-03-01 1.445590e+05 1\n", + "708798 1990-04-01 2.741935e+05 1\n", + "708799 1990-05-01 4.145006e+05 1\n", + "708800 1990-06-01 5.512956e+05 1\n", + "... ... ... ...\n", + "841571 2024-06-27 2.980280e+08 249\n", + "841572 2024-06-28 2.980311e+08 249\n", + "841573 2024-06-29 2.980342e+08 249\n", + "841574 2024-06-30 2.980384e+08 249\n", + "841575 2024-07-01 2.980405e+08 249\n", + "\n", + "[952560 rows x 3 columns]\n", + "summary: 9800 objects, 24568 columns.\n", + "Total size of input: 29.907 GiB\n", + "Elapsed: 4.722 seconds.\n", + " DATE FOPT REAL\n", + "34020 1990-02-01 0.000000e+00 1\n", + "34021 1990-03-01 1.445590e+05 1\n", + "34022 1990-04-01 2.741935e+05 1\n", + "34023 1990-05-01 4.145006e+05 1\n", + "34024 1990-06-01 5.512956e+05 1\n", + "... ... ... ...\n", + "316447 2024-06-27 2.980280e+08 249\n", + "316448 2024-06-28 2.980311e+08 249\n", + "316449 2024-06-29 2.980342e+08 249\n", + "316450 2024-06-30 2.980384e+08 249\n", + "316451 2024-07-01 2.980405e+08 249\n", + "\n", + "[952560 rows x 3 columns]\n" + ] + } + ], + "source": [ + "run_exp(\"fc6cc7d3-6162-46a3-9d69-48ad1eaecdfb\", \"iter-0\", \"summary\", [\"FOPT\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d07581ef-8fdb-4621-b81c-8aaf20b0c204", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Drogon: ruaj_testcase: 5b558daf-61c5-400a-9aa2-c602bb471a16\n", + "summary: 160 objects, 974 columns.\n", + "Total size of input: 0.175 GiB\n", + "Elapsed: 2.485 seconds.\n", + " DATE FOPT REAL\n", + "4910 2018-01-01 0.000000e+00 0\n", + "4911 2018-01-02 0.000000e+00 0\n", + "4912 2018-01-05 0.000000e+00 0\n", + "4913 2018-01-06 3.991868e+03 0\n", + "4914 2018-01-09 1.596676e+04 0\n", + "... ... ... ...\n", + "36831 2020-06-14 7.278816e+06 159\n", + "36832 2020-06-27 7.349246e+06 159\n", + "36833 2020-06-28 7.354664e+06 159\n", + "36834 2020-06-30 7.365482e+06 159\n", + "36835 2020-07-01 7.370888e+06 159\n", + "\n", + "[39280 rows x 3 columns]\n", + "summary: 320 objects, 974 columns.\n", + "Total size of input: 0.163 GiB\n", + "Elapsed: 2.528 seconds.\n", + " DATE FOPT REAL\n", + "19394 2018-01-01 0.000000e+00 0\n", + "19395 2018-01-02 0.000000e+00 0\n", + "19396 2018-01-05 0.000000e+00 0\n", + "19397 2018-01-06 3.991868e+03 0\n", + "19398 2018-01-09 1.596676e+04 0\n", + "... ... ... ...\n", + "10795 2020-06-14 7.278816e+06 159\n", + "10796 2020-06-27 7.349246e+06 159\n", + "10797 2020-06-28 7.354664e+06 159\n", + "10798 2020-06-30 7.365482e+06 159\n", + "10799 2020-07-01 7.370888e+06 159\n", + "\n", + "[39280 rows x 3 columns]\n" + ] + } + ], + "source": [ + "run_exp(\"5b558daf-61c5-400a-9aa2-c602bb471a16\", \"iter-0\", \"summary\", [\"FOPT\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "429d688e-34d1-4e19-b433-348d965dd436", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/fmu/sumo/explorer/objects/polygons.py b/src/fmu/sumo/explorer/objects/polygons.py index decce680..68599a18 100644 --- a/src/fmu/sumo/explorer/objects/polygons.py +++ b/src/fmu/sumo/explorer/objects/polygons.py @@ -1,10 +1,8 @@ """Module containing class for polygons object""" from typing import Dict -import pandas as pd from sumo.wrapper import SumoClient from fmu.sumo.explorer.objects._child import Child -from warnings import warn class Polygons(Child): @@ -18,25 +16,27 @@ def __init__(self, sumo: SumoClient, metadata: Dict, blob=None) -> None: """ super().__init__(sumo, metadata, blob) - def to_pandas(self) -> pd.DataFrame: + def to_pandas(self): """Get polygons object as a DataFrame Returns: DataFrame: A DataFrame object """ + import pandas as pd try: return pd.read_csv(self.blob) except TypeError as type_err: raise TypeError(f"Unknown format: {self.format}") from type_err - async def to_pandas_async(self) -> pd.DataFrame: + async def to_pandas_async(self): """Get polygons object as a DataFrame Returns: DataFrame: A DataFrame object """ + import pandas as pd try: return pd.read_csv(await self.blob_async) except TypeError as type_err: diff --git a/src/fmu/sumo/explorer/objects/table.py b/src/fmu/sumo/explorer/objects/table.py index 6b05a52b..56636dd1 100644 --- a/src/fmu/sumo/explorer/objects/table.py +++ b/src/fmu/sumo/explorer/objects/table.py @@ -1,13 +1,8 @@ """module containing class for table""" import logging -import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq -import pyarrow.feather as pf from sumo.wrapper import SumoClient from fmu.sumo.explorer.objects._child import Child -from warnings import warn from typing import Dict @@ -42,6 +37,9 @@ async def _read_table_async(self): return self._construct_table_from_blob(await self._get_blob_async()) def _construct_table_from_blob(self, blob): + import pandas as pd + import pyarrow.feather as pf + try: if self.dataformat == "csv": dataframe = pd.read_csv(blob) @@ -71,7 +69,7 @@ def _construct_table_from_blob(self, blob): pass return dataframe - def to_pandas(self) -> pd.DataFrame: + def to_pandas(self): """Return object as a pandas DataFrame Returns: @@ -81,7 +79,7 @@ def to_pandas(self) -> pd.DataFrame: self._dataframe = self._read_table() return self._dataframe - async def to_pandas_async(self) -> pd.DataFrame: + async def to_pandas_async(self): """Return object as a pandas DataFrame Returns: @@ -98,6 +96,11 @@ async def _read_arrow_async(self): return self._construct_arrow_from_blob(await self._get_blob_async()) def _construct_arrow_from_blob(self, blob): + import pandas as pd + import pyarrow as pa + import pyarrow.parquet as pq + import pyarrow.feather as pf + try: if self.dataformat == "csv": arrowtable = pa.Table.from_pandas(pd.read_csv(blob)) @@ -117,7 +120,7 @@ def _construct_arrow_from_blob(self, blob): arrowtable = pq.read_table(blob) except Exception as ex: try: - arrowtable = pf.read_table(selfblob) + arrowtable = pf.read_table(blob) except Exception as ex: raise TypeError( f"Unable to convert a blob of format {self.dataformat} to arrow; tried csv, parquet and feather." @@ -127,7 +130,7 @@ def _construct_arrow_from_blob(self, blob): pass return arrowtable - def to_arrow(self) -> pa.Table: + def to_arrow(self): """Return object as an arrow Table Returns: @@ -137,7 +140,7 @@ def to_arrow(self) -> pa.Table: self._arrowtable = self._read_arrow() return self._arrowtable - async def to_arrow_async(self) -> pa.Table: + async def to_arrow_async(self): """Return object as an arrow Table Returns: