From 169491b004cb5ce52723205cf9b4c172eadde42a Mon Sep 17 00:00:00 2001 From: mferrera Date: Mon, 13 Nov 2023 15:36:06 +0100 Subject: [PATCH] CLN: Refactor roff `scan_keywords` to use roffio --- src/clib/xtg/grd3d_scan_roffbinary.c | 384 ---------------------- src/clib/xtg/libxtg.h | 10 - src/xtgeo/grid3d/_grid3d_utils.py | 129 +++++--- src/xtgeo/grid3d/grid_properties.py | 26 +- tests/test_grid3d/test_grid_properties.py | 27 +- 5 files changed, 107 insertions(+), 469 deletions(-) delete mode 100644 src/clib/xtg/grd3d_scan_roffbinary.c diff --git a/src/clib/xtg/grd3d_scan_roffbinary.c b/src/clib/xtg/grd3d_scan_roffbinary.c deleted file mode 100644 index 7ecb3f85a..000000000 --- a/src/clib/xtg/grd3d_scan_roffbinary.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - *************************************************************************************** - * - * NAME: - * grd3d_scan_roffbinary.c - * - * - * DESCRIPTION: - * This is a new line of ROFF handling function (from 2018). Here is a - * quick scan ROFF binary output and return for example: - * - * NameEntry ByteposData LenData Datatype - * scale!xscale 94 1 2 (=float) - * zvalues!splitEnz 1122 15990 6 (=byte) - * - * The ByteposData will be to the start of the ACTUAL (numerical) data, - * not the keyword/tag start (differs from Eclipse SCAN result here!) - * - * ARGUMENTS: - * fc i Filehandle (stream) to read from - * swap o SWAP status, 0 of False, 1 if True - * tagletters o A long *char where the tags are separated by a | - * ntagletters i For SWIG bindings - * rectypes o An array with record types: 1 = INT, 2 = FLOAT, - * 3 = DOUBLE, 4 = CHAR(STRING), 5 = BOOL, 6 = BYTE - * reclengths o An array with record lengths (no of elements) - * recstarts o An array with record starts (in bytes) - * maxkw i Max number of tags possible to read - * debug i Debug level - * - * RETURNS: - * Function: Number of keywords read. If problems, a negative value - * Resulting vectors will be updated. 
- * - * - * NOTE: - * The ROFF format was developed independent of RMS, so integer varables - * in ROFF does not match integer grid parameters in RMS fully. ROFF - * uses a signed int (4 byte). As integer values in RMS are always - * unsigned (non-negative) information will be lost if you try to import - * negative integer values from ROFF into RMS." - * - * TODO/ISSUES/BUGS: - * - * LICENCE: - * cf. XTGeo LICENSE - *************************************************************************************** - */ - -#include "libxtg.h" -#include "libxtg_.h" -#include "logger.h" -#include -#include -#include -#include - -/* ######################################################################### */ -/* LOCAL FUNCTIONS */ -/* ######################################################################### */ - -#define ROFFSTRLEN 200 -#define ROFFARRLEN 15 -#define TAGRECORDMAX 100 -#define TAGDATAMAX 100 - -int -_roffbinstring(FILE *fc, char *mystring) - -{ - /* read a string; return the number of bytes (including 0 termination) */ - int i; - char mybyte; - - strcpy(mystring, ""); - - for (i = 0; i < ROFFSTRLEN; i++) { - if (fread(&mybyte, 1, 1, fc) == 1) { - mystring[i] = mybyte; - if (mybyte == '\0') - return i + 1; - } else { - logger_critical(LI, FI, FU, "Did not reach end of ROFF string"); - return -99; - } - } - - return -1; -} - -int -_scan_roff_bin_record(FILE *fc, - int *swap, - char tagname[ROFFSTRLEN], - long npos1, - long *npos2, - int *numrec, - char cname[ROFFARRLEN][ROFFSTRLEN], - char pname[ROFFARRLEN][ROFFSTRLEN], - int cntype[ROFFARRLEN], - long bytepos[ROFFARRLEN], - long reclen[ROFFARRLEN]) -{ - /* - * tagname: is the name of the tag - * npos1: is the byte INPUT position in the file - * npos2: is the byte OUTPUT position, i.e. ready for next tag - * cname: is the name of the subtag, as "array" - * cntype: is data type: 1=int, 2=float, 3=double, 4=char, 5=byte - * rnlen: is the record length, if > 1 then it is an array type. 
- * => if 1, then it may have several sub keys - */ - - /* int swap = 0; */ - int nrec, ndat; - int i, n, ic; - int bsize = 0; - const int FAIL = -88; - char tmpname[ROFFSTRLEN] = ""; - long ncum = 0; - - char cdum[ROFFSTRLEN] = ""; - int idum; - float fdum; - double ddum; - unsigned char bdum; - - if (fseek(fc, npos1, SEEK_SET) != 0) - return FAIL; - - ncum = ncum + npos1; - - nrec = 0; /* record counter (subtag) */ - - strcpy(tagname, ""); - - for (i = 0; i < TAGRECORDMAX; i++) { - - ncum += _roffbinstring(fc, tmpname); - - if (npos1 == 0 && i == 0 && strncmp(tmpname, "roff-bin", 8) != 0) { - /* not a ROFF binary file! */ - logger_debug(LI, FI, FU, "Not a valid ROFF binary file!"); - return -9; - } - - if (strncmp(tmpname, "tag", 3) == 0) { - ncum += _roffbinstring(fc, tagname); - - logger_debug(LI, FI, FU, "Tag name %s", tagname); - - if (strncmp(tagname, "eof", 3) == 0) { - return 10; - } - - /* now the rest of the record may contain of multiple e.g.: */ - /* float xoffset 4.61860625E+05 or */ - /* array float data 15990 */ - /* ... until */ - /* endtag */ - for (n = 0; n < TAGDATAMAX; n++) { - ncum += _roffbinstring(fc, tmpname); - - if (strncmp(tmpname, "endtag", 6) == 0) { - *npos2 = ncum; - *numrec = nrec; - return 0; - } - - strcpy(pname[nrec], "NAxxx"); - - if (strncmp(tmpname, "int", 3) == 0) { - ncum += _roffbinstring(fc, cname[nrec]); - bytepos[nrec] = ncum; - ncum += fread(&idum, sizeof(int), 1, fc) * sizeof(int); - - /* special treatment of byteswap */ - if (strncmp(cname[nrec], "byteswaptest", 13) == 0) { - if (idum == 1) - *swap = 0; - if (idum != 1) - *swap = 1; - } - - reclen[nrec] = 1; - cntype[nrec] = 1; - nrec++; - } else if (strncmp(tmpname, "float", 5) == 0) { - ncum += _roffbinstring(fc, cname[nrec]); - bytepos[nrec] = ncum; - ncum += fread(&fdum, sizeof(float), 1, fc) * sizeof(float); - cntype[nrec] = 2; - reclen[nrec] = 1; - nrec++; - - } else if (strncmp(tmpname, "double", 6) == 0) { - /* never in use? 
*/ - ncum += _roffbinstring(fc, cname[nrec]); - bytepos[nrec] = ncum; - ncum += fread(&ddum, sizeof(double), 1, fc) * sizeof(double); - cntype[nrec] = 3; - reclen[nrec] = 1; - nrec++; - } else if (strncmp(tmpname, "char", 4) == 0) { - ncum += _roffbinstring(fc, cname[nrec]); - bytepos[nrec] = ncum; - /* char in ROFF is actually a string: */ - ncum += _roffbinstring(fc, cdum); - cntype[nrec] = 4; - reclen[nrec] = 1; - - /* special treatment of parameter names (extra info) */ - if (strncmp(cname[nrec], "name", 4) == 0) { - if (strnlen(cdum, ROFFSTRLEN) == 0) - strcpy(cdum, "unknown"); - strcpy(pname[nrec], cdum); - } - nrec++; - } else if (strncmp(tmpname, "bool", 4) == 0) { - ncum += _roffbinstring(fc, cname[nrec]); - bytepos[nrec] = ncum; - ncum += fread(&bdum, sizeof(unsigned char), 1, fc) * - sizeof(unsigned char); - cntype[nrec] = 5; - reclen[nrec] = 1; - nrec++; - } else if (strncmp(tmpname, "byte", 4) == 0) { - ncum += _roffbinstring(fc, cname[nrec]); - bytepos[nrec] = ncum; - ncum += fread(&bdum, sizeof(unsigned char), 1, fc) * - sizeof(unsigned char); - cntype[nrec] = 6; - reclen[nrec] = 1; - nrec++; - } else if (strncmp(tmpname, "array", 5) == 0) { - ncum += _roffbinstring(fc, tmpname); - - if (strncmp(tmpname, "int", 3) == 0) { - bsize = 4; - ncum += _roffbinstring(fc, cname[nrec]); - ncum += fread(&ndat, sizeof(int), 1, fc) * sizeof(int); - if (*swap) - SWAP_INT(ndat); - cntype[nrec] = 1; - bytepos[nrec] = ncum; - reclen[nrec] = ndat; - nrec++; - } else if (strncmp(tmpname, "float", 5) == 0) { - bsize = 4; - ncum += _roffbinstring(fc, cname[nrec]); - ncum += fread(&ndat, sizeof(int), 1, fc) * sizeof(int); - if (*swap) - SWAP_INT(ndat); - bytepos[nrec] = ncum; - cntype[nrec] = 2; - reclen[nrec] = ndat; - nrec++; - - } - - /* double never in use? */ - - else if (strncmp(tmpname, "char", 4) == 0) { - /* Note: arrays of type char (ie strings) have UNKNOWN */ - /* lenghts; hence need special processing! 
-> bsize 0 */ - bsize = 0; - ncum += _roffbinstring(fc, cname[nrec]); - ncum += fread(&ndat, sizeof(int), 1, fc) * sizeof(int); - if (*swap) - SWAP_INT(ndat); - cntype[nrec] = 4; - bytepos[nrec] = ncum; - reclen[nrec] = ndat; - nrec++; - } else if (strncmp(tmpname, "bool", 4) == 0) { - bsize = 1; - ncum += _roffbinstring(fc, cname[nrec]); - ncum += fread(&ndat, sizeof(int), 1, fc) * sizeof(int); - if (*swap) - SWAP_INT(ndat); - bytepos[nrec] = ncum; - cntype[nrec] = 5; - reclen[nrec] = ndat; - nrec++; - } else if (strncmp(tmpname, "byte", 4) == 0) { - bsize = 1; - ncum += _roffbinstring(fc, cname[nrec]); - ncum += fread(&ndat, sizeof(int), 1, fc) * sizeof(int); - if (*swap) - SWAP_INT(ndat); - bytepos[nrec] = ncum; - cntype[nrec] = 6; - reclen[nrec] = ndat; - nrec++; - } - - if (bsize == 0) { - for (ic = 0; ic < ndat; ic++) { - ncum += _roffbinstring(fc, cname[nrec]); - } - } else { - ncum += (long)bsize * (long)ndat; - if (fseek(fc, ncum, SEEK_SET) != 0) - return FAIL; - } - } - } - } - } - - return EXIT_SUCCESS; -} - -/* ######################################################################### */ -/* LIBRARY FUNCTION */ -/* ######################################################################### */ - -long -grd3d_scan_roffbinary(FILE *fc, - int *swap, - char *tagletters, - int ntagletters, - int *rectypes, - long *reclengths, - long *recstarts, - long maxkw) -{ - - char tagname[ROFFSTRLEN] = ""; - char cname[ROFFARRLEN][ROFFSTRLEN]; - char pname[ROFFARRLEN][ROFFSTRLEN]; - int i, j, numrec, ios, cntype[ROFFARRLEN]; - long npos1, npos2, bytepos[ROFFARRLEN], reclen[ROFFARRLEN]; - long nrec = 0; - - if (ntagletters > INT_MAX) { - throw_exception("Unreverable error, number of requested keyword letters " - "exceeds system limit (grd3d_scan_eclbinary)"); - return -3; - } - - npos1 = 0; - ios = 0; - tagletters[0] = '\0'; - rewind(fc); - - for (i = 0; i < maxkw; i++) { - tagname[0] = '\0'; - ios = _scan_roff_bin_record(fc, swap, tagname, npos1, &npos2, &numrec, cname, - 
pname, cntype, bytepos, reclen); - - if (ios == -9) { - logger_error(LI, FI, FU, "Not a ROFF binary file. STOP!"); - return ios; - } else if (ios < 0) { - return -10; - } - - if (strcmp(tagname, "eof") == 0 || ios == 10) - break; - - for (j = 0; j < numrec; j++) { - strcat(tagletters, tagname); - strcat(tagletters, "!"); - strcat(tagletters, cname[j]); - - /* add a third item if parameter name */ - if (strncmp(cname[j], "name", 4) == 0 && - strncmp(pname[j], "NAxxx", 2) != 0) { - - strcat(tagletters, "!"); - strcat(tagletters, pname[j]); - } - strcat(tagletters, "|"); - rectypes[nrec] = cntype[j]; - reclengths[nrec] = reclen[j]; - recstarts[nrec] = bytepos[j]; - nrec++; - } - - npos1 = npos2; - } - return nrec; -} diff --git a/src/clib/xtg/libxtg.h b/src/clib/xtg/libxtg.h index 7e872ebed..f5eed90a4 100644 --- a/src/clib/xtg/libxtg.h +++ b/src/clib/xtg/libxtg.h @@ -1175,16 +1175,6 @@ grdcp3d_calc_xyz(long ncol, double *swig_np_dbl_aout_v3, // zarr long n_swig_np_dbl_aout_v3); -long -grd3d_scan_roffbinary(FILE *fc, - int *swig_int_out_p1, // *swap, - char *swig_out_char_msize, // tagletters - int nswig_out_char_msize, - int *rectypes, - long *reclengths, - long *recstarts, - long maxkw); - void grd3d_conv_roxapi_grid(int nx, int ny, diff --git a/src/xtgeo/grid3d/_grid3d_utils.py b/src/xtgeo/grid3d/_grid3d_utils.py index 840794faf..d2ad122ce 100644 --- a/src/xtgeo/grid3d/_grid3d_utils.py +++ b/src/xtgeo/grid3d/_grid3d_utils.py @@ -5,7 +5,9 @@ import re from typing import TYPE_CHECKING, Literal +import numpy as np import pandas as pd +import roffio import xtgeo.cxtgeo._cxtgeo as _cxtgeo from xtgeo import XTGeoCLibError @@ -32,21 +34,20 @@ def scan_keywords( Cf. 
grid_properties.py description
     """
-
-    pfile.get_cfhandle()  # just to keep cfhanclecounter correct
+    if fformat not in ("xecl", "roff"):
+        raise ValueError(f"File format can be either `roff` or `xecl`, given {fformat}")
 
     if fformat == "xecl":
+        pfile.get_cfhandle()  # just to keep cfhanclecounter correct
         if dates:
             keywords = _scan_ecl_keywords_w_dates(
                 pfile, maxkeys=maxkeys, dataframe=dataframe
             )
         else:
             keywords = _scan_ecl_keywords(pfile, maxkeys=maxkeys, dataframe=dataframe)
+        pfile.cfclose()
     elif fformat == "roff":
         keywords = _scan_roff_keywords(pfile, maxkeys=maxkeys, dataframe=dataframe)
-    else:
-        raise ValueError(f"File format can be either `roff` or `xecl`, given {fformat}")
-    pfile.cfclose()
     return keywords
 
 
@@ -172,49 +173,118 @@ def _scan_ecl_keywords_w_dates(
 def _scan_roff_keywords(
     pfile: _XTGeoFile, maxkeys: int = MAXKEYWORDS, dataframe: bool = False
 ) -> list[KeywordTuple] | pd.DataFrame:
-    rectypes = _cxtgeo.new_intarray(maxkeys)
-    reclens = _cxtgeo.new_longarray(maxkeys)
-    recstarts = _cxtgeo.new_longarray(maxkeys)
-
-    cfhandle = pfile.get_cfhandle()
-
-    # maxkeys*32 is just to give sufficient allocated character space
-    nkeys, _tmp1, keywords = _cxtgeo.grd3d_scan_roffbinary(
-        cfhandle, maxkeys * 32, rectypes, reclens, recstarts, maxkeys
-    )
-
-    pfile.cfclose()
-
-    keywords = keywords.replace(" ", "")
-    keywords = keywords.split("|")
-
-    # record types translation (cf: grd3d_scan_eclbinary.c in cxtgeo)
-    rct = {
-        "1": "int",
-        "2": "float",
-        "3": "double",
-        "4": "char",
-        "5": "bool",
-        "6": "byte",
-    }
-
-    rc = []
-    rl = []
-    rs = []
-    for i in range(nkeys):
-        rc.append(rct[str(_cxtgeo.intarray_getitem(rectypes, i))])
-        rl.append(_cxtgeo.longarray_getitem(reclens, i))
-        rs.append(_cxtgeo.longarray_getitem(recstarts, i))
-
-    _cxtgeo.delete_intarray(rectypes)
-    _cxtgeo.delete_longarray(reclens)
-    _cxtgeo.delete_longarray(recstarts)
-
-    result = list(zip(keywords, rc, rl, rs))
+    """Scan a roff file and list its keywords with type, size and byte position.
+
+    Every keyword read by roffio becomes one ``(name, type, nitems, bytepos)``
+    record, where ``name`` is ``<tag>!<keyword>``. For the ``name`` keyword of a
+    ``parameter`` tag the value is appended too: ``parameter!name!<value>``.
+
+    The byte position is an estimate summed from token lengths, because roffio
+    does not make file offsets available.
+
+    Args:
+        pfile: File wrapper whose ``file`` attribute is read.
+        maxkeys: Stop after this many records.
+        dataframe: If True, return a pandas dataframe instead of a list.
+
+    Returns:
+        The records as a list of tuples, or as a dataframe with the columns
+        KEYWORD, TYPE, NITEMS and BYTESTARTDATA.
+    """
+    # A binary roff file starts with the bytes "roff-bin"; the byte size of each
+    # value depends on whether the file is binary or not.
+    with open(pfile.file, "rb") as fin:
+        is_binary = fin.read(8) == b"roff-bin"
+
+    keywords = []
+    with roffio.lazy_read(pfile.file) as roff_iter:
+        SPACE_OR_NUL = 1
+        TAG = 3 + SPACE_OR_NUL  # "tag"
+        ENDTAG = 6 + SPACE_OR_NUL  # "endtag"
+        ARRAY_AND_SIZE = 5 + SPACE_OR_NUL + 4  # "array", 4 byte int
+
+        count = 0
+        done = False
+        # 81 is where the standard RMS exported header size ends.
+        # This offset won't be correct for non-RMS exported roff files,
+        # but it is a compromise to keep the old functionality of byte
+        # counting _close enough_ because this data is not made available
+        # from roffio.
+        byte_pos = 81
+
+        for tag_name, tag_group in roff_iter:
+            byte_pos += TAG
+            byte_pos += len(tag_name) + SPACE_OR_NUL
+
+            for keyword, value in tag_group:
+                if isinstance(value, (np.ndarray, bytes)):
+                    byte_pos += ARRAY_AND_SIZE
+                dtype, size, offset = _get_roff_type_and_size(value, is_binary)
+
+                byte_pos += len(dtype) + SPACE_OR_NUL
+                byte_pos += len(keyword) + SPACE_OR_NUL
+
+                # Compare the bare keyword: once it is prefixed with the tag
+                # name it can never be equal to "name".
+                full_name = f"{tag_name}!{keyword}"
+                if tag_name == "parameter" and keyword == "name":
+                    full_name += f"!{value}"
+                keywords.append((full_name, dtype, size, byte_pos))
+
+                byte_pos += offset
+                count += 1
+                if count == maxkeys:
+                    done = True
+                    break
+
+            byte_pos += ENDTAG
+            if done:
+                break
 
     if dataframe:
         cols = ["KEYWORD", "TYPE", "NITEMS", "BYTESTARTDATA"]
-        df = pd.DataFrame.from_records(result, columns=cols)
-        return df
+        return pd.DataFrame.from_records(keywords, columns=cols)
 
-    return result
+    return keywords
+
+
+def _get_roff_type_and_size(
+    value: str | bool | bytes | np.ndarray, is_binary: bool
+) -> tuple[str, int, int]:
+    """Return the roff type name, item count and byte size of ``value``.
+
+    In non-binary files numbers and booleans are separated by spaces rather
+    than buffer packed, and strings are quoted rather than NUL delimited, so
+    each item takes more bytes there than in a binary file.
+
+    Args:
+        value: A value as read from the roff file.
+        is_binary: Whether the file is a binary roff file.
+
+    Returns:
+        The roff type name, the number of items and the size in bytes.
+
+    Raises:
+        ValueError: If ``value`` matches none of the roff types.
+    """
+    if isinstance(value, str):
+        return "char", 1, len(value) + (1 if is_binary else 3)
+    if isinstance(value, bool):
+        return "bool", 1, 1 if is_binary else 2
+    if isinstance(value, bytes):
+        return "byte", len(value), len(value) * (1 if is_binary else 2)
+    if np.issubdtype(value.dtype, np.bool_):
+        return "bool", value.size, value.size * (1 if is_binary else 2)
+    if np.issubdtype(value.dtype, np.int8) or np.issubdtype(value.dtype, np.uint8):
+        return "byte", value.size, value.size * (1 if is_binary else 2)
+    if np.issubdtype(value.dtype, np.integer):
+        return "int", value.size, value.size * (4 if is_binary else 5)
+    if np.issubdtype(value.dtype, np.float32):
+        return "float", value.size, value.size * (4 if is_binary else 5)
+    if np.issubdtype(value.dtype, np.double):
+        return "double", value.size, value.size * (8 if is_binary else 9)
+    # np.str_ instead of np.unicode_: the alias was removed in NumPy 2.0
+    if np.issubdtype(value.dtype, np.str_):
+        total_bytes = sum(len(val) + (1 if is_binary else 3) for val in value)
+        return "char", value.size, total_bytes
+    raise ValueError(f"Could not find suitable roff type for {type(value)}")
diff --git a/src/xtgeo/grid3d/grid_properties.py b/src/xtgeo/grid3d/grid_properties.py
index 358f6847c..c6624ac74 100644
--- a/src/xtgeo/grid3d/grid_properties.py
+++ b/src/xtgeo/grid3d/grid_properties.py
@@ -2,10 +2,8 @@
 from __future__ import annotations
 
 import hashlib
-import io
-import pathlib
 import warnings
-from typing import List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Union
 
 import deprecation
 import numpy as np
@@ -23,6 +21,9 @@ xtg = XTGeoDialog()
 
 logger = null_logger(__name__)
 
+if TYPE_CHECKING:
+    from xtgeo.common.types import FileLike
+
 KeywordTuple = Tuple[str, str, int, int]
 KeywordDateTuple = Tuple[str, str, int, int, Union[str, int]]
 GridPropertiesKeywords = Union[
@@ -731,7 +732,7 @@ def _consistency_check(self):
 
     @staticmethod
     def scan_keywords(
-        pfile: Union[str, pathlib.Path, io.BytesIO, io.StringIO],
+        pfile: FileLike,
         fformat: Literal["roff", "xecl"] = "xecl",
         maxkeys: int = MAXKEYWORDS,
         dataframe: bool = False,
@@ -754,19 +755,14 @@ def scan_keywords(
         For Eclipse, the byteposition is to the KEYWORD, while for ROFF the byte
         position is
to the beginning of the actual data. - Parameters: - pfile: - Name or a filehandle to file with properties. - fformat: - xecl (Eclipse INIT, RESTART, ...) or roff for ROFF binary. + Args: + pfile: Name or a filehandle to file with properties. + fformat: xecl (Eclipse INIT, RESTART, ...) or roff for ROFF binary. Default is "xecl". - maxkeys: - Maximum number of keys. Default is + maxkeys: Maximum number of keys. Default is ``xtgeo.commom.constants.MAXKEYWORDS``. - dataframe: - If True, return a Pandas dataframe instead. Default is False. - dates: - If True, the date is the last column (only + dataframe: If True, return a Pandas dataframe instead. Default is False. + dates: If True, the date is the last column (only meaningful for restart files). Default is False. Returns: diff --git a/tests/test_grid3d/test_grid_properties.py b/tests/test_grid3d/test_grid_properties.py index 98f8750b8..a2e80a810 100644 --- a/tests/test_grid3d/test_grid_properties.py +++ b/tests/test_grid3d/test_grid_properties.py @@ -122,18 +122,6 @@ def test_gridproperties_from_roff(grid_property): assert props.names == [grid_property.name] -def test_gridproperties_from_roff_with_name_starting_with_na(): - grid_property = xtgeo.GridProperty(name="NA") - buff = io.BytesIO() - grid_property.to_file(buff, fformat="roff") - buff.seek(0) - props = xtgeo.gridproperties_from_file( - buff, fformat="roff", names=[grid_property.name] - ) - - assert props.names == [grid_property.name] - - @given(gridproperties_elements()) def test_gridproperties_invalid_format(grid_property): buff = io.BytesIO() @@ -192,12 +180,25 @@ def test_scan_keywords_invalid_file(): GridProperties.scan_keywords(TPATH / "notafile.UNRST") +def test_scan_keywords_roff_as_tuple_list(): + """A static method to scan quickly keywords in a ROFF file""" + t1 = xtg.timer() + keywords = GridProperties.scan_keywords(XFILE2, fformat="roff") + t2 = xtg.timer(t1) + logger.info("Keywords scanned in %s seconds", t2) + assert keywords[0] == 
("filedata!byteswaptest", "int", 1, 111) + assert keywords[-1] == ("parameter!data", "int", 35840, 806994) + logger.info(keywords) + + def test_scan_keywords_roff(): """A static method to scan quickly keywords in a ROFF file""" t1 = xtg.timer() df = GridProperties.scan_keywords(XFILE2, dataframe=True, fformat="roff") t2 = xtg.timer(t1) - logger.info("Dates scanned in %s seconds", t2) + logger.info("Keywords scanned in %s seconds", t2) + assert tuple(df.iloc[0]) == ("filedata!byteswaptest", "int", 1, 111) + assert tuple(df.iloc[-1]) == ("parameter!data", "int", 35840, 806994) logger.info(df)