diff --git a/toolkit/__init__.py b/toolkit/__init__.py
index 4e274e71..e3419e2e 100644
--- a/toolkit/__init__.py
+++ b/toolkit/__init__.py
@@ -47,9 +47,9 @@
 """
 
 __status__ = "Development"
-__version__ = "1.0.0"
+__version__ = "1.1.0"
 __license__ = "Apache License 2.0"
-__copyright__ = "Copyright 2022, Petróleo Brasileiro S.A."
+__copyright__ = "Copyright 2024, Petróleo Brasileiro S.A."
 __authors__ = [
     "Ricardo Emanuel Vaz Vargas",
     "Bruno Guberfain do Amaral",
@@ -73,6 +73,9 @@
     EventType,
     LABELS_DESCRIPTIONS,
     NORMAL_LABEL,
+    PARQUET_EXTENSION,
+    PARQUET_ENGINE,
+    PARQUET_COMPRESSION,
     PATH_3W_PROJECT,
     PATH_DATASET,
     PATH_DATASET_INI,
diff --git a/toolkit/base.py b/toolkit/base.py
index 6d1f5d47..ff3e72de 100644
--- a/toolkit/base.py
+++ b/toolkit/base.py
@@ -62,14 +62,14 @@ def load_config_in_dataset_ini():
 #
 DATASET_INI = load_config_in_dataset_ini()
-DATASET_VERSION = DATASET_INI.get("Versions").get("DATASET")
+DATASET_VERSION = DATASET_INI.get("VERSION").get("DATASET")
 
-COLUMNS_DESCRIPTIONS = dict(DATASET_INI.get("Columns of CSV Data Files"))
+COLUMNS_DESCRIPTIONS = dict(DATASET_INI.get("PARQUET_FILE_PROPERTIES"))
 COLUMNS_DATA_FILES = list(COLUMNS_DESCRIPTIONS.keys())
 VARS = COLUMNS_DATA_FILES[1:-1]
 CLASS = COLUMNS_DATA_FILES[-1]
 
-events_section = DATASET_INI.get("Events")
+events_section = DATASET_INI.get("EVENTS")
 EVENT_NAMES = [n.strip() for n in events_section.get("NAMES").split(",")]
 EXTRA_INSTANCES_TRAINING = events_section.getint("EXTRA_INSTANCES_TRAINING")
 TRANSIENT_OFFSET = events_section.getint("TRANSIENT_OFFSET")
@@ -96,6 +96,11 @@ def load_config_in_dataset_ini():
     else:
         EVENT_NAMES_OBSERVATION_LABELS[n] = {NORMAL_LABEL, l}
 
+# Parquet settings loaded from the dataset configuration file
+parquet_settings = DATASET_INI.get("PARQUET_SETTINGS")
+PARQUET_EXTENSION = parquet_settings.get("PARQUET_EXTENSION")
+PARQUET_ENGINE = parquet_settings.get("PARQUET_ENGINE")
+PARQUET_COMPRESSION = parquet_settings.get("PARQUET_COMPRESSION")
 
 # Classes
 #
diff --git a/toolkit/misc.py b/toolkit/misc.py
index 06605f78..1715568c 100644
--- a/toolkit/misc.py
+++ b/toolkit/misc.py
@@ -31,6 +31,8 @@
     PATH_DATASET,
     VARS,
     EVENT_NAMES,
+    PARQUET_EXTENSION,
+    PARQUET_ENGINE,
 )
 
 
@@ -62,8 +64,8 @@ def label_and_file_generator(real=True, simulated=False, drawn=False):
         if i.is_dir():
             label = int(i.stem)
             for fp in i.iterdir():
-                # Considers only csv files
-                if fp.suffix == ".csv":
+                # Considers only Parquet files
+                if fp.suffix == PARQUET_EXTENSION:
                     # Considers only instances from the requested
                     # source
                     if (
@@ -235,13 +237,14 @@ def load_instance(instance):
             and contain its label (int) and its full path (Path).
 
     Raises:
-        Exception: Error if the CSV file passed as arg cannot be read.
+        Exception: Error if the Parquet file passed as arg cannot be
+            read.
 
     Returns:
         pandas.DataFrame: Its index contains the timestamps loaded from
-            the CSV file. Its columns contain data loaded from the other
-            columns of the CSV file and metadata loaded from the
-            argument `instance` (label, well, and id).
+            the Parquet file. Its columns contain data loaded from the
+            other columns of the Parquet file and metadata loaded from
+            the argument `instance` (label, well, and id).
     """
     # Loads label metadata from the argument `instance`
     label, fp = instance
@@ -250,8 +253,8 @@ def load_instance(instance):
     # Loads well and id metadata from the argument `instance`
     well, id = fp.stem.split("_")
 
-    # Loads data from the CSV file
-    df = pd.read_csv(fp, index_col="timestamp", parse_dates=["timestamp"])
+    # Loads data from the Parquet file
+    df = pd.read_parquet(fp, engine=PARQUET_ENGINE)
     assert (
         df.columns == COLUMNS_DATA_FILES[1:]
     ).all(), f"invalid columns in the file {fp}: {df.columns.tolist()}"
@@ -281,9 +284,9 @@ def load_instances(instances):
 
     Returns:
         pandas.DataFrame: Its index contains the timestamps loaded from
-            the CSV files. Its columns contain data loaded from the
-            other columns of the CSV files and the metadata label, well,
-            and id).
+            the Parquet files. Its columns contain data loaded from the
+            other columns of the Parquet files and the metadata (label,
+            well, and id).
     """
     # Prepares for multiple parallel loadings
     pool = ThreadPool()
@@ -328,9 +331,7 @@ def create_and_plot_scatter_map(real_instances):
     well_times = defaultdict(list)
     well_classes = defaultdict(list)
     for (well, id, label), (tmin, tmax) in df_time.iterrows():
-        well_times[well].append(
-            (tmin.toordinal(), (tmax.toordinal() - tmin.toordinal()))
-        )
+        well_times[well].append((tmin, (tmax - tmin)))
         well_classes[well].append(label)
 
     wells = df["well"].unique()
@@ -349,7 +350,7 @@ def create_and_plot_scatter_map(real_instances):
     plt.rcParams["axes.labelsize"] = 9
     plt.rcParams["font.size"] = 9
     plt.rcParams["legend.fontsize"] = 9
-    fig, ax = plt.subplots(figsize=(9, 4))
+    fig, ax = plt.subplots(figsize=(9, 9))
     yticks = []
     yticks_labels = []
     for well in well_times.keys():
@@ -379,8 +380,8 @@ def create_and_plot_scatter_map(real_instances):
         frameon=False,
         handles=legend_colors,
         loc="upper center",
-        bbox_to_anchor=(0.5, 1.22),
-        ncol=3,
+        bbox_to_anchor=(0.5, 1.12),
+        ncol=4,
     )
 
     return first_year, last_year
@@ -394,7 +395,8 @@ def count_properties_instance(instance):
             and contain its label (int) and its full path (Path).
 
     Raises:
-        Exception: Error if the CSV file passed as arg cannot be read.
+        Exception: Error if the Parquet file passed as arg cannot be
+            read.
 
     Returns:
         dict: Dict containing the counted properties with the following
@@ -408,8 +410,8 @@ def count_properties_instance(instance):
     p = {"n_vars_missing": 0, "n_vars_frozen": 0}
 
     try:
-        # Read the CSV file
-        df = pd.read_csv(fp, index_col="timestamp", parse_dates=["timestamp"])
+        # Read the Parquet file
+        df = pd.read_parquet(fp, engine=PARQUET_ENGINE)
     except Exception as e:
         raise Exception(f"error reading file {fp}: {e}")
 
@@ -551,14 +553,16 @@ def resample(data, n, class_number):
 
 
 def plot_instance(class_number, instance_index, resample_factor):
-    """Plot one especific event class and instance. By default the instance is downsampling (n=100) and Z-score Scaler.
-    In order to help the visualization transient labels was changed to '0.5'.
+    """Plot one specific event class and instance. By default, the
+    instance is downsampled (n=100) and scaled with a Z-score scaler.
+    To help visualization, transient labels are changed to '0.5'.
 
     Args:
-        class_number (integer): integer that represents the event class [0-8]
+        class_number (integer): integer that represents the event class
+            [0-8]
         instance_index (integer): input the instance file index
     """
-    instances_path = os.path.join(PATH_DATASET, str(class_number), "*.csv")
+    instances_path = os.path.join(PATH_DATASET, str(class_number), "*" + PARQUET_EXTENSION)
    instances_path_list = glob.glob(instances_path)
     if class_number > 8 or class_number < 0:
         print(
@@ -569,12 +573,12 @@ def plot_instance(class_number, instance_index, resample_factor):
             f"instance index {instance_index} out of range - Insert a valid index between 0 and {len(instances_path_list)-1}"
         )
     else:
-        df_instance = pd.read_csv(
-            instances_path_list[instance_index], sep=",", header=0
-        )
-
+        df_instance = pd.read_parquet(
+            instances_path_list[instance_index], engine=PARQUET_ENGINE
+        )
         df_instance_resampled = resample(df_instance, resample_factor, class_number)
-        df_drop_resampled = df_instance_resampled.drop(["timestamp", "class"], axis=1)
+        # The timestamp is now the index, so only the class column is dropped
+        df_drop_resampled = df_instance_resampled.drop(["class"], axis=1)
         df_drop_resampled.interpolate(
             method="linear", limit_direction="both", axis=0, inplace=True