Skip to content

Commit

Permalink
Merge pull request #222 from KhiopsML/dev
Browse files Browse the repository at this point in the history
Release 10.2.2.3
  • Loading branch information
popescu-v authored Aug 2, 2024
2 parents fe00d30 + 30bfc09 commit 7da3e71
Show file tree
Hide file tree
Showing 9 changed files with 166 additions and 96 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,11 @@ jobs:
kh-samples sklearn -i khiops_classifier -e
# Test that the line containing "MPI command" also contains
# "mpiexec", which means that `mpiexec` has been found
kh-status | grep "MPI command" | grep -wq mpiexec
# an executable name under a /bin directory
# Note: this executable name can be different, depending on the MPI
# backend and OS; for instance, "orterun" for OpenMPI on Ubuntu Linux, but
# "mpiexec" for OpenMPI on Rocky Linux
kh-status | grep "MPI command" | grep -Ewq "(/.+?)/bin/.+"
release:
if: github.ref_type == 'tag'
needs: [build, test]
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ jobs:
OMPI_MCA_rmaps_base_oversubscribe: true
PRTE_MCA_rmaps_default_mapping_policy: :oversubscribe
run: |
# Make sure '/bin' is before '/usr/bin' in PATH
PATH=$(echo "/bin:"$PATH | sed 's#:/bin##')
# This is needed so that the Git tag is parsed and the khiops-python
# version is retrieved
git config --global --add safe.directory $(realpath .)
Expand Down Expand Up @@ -177,6 +180,9 @@ jobs:
# Force > 2 CPU cores to launch mpiexec
KHIOPS_PROC_NUMBER: 4
run: |-
# Make sure '/bin' is before '/usr/bin' in PATH
PATH=$(echo "/bin:"$PATH | sed 's#:/bin##')
# Make sure MPI support is not loaded through env modules
# Note: As Docker container's shell is non-interactive, environment
# modules are currently not initializing the shell anyway
Expand Down
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
- Example: 10.2.1.4 is the 5th version that supports khiops 10.2.1.
- Internals: Changes in *Internals* sections are unlikely to be of interest for data scientists.

## 10.2.2.3 - 2024-08-02

### Fixed
- (`core`) API functions handling of unknown parameters: they now fail.
- *Internals*:
- Detection of the path to the MPI command: the real path to the executable is
now used.

## 10.2.2.2 - 2024-07-19

### Fixed
Expand Down
14 changes: 9 additions & 5 deletions khiops/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,11 @@ def _preprocess_task_arguments(task_args):
)
)

# Flatten kwargs
if "kwargs" in task_args:
task_args.update(task_args["kwargs"])
del task_args["kwargs"]

return task_called_with_domain


Expand Down Expand Up @@ -336,10 +341,10 @@ def _preprocess_format_spec(detect_format, header_line, field_separator):
def _clean_task_args(task_args):
"""Cleans the task arguments
More precisely:
- It removes command line arguments (they already are in another object).
- It removes parameters removed from the API and warns about it.
- It removes renamed API parameters and warns about it.
More precisely it removes:
- Command line arguments (they already are in another object).
- Parameters removed from the API and warns about it.
- Renamed API parameters and warns about it.
"""
# Remove non-task parameters
command_line_arg_names = [
Expand All @@ -353,7 +358,6 @@ def _clean_task_args(task_args):
"trace",
"stdout_file_path",
"stderr_file_path",
"kwargs",
]
for arg_name in command_line_arg_names + other_arg_names:
if arg_name in task_args:
Expand Down
15 changes: 11 additions & 4 deletions khiops/core/internals/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1146,8 +1146,12 @@ def _initialize_mpi_command_args(self):
installation_method = _infer_khiops_installation_method()
# In Conda-based, but non-Conda environment, specify mpiexec path
if installation_method == "conda-based":
mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or os.path.join(
_infer_env_bin_dir_for_conda_based_installations(), "mpiexec"
# Python `os.path.realpath` resolves symlinks recursively, like GNU
# `readlink -f`; Python `os.readlink` does not
mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or os.path.realpath(
os.path.join(
_infer_env_bin_dir_for_conda_based_installations(), "mpiexec"
)
)
if platform.system() == "Windows" and not os.path.splitext(mpiexec_path):
mpiexec_path += ".exe"
Expand All @@ -1165,8 +1169,11 @@ def _initialize_mpi_command_args(self):
)
# In Conda or local installations, expect mpiexec in the PATH
else:
mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or shutil.which(
"mpiexec"
link_to_mpiexec = shutil.which("mpiexec")
mpiexec_path = (
os.environ.get("KHIOPS_MPIEXEC_PATH")
or link_to_mpiexec
and os.path.realpath(link_to_mpiexec)
)
# If mpiexec is not in the path, and the installation method is local,
# then try to load MPI environment module so that mpiexec is in the path
Expand Down
4 changes: 3 additions & 1 deletion khiops/sklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1464,9 +1464,11 @@ def _fit_prepare_training_function_inputs(self, dataset, computation_dir):
# Build the optional parameters from a copy of the estimator parameters
kwargs = self.get_params()

# Remove 'key' and 'output_dir'
# Remove non core.api params
del kwargs["key"]
del kwargs["output_dir"]
del kwargs["auto_sort"]
del kwargs["internal_sort"]

# Set the sampling percentage to a 100%
kwargs["sample_percentage"] = 100
Expand Down
61 changes: 34 additions & 27 deletions khiops/sklearn/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,27 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
y,
categorical_target=categorical_target,
)
# A sparse matrix
# A scipy.sparse.spmatrix
elif isinstance(X, sp.spmatrix):
self._init_tables_from_sparse_matrix(
X, y, categorical_target=categorical_target
)
# Special rejection for scipy.sparse.sparray (to pass the sklearn tests)
# Note: We don't use scipy.sparse.sparray because it is not implemented in scipy
# 1.10 which is the latest supporting py3.8
elif isinstance(
X,
(
sp.bsr_array,
sp.coo_array,
sp.csc_array,
sp.csr_array,
sp.dia_array,
sp.dok_array,
sp.lil_array,
),
):
check_array(X, accept_sparse=False)
# A tuple spec
elif isinstance(X, tuple):
warnings.warn(
Expand Down Expand Up @@ -1425,32 +1441,23 @@ def _write_sparse_block(self, row_index, stream, target=None):
assert target in self.target_column, "'target' must be in the target column"
stream.write(f"{target}\t")
row = self.matrix.getrow(row_index)
# Empty row in the sparse matrix: use the first variable as missing data
# TODO: remove this part once Khiops bug
# https://github.com/KhiopsML/khiops/issues/235 is solved
if row.size == 0:
for variable_index in self.column_ids:
stream.write(f"{variable_index + 1}: ")
break
# Non-empty row in the sparse matrix: get non-missing data
else:
# Variable indices are not always sorted in `row.indices`
# Khiops needs variable indices to be sorted
sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort")

# Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API
# is not homogeneous with other sparse matrices: it stores
# opaque Python lists as elements
# Thus:
# - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then
# row.data is np.array([list([...])])
# - else, row.data is np.array([...])
# TODO: remove this flattening once Python 3.8 support is dropped
sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[
sorted_indices.argsort()
]
for variable_index, variable_value in zip(sorted_indices, sorted_data):
stream.write(f"{variable_index + 1}:{variable_value} ")
# Variable indices are not always sorted in `row.indices`
# Khiops needs variable indices to be sorted
sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort")

# Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API
# is not homogeneous with other sparse matrices: it stores
# opaque Python lists as elements
# Thus:
# - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then
# row.data is np.array([list([...])])
# - else, row.data is np.array([...])
# TODO: remove this flattening once Python 3.8 support is dropped
sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[
sorted_indices.argsort()
]
for variable_index, variable_value in zip(sorted_indices, sorted_data):
stream.write(f"{variable_index + 1}:{variable_value} ")
stream.write("\n")

def create_table_file_for_khiops(self, output_dir, sort=True):
Expand Down
Loading

0 comments on commit 7da3e71

Please sign in to comment.