From 8310de280472b789360833806d0ff1c151df8b84 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Tue, 26 Mar 2024 14:39:59 +0100 Subject: [PATCH 01/37] Remove executables from khiops conda package The `khiops` conda package now contains only the python code. The conda package with the Khiops executables (`khiops-core`) is now created in the Khiops repository. The new `khiops` package is "noarch" so there is now only one artifact. --- .github/workflows/conda.yml | 142 ++++++++---------------- .pre-commit-config.yaml | 2 +- packaging/conda/bld.bat | 34 ------ packaging/conda/build.sh | 121 -------------------- packaging/conda/conda_build_config.yaml | 13 --- packaging/conda/meta.yaml | 56 +--------- 6 files changed, 56 insertions(+), 312 deletions(-) delete mode 100644 packaging/conda/bld.bat delete mode 100644 packaging/conda/build.sh delete mode 100644 packaging/conda/conda_build_config.yaml diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 08ed5950..54e4fc3e 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -1,14 +1,16 @@ --- -name: Conda Packages +name: Conda Package env: - DEFAULT_KHIOPS_REVISION: main DEFAULT_SAMPLES_REVISION: main + # Note: The default Khiops version must never be an alpha release as they are + # ephemeral. To test alpha versions run the workflow manually. + DEFAULT_KHIOPS_CORE_VERSION: 10.2.1 on: workflow_dispatch: inputs: - khiops-revision: - default: main - description: khiops repo revision + khiops-core-version: + default: 10.2.1 + description: khiops-core version for testing samples-revision: default: main description: khiops-samples repo revision @@ -29,18 +31,7 @@ concurrency: cancel-in-progress: true jobs: build: - strategy: - fail-fast: false - matrix: - # Use the oldest supported Mac OS and Ubuntu versions for GLIBC compatibility - include: - - os: ubuntu-20.04 - os-family: linux - - os: windows-latest - os-family: windows - - os: macos-11 - os-family: macos - runs-on: ${{ matrix.os }} + runs-on: ubuntu-22.04 steps: - name: Checkout Sources uses: actions/checkout@v4 @@ -52,34 +43,19 @@ jobs: uses: conda-incubator/setup-miniconda@v3 with: miniconda-version: latest - python-version: '3.11' + python-version: '3.12' - name: Install Dependency Requirements for Building Conda Packages - run: conda install conda-build=3.27.0 conda-verify - # We need MacOS SDK 10.10 to build on Big Sur - - name: Install Mac OS SDK 10.10 - if: runner.os == 'macOS' - run: | - wget https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacOSX10.10.sdk.tar.xz - sudo tar -zxvf MacOSX10.10.sdk.tar.xz -C /opt - - name: Set KHIOPS_REVISION build input parameter - run: | - KHIOPS_REVISION="${{ inputs.khiops-revision || env.DEFAULT_KHIOPS_REVISION }}" - echo "KHIOPS_REVISION=$KHIOPS_REVISION" >> "$GITHUB_ENV" - - name: Build Khiops Conda Package (Windows) - if: runner.os == 'Windows' - run: | - mkdir khiops-conda - conda build --output-folder khiops-conda ./packaging/conda - # In Linux/macOS we need the conda-forge channel to install their pinned versions - - name: Build Khiops Conda Package (Linux/macOS) - if: runner.os != 'Windows' + run: conda install conda-build + - name: Build the Conda Package + # Note: The "khiops-dev" conda channel is needed to retrieve the "khiops-core" package. + # The "test" part of the conda recipe needs this package. 
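+        # (conda build runs the recipe's "test" section by default right after the
+        # build, so the "khiops-core" package must be resolvable at build time.)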
run: | - mkdir khiops-conda - conda build --channel conda-forge --output-folder khiops-conda ./packaging/conda - - name: Upload Khiops Conda Package + conda build --channel conda-forge --channel khiops-dev \ + --output-folder ./khiops-conda ./packaging/conda + - name: Upload Conda Package Artifact uses: actions/upload-artifact@v4 with: - name: khiops-conda-${{ matrix.os-family }} + name: khiops-conda path: ./khiops-conda retention-days: 7 # Test Conda package on brand new environments @@ -88,16 +64,17 @@ jobs: strategy: fail-fast: false matrix: - env: - - {os: ubuntu-20.04, os-family: linux} - - {os: ubuntu-22.04, os-family: linux} - - {os: windows-2019, os-family: windows} - - {os: windows-2022, os-family: windows} - - {os: macos-11, os-family: macos} - - {os: macos-12, os-family: macos} - - {os: macos-13, os-family: macos} + os: + - ubuntu-20.04 + - ubuntu-22.04 + - windows-2019 + - windows-2022 + - macos-11 + - macos-12 + - macos-13 + - macos-14 python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - runs-on: ${{ matrix.env.os }} + runs-on: ${{ matrix.os }} env: KHIOPS_SAMPLES_DIR: ./khiops-samples-repo steps: @@ -116,15 +93,23 @@ jobs: - name: Download Conda Package Artifact uses: actions/download-artifact@v4 with: - name: khiops-conda-${{ matrix.env.os-family }} - path: khiops-conda + name: khiops-conda + path: ./khiops-conda + - name: Put the khiops-core Version in the Environment + run: | + KHIOPS_CORE_VERSION="${{ inputs.khiops-core-version || env.DEFAULT_KHIOPS_CORE_VERSION }}" + echo "KHIOPS_CORE_VERSION=$KHIOPS_CORE_VERSION" >> "$GITHUB_ENV" - name: Install the Khiops Conda pagkage (Windows) if: runner.os == 'Windows' - run: conda install -c ./khiops-conda/ khiops + run: | + conda install --channel khiops-dev khiops-core=$KHIOPS_CORE_VERSION + conda install --channel ./khiops-conda/ khiops # In Linux/macOS we need the conda-forge channel to install their pinned versions - name: Install the Khiops Conda package (Linux/macOS) if: runner.os != 'Windows' - run: conda install -c conda-forge -c ./khiops-conda/ khiops + run: | + conda install --channel conda-forge --channel khiops-dev khiops-core=$KHIOPS_CORE_VERSION + conda install --channel ./khiops-conda/ khiops - name: Test Khiops Installation Status run: kh-status - name: Test Conda Package Installation on Samples @@ -140,7 +125,7 @@ jobs: release: if: github.ref_type == 'tag' needs: test - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: contents: write steps: @@ -148,20 +133,19 @@ jobs: uses: actions/download-artifact@v4 with: # See the upload-artifact step in the build job for the explanation of this pattern + name: khiops-conda path: ./khiops-conda - pattern: khiops-conda-* - merge-multiple: true - name: Install Miniconda uses: conda-incubator/setup-miniconda@v3 with: miniconda-version: latest - python-version: '3.11' - - name: Install requirement packages - run: conda install -y anaconda-client conda-build=3.27.0 + python-version: '3.12' + - name: Install Requirement Packages + run: conda install -y anaconda-client conda-index - name: Reindex the package directory - run: conda-index ./khiops-conda - - name: Upload the packages to anaconda.org - run: | + run: python -m conda_index ./khiops-conda + - name: Upload the Package to anaconda.org + run: |- # Set the anaconda.org channel ANACONDA_CHANNEL="${{ inputs.release-channel || 'khiops-dev' }}" @@ -169,37 +153,9 @@ jobs: if [[ "$ANACONDA_CHANNEL" == "khiops" ]] then anaconda --token "${{ secrets.KHIOPS_ANACONDA_CHANNEL_TOKEN }}" upload \ - --user "$ANACONDA_CHANNEL" 
./khiops-conda/*/*.tar.bz2 + --user "$ANACONDA_CHANNEL" ./khiops-conda/noarch/*.tar.bz2 # For the dev channel: upload with forcing else anaconda --token "${{ secrets.KHIOPS_DEV_ANACONDA_CHANNEL_TOKEN }}" upload \ - --user "$ANACONDA_CHANNEL" --force ./khiops-conda/*/*.tar.bz2 + --user "$ANACONDA_CHANNEL" --force ./khiops-conda/noarch/*.tar.bz2 fi - - name: Extract package version - run: | - PKG_VERSION=$(\ - conda search --override-channels --channel ./khiops-conda/ khiops \ - | awk '!/#|channels/ {print $2}' \ - | sort -u \ - ) - echo "PKG_VERSION=$PKG_VERSION" >> "$GITHUB_ENV" - - name: Create the release zip archive - uses: thedoctor0/zip-release@0.7.6 - with: - type: zip - path: ./khiops-conda/ - filename: khiops-${{ env.PKG_VERSION }}-conda.zip - - name: Upload conda package artifacts for all platforms - uses: actions/upload-artifact@v4 - with: - name: khiops-conda-all - path: ./khiops-${{ env.PKG_VERSION }}-conda.zip - - name: Release the zip archive - uses: ncipollo/release-action@v1 - with: - allowUpdates: true - artifacts: ./khiops-${{ env.PKG_VERSION }}-conda.zip - draft: false - makeLatest: false - prerelease: true - updateOnlyUnreleased: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9786ede1..1c217d93 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: rev: 1.15.0 hooks: - id: yamlfix - exclude: packaging/conda/(meta|conda_build_config).yaml + exclude: packaging/conda/meta.yaml - repo: https://github.com/python-jsonschema/check-jsonschema rev: 0.27.1 hooks: diff --git a/packaging/conda/bld.bat b/packaging/conda/bld.bat deleted file mode 100644 index fd2380a6..00000000 --- a/packaging/conda/bld.bat +++ /dev/null @@ -1,34 +0,0 @@ -REM Echo all output -@echo on - -REM Clone Khiops sources -git clone https://github.com/khiopsml/khiops.git khiops_bin -cd .\khiops_bin\ -git checkout "%KHIOPS_REVISION%" -cd .. - -REM Copy relevant Khiops files to current directory -robocopy .\khiops_bin\src .\src /e -robocopy .\khiops_bin\test .\test /e -mkdir .\packaging -robocopy .\khiops_bin\packaging\common .\packaging\common /e -if errorlevel 8 exit 1 -copy /y .\khiops_bin\CMakeLists.txt . -copy /y .\khiops_bin\CMakePresets.json . -copy /y .\khiops_bin\LICENSE . -copy /y .\khiops_bin\packaging\install.cmake .\packaging\ -copy /y .\khiops_bin\packaging\packaging.cmake .\packaging\ - -REM Build the Khiops binaries -cmake --preset windows-msvc-release -DBUILD_JARS=OFF -DTESTING=OFF -cmake --build --preset windows-msvc-release --parallel --target MODL MODL_Coclustering - -REM Copy the MODL binaries to the Conda PREFIX path -mkdir %PREFIX%\bin -copy build\windows-msvc-release\bin\MODL.exe %PREFIX%\bin -copy build\windows-msvc-release\bin\MODL_Coclustering.exe %PREFIX%\bin - -REM Build the Khiops Python package -"%PYTHON%" -m pip install . --no-deps --ignore-installed --no-cache-dir --no-build-isolation -vvv - -if errorlevel 1 exit 1 diff --git a/packaging/conda/build.sh b/packaging/conda/build.sh deleted file mode 100644 index 3bbb4527..00000000 --- a/packaging/conda/build.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash - -# Set-up the shell to behave more like a general-purpose programming language -set -euo pipefail - -# Clone Khiops sources (we change working directory there) -git clone https://github.com/khiopsml/khiops.git khiops-core -cd khiops-core -git checkout "$KHIOPS_REVISION" - -# Copy License file -cp ./LICENSE .. 
- -# Build MODL and MODL_Coclustering -# Note on macOS we need the macOS SDK 10.10 for this conda build to work -if [[ "$(uname)" == "Darwin" ]] -then - CMAKE_PRESET="macos-clang-release" -else - CMAKE_PRESET="linux-gcc-release" -fi -cmake --preset $CMAKE_PRESET -DBUILD_JARS=OFF -DTESTING=OFF -DCMAKE_CXX_COMPILER="$PREFIX/bin/mpicxx" -cmake --build --preset $CMAKE_PRESET --parallel --target MODL MODL_Coclustering - -# Copy the MODL binaries to the Conda PREFIX path -cp "./build/$CMAKE_PRESET/bin/MODL" "$PREFIX/bin" -cp "./build/$CMAKE_PRESET/bin/MODL_Coclustering" "$PREFIX/bin" - - -# Build the Khiops Python package in the base directory -cd .. -$PYTHON -m pip install . --no-deps --ignore-installed --no-cache-dir --no-build-isolation -vvv - -# Custom rpath relocation and signing executables for macOS in arm64 -# -# In osx-arm64 executing any binary that is not signed will make appear popups appearing demanding -# "accepting incoming connections". Since our application doesn't need any connections from the -# outside the machine this doesn't affect the execution but since it is launched with MPI the number -# of popups appearing is high. This is difficult to fix for the user because the if the artifact is -# not signed it will reappear even if we click in the "Allow" button. So we sign the MODL -# executables to solve this (only a single popup concerning mpiexec.hydra may appear but for this -# application pressing on "Allow" works). -# -# However, in the default settings, `conda build` relocalizes the executable by changing rpath of -# the library paths at $PREFIX by relative ones and in doing so it nullifies any signature. So we -# do ourselves this procedure first and then sign the binary. -# -# Note that in meta.yaml for osx-arm64 we have custom build.binary_relocation and -# build.detect_binary_files_with_prefix option -# -# This part must be executed in a root machine to be non-interactive (eg. 
GitHub runner) -# It also needs the following environment variable: -# - KHIOPS_APPLE_CERTIFICATE_COMMON_NAME: The second column of the `security find-identity` command -# A base64 encoded certificate may also be provided, the following 2 variables must be set -# - KHIOPS_APPLE_CERTIFICATE_BASE64: The identity file .p12 (certificate + private key) in base64 -# - KHIOPS_APPLE_CERTIFICATE_PASSWORD: Password for the certificate file -# - KHIOPS_APPLE_TMP_KEYCHAIN_PASSWORD: A temporary password to decrypt the certificate -# -if [[ "$(uname)" == "Darwin" && -n "${KHIOPS_APPLE_CERTIFICATE_COMMON_NAME-}" ]] -then - # Delete the rpath of each executable - # Delete two times for MODL because for some reason it is there 2 times - install_name_tool -delete_rpath "$PREFIX/lib" "$PREFIX/bin/MODL" - install_name_tool -delete_rpath "$PREFIX/lib" "$PREFIX/bin/MODL" - install_name_tool -delete_rpath "$PREFIX/lib" "$PREFIX/bin/MODL_Coclustering" - - # Add the relative rpath as conda build would - install_name_tool -add_rpath "@loader_path/../lib" "$PREFIX/bin/MODL" - install_name_tool -add_rpath "@loader_path/../lib" "$PREFIX/bin/MODL_Coclustering" - - if [[ -n "${KHIOPS_APPLE_CERTIFICATE_BASE64-}" ]] - then - # Keychain setup slightly modified from: https://stackoverflow.com/a/68577995 - # Before importing identity - # - Set the default user login keychain - # - Create a temporary keychain - # - Append temporary keychain to the user domain - # - Remove relock timeout - # - Unlock the temporary keychain - sudo security list-keychains -d user -s login.keychain - sudo security create-keychain -p "$KHIOPS_APPLE_TMP_KEYCHAIN_PASSWORD" kh-tmp.keychain - sudo security list-keychains -d user -s kh-tmp.keychain \ - "$(security list-keychains -d user | sed s/\"//g)" - sudo security set-keychain-settings kh-tmp.keychain - sudo security unlock-keychain -p "$KHIOPS_APPLE_TMP_KEYCHAIN_PASSWORD" kh-tmp.keychain - - # Add identity (certificate + private key) to keychain - echo "$KHIOPS_APPLE_CERTIFICATE_BASE64" \ - | base64 --decode -i - -o kh-cert.p12 - sudo security import kh-cert.p12 \ - -k kh-tmp.keychain \ - -P "$KHIOPS_APPLE_CERTIFICATE_PASSWORD" \ - -A -T "/usr/bin/codesign" - rm -f kh-cert.p12 - - # Enable codesigning from a non user interactive shell - sudo security set-key-partition-list -S apple-tool:,apple:, \ - -s -k "$KHIOPS_APPLE_TMP_KEYCHAIN_PASSWORD" \ - -D "$KHIOPS_APPLE_CERTIFICATE_COMMON_NAME" \ - -t private kh-tmp.keychain - fi - - # We make sure to use the default macOS/Xcode codesign tool. This is because the sigtool python - # package (installed by conda build as a dependency) makes an alias "codesign" which is prioritary - # in the build environment. The alias, however, alias doesn't support signing with a proper - # identity and makes the build fail! 
- CODESIGN="/usr/bin/codesign" - - # Sign the MODL executable and check - $CODESIGN --force --sign "$KHIOPS_APPLE_CERTIFICATE_COMMON_NAME" "$PREFIX/bin/MODL" - $CODESIGN --force --sign "$KHIOPS_APPLE_CERTIFICATE_COMMON_NAME" "$PREFIX/bin/MODL_Coclustering" - $CODESIGN -d -vvv "$PREFIX/bin/MODL" - $CODESIGN -d -vvv "$PREFIX/bin/MODL_Coclustering" - - # Remove the temporary keychain and restore the login keychain as default if created - if [[ -n "${KHIOPS_APPLE_CERTIFICATE_BASE64-}" ]] - then - sudo security delete-keychain kh-tmp.keychain - sudo security list-keychains -d user -s login.keychain - fi -fi diff --git a/packaging/conda/conda_build_config.yaml b/packaging/conda/conda_build_config.yaml deleted file mode 100644 index d7e92ce4..00000000 --- a/packaging/conda/conda_build_config.yaml +++ /dev/null @@ -1,13 +0,0 @@ ---- -python: - - 3.8 - - 3.9 - - 3.10 - - 3.11 - - 3.12 - -# We need MacOS SDK 10.10 to be able to build on Big Sur for x64 -# Download: https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacOSX10.10.sdk.tar.xz -# Decompress then to /opt: tar -zxvf MacOSX10.10.sdk.tar.xz -C /opt -CONDA_BUILD_SYSROOT: - - /opt/MacOSX10.10.sdk # [osx and not arm64] diff --git a/packaging/conda/meta.yaml b/packaging/conda/meta.yaml index a85ec1e8..0e62f11e 100644 --- a/packaging/conda/meta.yaml +++ b/packaging/conda/meta.yaml @@ -7,68 +7,26 @@ source: path: ../../ build: - script_env: - - KHIOPS_REVISION - # Variables for signing the MODL executables in osx-arm64. - {% if "KHIOPS_APPLE_CERTIFICATE_COMMON_NAME" in os.environ %} - - KHIOPS_APPLE_CERTIFICATE_COMMON_NAME # [osx] - # Only available when "KHIOPS_APPLE_CERTIFICATE_BASE64" is defined in the environment. - {% if "KHIOPS_APPLE_CERTIFICATE_BASE64" in os.environ %} - - KHIOPS_APPLE_CERTIFICATE_BASE64 # [osx] - - KHIOPS_APPLE_CERTIFICATE_PASSWORD # [osx] - - KHIOPS_APPLE_TMP_KEYCHAIN_PASSWORD # [osx] - {% endif %} - {% endif %} + number: 0 + noarch: python entry_points: - kh-status = khiops.tools:kh_status_entry_point - kh-samples = khiops.tools:kh_samples_entry_point - kh-download-datasets = khiops.tools:kh_download_datasets_entry_point - pk-status = khiops.tools:pk_status_entry_point # deprecated - number: 0 - # Binary relocation of MODL and MODL_Coclustering is done in build.sh script - # This is to be able to sign it, see the script for more details. - # Only done when "KHIOPS_APPLE_CERTIFICATE_BASE64" is defined in the environment. - {% if "KHIOPS_APPLE_CERTIFICATE_COMMON_NAME" in os.environ %} - binary_relocation: # [osx] - - bin/kh-status # [osx] - - bin/kh-samples # [osx] - - bin/kh-download-datasets # [osx] - - bin/pk-status # [osx] - detect_binary_files_with_prefix: false # [osx] - {% endif %} + script: | + {{ PYTHON }} -m pip install . 
--no-deps --ignore-installed --no-cache-dir --no-build-isolation -vvv -# Note on version pinning: -# OSX: -# - mpich=3.4.3 because 4.* is still unstable -# - requires conda-forge -# Linux: -# - mpich=4.0.3 because of bugs of the 3.* series -# - requires conda-forge requirements: build: - - mpich 4.0.3 # [linux] - - mpich-mpicxx 4.0.3 # [linux] - - mpich 3.4.3 # [osx] - - mpich-mpicxx 3.4.3 # [osx] - - msmpi # [win] - - cmake - - ninja - python - setuptools - - {{ compiler('cxx') }} host: - - mpich 4.0.3 # [linux] - - mpich-mpicxx 4.0.3 # [linux] - - mpich 3.4.3 # [osx] - - mpich-mpicxx 3.4.3 # [osx] - - msmpi # [win] - python run: - - mpich 4.0.3 # [linux] - - mpich 3.4.3 # [osx] - - msmpi # [win] - - pandas >=0.25.3 - python + - khiops-core >=10.0.0,<11.0.0 + - pandas >=0.25.3 - scikit-learn >=0.22.2 run_constrained: - boto3 >=1.17.39 @@ -78,8 +36,6 @@ outputs: - name: {{ metadata.get('name') }} test: commands: - - MODL -v - - MODL_Coclustering -v - kh-status imports: - khiops.core.api From 0b120514ab739a3819a271b6911cbc94aacac639 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Fri, 5 Apr 2024 15:07:36 +0200 Subject: [PATCH 02/37] Improve release checklist at CONTRIBUTING.md --- CONTRIBUTING.md | 53 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 990c5923..5e639bdb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -227,8 +227,8 @@ git stash pop # only when you have non-committed changes ### Package dependencies We should strive to minimize external package dependencies to minimize installation problems. The current dependency policy is: -- `pykhiops.core` should only depend on python built-in modules. -- `pykhiops.sklearn` should only depend on python built-in modules and the following mainstream +- `khiops.core` should only depend on python built-in modules. +- `khiops.sklearn` should only depend on python built-in modules and the following mainstream data-science packages: - [Scikit-learn](https://scikit-learn.org/stable/) - [Pandas](https://pandas.pydata.org/) @@ -244,22 +244,53 @@ carefree while still trying to not add too many dependencies. We follow a non-standard `MAJOR.MINOR.PATCH.INCREMENT[PRE_RELEASE]` versioning convention. The first three numbers `MAJOR.MINOR.PATCH` are the latest Khiops version that is compatible with the package. The number `INCREMENT` indicates the evolution of `khiops-python` followed by an optional -`[PRE_RELEASE` version for alpha, beta and release candidate releases (eg. `b2`). +`[PRE_RELEASE]` version for alpha, beta and release candidate releases (eg. `b2`). ## Releases + +## Pre-releases When tagging a revision the CI will create the packages and upload them to the `khiops-dev` channel. Prefer to augment the pre-release revision number to re-create a tag because the CI overwrites packages with the same tag in the `khiops-dev` channel. Do not forget to clean any temporary -pre-releases from `khiops-dev` and the releases github page. - -To make a public release, you must execute the `Conda Packages` CI workflow manually on a tag and +pre-releases from `khiops-dev` and the releases GitHub page. 
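+For example (hypothetical version numbers): if the pre-release tag `10.2.1.0b1` must be redone,
+prefer tagging the fixed revision as `10.2.1.0b2` rather than deleting and re-creating
+`10.2.1.0b1`.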
+
+## Public Releases
+Checklist:
+- Release issue and its related PR
+  - Update the API Docs if necessary
+  - Update `CHANGELOG.md`
+  - Update the default `khiops-core` version in `.github/workflows/conda.yml`
+- Git manipulations
+  - Update your local repo and save your work:
+    - `git stash # if necessary`
+    - `git fetch --tags --prune --prune-tags`
+    - `git switch dev`
+    - `git pull`
+    - `git switch main`
+    - `git pull`
+  - Merge the `dev` branch into `main`
+    - `git switch main`
+    - `git merge dev`
+  - Tag the merge commit with the release version (see Versioning above)
+    - `git switch main`
+    - `git tag 10.3.0.1 # Just an example`
+  - Make `dev` point to the merge commit just created in `main`
+    - This is necessary to include the merge commit in `main` so that Versioneer can calculate
+      intermediary versions.
+    - Steps:
+      - `git switch dev`
+      - `git reset --hard main`
+      - `git push origin dev` (you need to remove the branch protections of `dev` for this step)
+- Workflows
+  - Execute the `Conda Package` workflow specifying:
+    - The release tag
+    - `khiops` as the release channel
+  - Execute the `API Docs` workflow specifying "Deploy GH Pages".
+
+To make a public release, you must execute the `Conda Package` CI workflow manually on a tag and
 specify the `khiops` anaconda channel for upload. These uploads do not overwrite any packages in
 this channel, so you must correct any mistake manually.
 
 ### Git Manipulations upon a Major Release
 
 The following is the check list to be done upon a major release:
-- Merge the `dev` branch into `main`
-- Tag the merge commit with the release version
-- Rebase the `dev` branch onto `main`
-  - This is necessary to include the merge commit into master to calculate intermediary versions
-    with versioneer

From 6ba7ec5609ecefb18fdded5062c27ea37e7ab4f7 Mon Sep 17 00:00:00 2001
From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com>
Date: Mon, 8 Apr 2024 21:50:02 +0200
Subject: [PATCH 03/37] Add conda package tests for Rocky Linux

---
 .github/workflows/conda.yml | 46 ++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml
index 54e4fc3e..6da1f3df 100644
--- a/.github/workflows/conda.yml
+++ b/.github/workflows/conda.yml
@@ -1,7 +1,7 @@
 ---
 name: Conda Package
 env:
-  DEFAULT_SAMPLES_REVISION: main
+  DEFAULT_SAMPLES_VERSION: 10.2.0
   # Note: The default Khiops version must never be an alpha release as they are
   # ephemeral. To test alpha versions run the workflow manually.
   DEFAULT_KHIOPS_CORE_VERSION: 10.2.1
@@ -11,9 +11,9 @@ on:
       khiops-core-version:
         default: 10.2.1
        description: khiops-core version for testing
-      samples-revision:
-        default: main
-        description: khiops-samples repo revision
+      khiops-samples-version:
+        default: 10.2.0
+        description: khiops-samples version
       release-channel:
         type: choice
         default: khiops-dev
@@ -64,27 +64,21 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os:
-          - ubuntu-20.04
-          - ubuntu-22.04
-          - windows-2019
-          - windows-2022
-          - macos-11
-          - macos-12
-          - macos-13
-          - macos-14
         python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        env:
+          - {os: ubuntu-20.04, json-image: '{"image": null}'}
+          - {os: ubuntu-22.04, json-image: '{"image": null}'}
+          - {os: ubuntu-22.04, json-image: '{"image": "rockylinux:8"}'}
+          - {os: ubuntu-22.04, json-image: '{"image": "rockylinux:9"}'}
+          - {os: windows-2019, json-image: '{"image": null}'}
+          - {os: windows-2022, json-image: '{"image": null}'}
+          - {os: macos-11, json-image: '{"image": null}'}
+          - {os: macos-12, json-image: '{"image": null}'}
+          - {os: macos-13, json-image: '{"image": null}'}
+          - {os: macos-14, json-image: '{"image": null}'}
-    runs-on: ${{ matrix.os }}
-    env:
-      KHIOPS_SAMPLES_DIR: ./khiops-samples-repo
+    runs-on: ${{ matrix.env.os }}
+    container: ${{ fromJSON(matrix.env.json-image) }}
     steps:
-      - name: Checkout Khiops samples
-        uses: actions/checkout@v4
-        with:
-          repository: khiopsml/khiops-samples
-          ref: ${{ inputs.samples-revision || env.DEFAULT_SAMPLES_REVISION }}
-          token: ${{ secrets.GITHUB_TOKEN }}
-          path: ${{ env.KHIOPS_SAMPLES_DIR }}
       - name: Install Miniconda
         uses: conda-incubator/setup-miniconda@v3
         with:
@@ -99,7 +93,7 @@ jobs:
         run: |
           KHIOPS_CORE_VERSION="${{ inputs.khiops-core-version || env.DEFAULT_KHIOPS_CORE_VERSION }}"
           echo "KHIOPS_CORE_VERSION=$KHIOPS_CORE_VERSION" >> "$GITHUB_ENV"
-      - name: Install the Khiops Conda pagkage (Windows)
+      - name: Install the Khiops Conda package (Windows)
         if: runner.os == 'Windows'
         run: |
           conda install --channel khiops-dev khiops-core=$KHIOPS_CORE_VERSION
@@ -112,6 +106,10 @@ jobs:
           conda install --channel ./khiops-conda/ khiops
       - name: Test Khiops Installation Status
         run: kh-status
+      - name: Download Sample Datasets
+        run: |
+          kh-download-datasets \
+            --version ${{ inputs.khiops-samples-version || env.DEFAULT_SAMPLES_VERSION }}
       - name: Test Conda Package Installation on Samples
         run: |
           kh-samples core -i train_predictor -e

From fe0ad717b590a56a0d4d627861098ad89b12ee8d Mon Sep 17 00:00:00 2001
From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com>
Date: Tue, 9 Apr 2024 10:45:54 +0200
Subject: [PATCH 04/37] Update pre-commit repos

---
 .pre-commit-config.yaml         | 10 ++++-----
 doc/conf.py                     |  1 +
 doc/convert_tutorials.py        |  1 +
 khiops/core/api.py              | 20 +++++++++--------
 khiops/core/internals/runner.py |  8 ++++---
 khiops/core/internals/task.py   |  8 +++----
 scripts/update_copyright.py     |  1 +
 tests/test_core.py              | 38 +++++++++++++++++++++------------
 8 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1c217d93..beb7663b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,28 +1,28 @@
 ---
 repos:
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 23.10.1
+    rev: 24.3.0
     hooks:
       - id: black
         language_version: python3
   - repo: https://github.com/pycqa/pylint
-    rev: v3.0.1
+    rev: v3.1.0
     hooks:
       - id: pylint
         language_version: python3
         exclude: doc/convert_samples.py|doc/conf.py|versioneer.py|khiops/_version.py|setup.py
   - repo: https://github.com/pycqa/isort
-
rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort language_version: python3 - repo: https://github.com/lyz-code/yamlfix/ - rev: 1.15.0 + rev: 1.16.0 hooks: - id: yamlfix exclude: packaging/conda/meta.yaml - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.27.1 + rev: 0.28.1 hooks: - id: check-github-workflows args: [--verbose] diff --git a/doc/conf.py b/doc/conf.py index 51a0c670..ab39df32 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,4 +1,5 @@ """Khiops Python Sphinx configuration file""" + import os import sys from datetime import datetime diff --git a/doc/convert_tutorials.py b/doc/convert_tutorials.py index 349f1f93..452e45be 100644 --- a/doc/convert_tutorials.py +++ b/doc/convert_tutorials.py @@ -1,4 +1,5 @@ """Converts the Jupyter notebooks of the Khiops Python tutorial to reST""" + import argparse import glob import os diff --git a/khiops/core/api.py b/khiops/core/api.py index 5f304c95..ad99487f 100644 --- a/khiops/core/api.py +++ b/khiops/core/api.py @@ -128,15 +128,17 @@ def _run_task(task_name, task_args): # Create a command line options object command_line_options = CommandLineOptions( batch_mode=task_args["batch_mode"] if "batch_mode" in task_args else True, - log_file_path=task_args["log_file_path"] - if "log_file_path" in task_args - else "", - output_scenario_path=task_args["output_scenario_path"] - if "output_scenario_path" in task_args - else "", - task_file_path=task_args["task_file_path"] - if "task_file_path" in task_args - else "", + log_file_path=( + task_args["log_file_path"] if "log_file_path" in task_args else "" + ), + output_scenario_path=( + task_args["output_scenario_path"] + if "output_scenario_path" in task_args + else "" + ), + task_file_path=( + task_args["task_file_path"] if "task_file_path" in task_args else "" + ), ) # Clean the task_args to leave only the task arguments diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index 0ae11f01..83181012 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -833,9 +833,11 @@ def _write_task_scenario_file( writer, task, task_args, - general_options - if general_options is not None - else self.general_options, + ( + general_options + if general_options is not None + else self.general_options + ), ) fs.write(scenario_path, scenario_stream.getvalue()) diff --git a/khiops/core/internals/task.py b/khiops/core/internals/task.py index b603d02c..80d33b81 100644 --- a/khiops/core/internals/task.py +++ b/khiops/core/internals/task.py @@ -304,10 +304,10 @@ def write_execution_scenario(self, writer, args): and args["output_additional_data_tables"] is not None ): for data_path in args["output_additional_data_tables"].keys(): - args["output_additional_data_tables"][ - data_path - ] = create_unambiguous_khiops_path( - args["output_additional_data_tables"][data_path] + args["output_additional_data_tables"][data_path] = ( + create_unambiguous_khiops_path( + args["output_additional_data_tables"][data_path] + ) ) # Transform to string-like parameters diff --git a/scripts/update_copyright.py b/scripts/update_copyright.py index 72c9b1fe..f31d1028 100644 --- a/scripts/update_copyright.py +++ b/scripts/update_copyright.py @@ -1,4 +1,5 @@ """Updates the copyright notice of the input files""" + import argparse from datetime import datetime diff --git a/tests/test_core.py b/tests/test_core.py index 78926ce4..1bab6354 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -404,20 +404,30 @@ def test_api_scenario_generation(self): 
bytes(f"{dataset}Deployed.csv", encoding="ascii"), ], "kwargs": { - "additional_data_tables": { - bytes(key, encoding="ascii"): bytes(value, encoding="ascii") - for key, value in additional_data_tables[dataset].items() - } - if additional_data_tables[dataset] is not None - else None, - "output_additional_data_tables": { - bytes(key, encoding="ascii"): bytes(value, encoding="ascii") - for key, value in output_additional_data_tables[ - dataset - ].items() - } - if output_additional_data_tables[dataset] is not None - else None, + "additional_data_tables": ( + { + bytes(key, encoding="ascii"): bytes( + value, encoding="ascii" + ) + for key, value in additional_data_tables[ + dataset + ].items() + } + if additional_data_tables[dataset] is not None + else None + ), + "output_additional_data_tables": ( + { + bytes(key, encoding="ascii"): bytes( + value, encoding="ascii" + ) + for key, value in output_additional_data_tables[ + dataset + ].items() + } + if output_additional_data_tables[dataset] is not None + else None + ), }, } for dataset in datasets From b1416e23280ea5ef8394f74c5ba3a2bcb124a0ab Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 2 Apr 2024 11:13:30 +0200 Subject: [PATCH 05/37] Rename PyKhiops* test class to Khiops* --- tests/test_sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 55b33d76..963abb54 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -2249,7 +2249,7 @@ def test_parameter_transfer_coclustering_predict_from_file_dataset(self): ) -class PyKhiopsSklearnEstimatorStandardTests(unittest.TestCase): +class KhiopsSklearnEstimatorStandardTests(unittest.TestCase): """Tests to comply with `sklearn.util.estimator_checks.check_estimator`""" def test_sklearn_check_estimator(self): From 9bfa2b690c5c88ec7b71eace560e2000692cfec2 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 3 Apr 2024 19:39:48 +0200 Subject: [PATCH 06/37] Give each particular *Table its own `__repr__` method The `dtypes` attribute has been removed in changeset 1e1422dd --- khiops/sklearn/tables.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index 72287f6e..33dac075 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -962,13 +962,6 @@ def create_khiops_dictionary(self): def _get_all_column_ids(self): """Returns the column ids including the target""" - def __repr__(self): - dtypes_str = str(self.dtypes).replace("\n", ", ")[:-16].replace(" ", ":") - return ( - f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " - f"dtypes={dtypes_str}; target={self.target_column_id}>" - ) - class PandasTable(DatasetTable): """Table encapsulating (X,y) pair with types (pandas.DataFrame, pandas.Series) @@ -1067,6 +1060,15 @@ def __init__( # Check key integrity self.check_key() + def __repr__(self): + dtypes_str = ( + str(self.dataframe.dtypes).replace("\n", ", ")[:-16].replace(" ", ":") + ) + return ( + f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " + f"dtypes={dtypes_str}; target={self.target_column_id}>" + ) + def _get_all_column_ids(self): if self.target_column is not None: all_column_ids = list(self.column_ids) + [self.target_column_id] @@ -1197,6 +1199,13 @@ def __init__( } self.n_samples = len(self.array) + def __repr__(self): + dtype_str = str(self.array.dtype) + return ( + 
f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " + f"dtype={dtype_str}; target={self.target_column_id}>" + ) + def _get_all_column_ids(self): n_columns = len(self.column_ids) if self.target_column is not None: From 62961d3c52de8625386ccc92fc1afd364dd7fd34 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 3 Apr 2024 19:41:46 +0200 Subject: [PATCH 07/37] Update PandasTable docstring to account for dataframe label support --- khiops/sklearn/tables.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index 33dac075..7a89de0c 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -964,7 +964,10 @@ def _get_all_column_ids(self): class PandasTable(DatasetTable): - """Table encapsulating (X,y) pair with types (pandas.DataFrame, pandas.Series) + """Table encapsulating the features dataframe X and the target labels y + + X is of type pandas.DataFrame. + y is of type pandas.Series or pandas.DataFrame. Parameters ---------- From 1cdc65f5f615362adb1e2d79694c5c754067c0ac Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 3 Apr 2024 19:54:02 +0200 Subject: [PATCH 08/37] Add SciPy sparse matrix support All types mandated by SciPy / Sklearn are supported Empty rows in sparse matrices are supported as rows where all variables have missing values. This is needed so that Khiops yields predictions for such empty data as well, in order to comply with Scikit Learn standard estimator tests. related_to #42 --- doc/conf.py | 1 + khiops/sklearn/tables.py | 189 +++++++++++++++++++++++++++++++++++- tests/test_dataset_class.py | 73 +++++++++++++- tests/test_sklearn.py | 14 ++- 4 files changed, 265 insertions(+), 12 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index ab39df32..e4027f58 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -52,6 +52,7 @@ "pandas": ("https://pandas.pydata.org/pandas-docs/dev", None), "sklearn": ("https://scikit-learn.org/stable", None), "numpy": ("https://numpy.org/doc/stable", None), + "scipy": ("https://docs.scipy.org/doc/scipy", None), } ## Autosummary extension config diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index 7a89de0c..9bccad55 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -13,11 +13,13 @@ import numpy as np import pandas as pd +import scipy.sparse as sp from sklearn.utils import check_array from sklearn.utils.validation import column_or_1d import khiops.core as kh import khiops.core.internals.filesystems as fs +from khiops.core.dictionary import VariableBlock from khiops.core.internals.common import ( deprecation_message, is_dict_like, @@ -164,6 +166,11 @@ def __init__(self, X, y=None, categorical_target=True, key=None): y, categorical_target=categorical_target, ) + # A sparse matrix + elif isinstance(X, sp.spmatrix): + self._init_tables_from_sparse_matrix( + X, y, categorical_target=categorical_target + ) # A tuple spec elif isinstance(X, tuple): warnings.warn( @@ -226,6 +233,17 @@ def _init_tables_from_dataframe(self, X, y=None, categorical_target=True): ) self.secondary_tables = [] + def _init_tables_from_sparse_matrix(self, X, y=None, categorical_target=True): + """Initializes the dataset from a 'X' of type scipy.sparse.spmatrix""" + assert isinstance(X, sp.spmatrix), "'X' must be a scipy.sparse.spmatrix" + if y is not None and not hasattr(y, "__array__"): + raise TypeError(type_error_message("y", y, "array-like")) + + 
self.main_table = SparseTable( + "main_table", X, target_column=y, categorical_target=categorical_target + ) + self.secondary_tables = [] + def _init_tables_from_numpy_array(self, X, y=None, categorical_target=True): assert hasattr( X, "__array__" @@ -710,14 +728,14 @@ def is_in_memory(self): """Tests whether the dataset is in memory A dataset is in memory if it is constituted either of only pandas.DataFrame - tables or numpy.ndarray tables. + tables, numpy.ndarray, or scipy.sparse.spmatrix tables. Returns ------- bool `True` if the dataset is constituted of pandas.DataFrame tables. """ - return isinstance(self.main_table, (PandasTable, NumpyTable)) + return isinstance(self.main_table, (PandasTable, NumpyTable, SparseTable)) def is_multitable(self): """Tests whether the dataset is a multi-table one @@ -1261,6 +1279,173 @@ def create_table_file_for_khiops(self, output_dir, sort=True): return output_table_path +class SparseTable(DatasetTable): + """Table encapsulating feature matrix X and target array y + + X is of type scipy.sparse.spmatrix. + y is array-like. + + Parameters + ---------- + name : str + Name for the table. + matrix : `scipy.sparse.spmatrix` + The sparse matrix to be encapsulated. + key : list-like of str, optional + The names of the columns composing the key + target_column : :external:term:`array-like`, optional + The array containing the target column. + categorical_target : bool, default ``True``. + ``True`` if the target column is categorical. + """ + + def __init__( + self, name, matrix, key=None, target_column=None, categorical_target=True + ): + assert key is None, "'key' must be unset for sparse matrix tables" + # Call the parent method + super().__init__(name, key=key, categorical_target=categorical_target) + + # Check the sparse matrix types + if not isinstance(matrix, sp.spmatrix): + raise TypeError( + type_error_message("matrix", matrix, "scipy.sparse.spmatrix") + ) + if not np.issubdtype(matrix.dtype, np.number): + raise TypeError( + type_error_message("'matrix' dtype", matrix.dtype, "numeric") + ) + + # Check the target's types + if target_column is not None and not hasattr(target_column, "__array__"): + raise TypeError( + type_error_message("target_column", target_column, "array-like") + ) + + # Initialize the members + self.matrix = matrix + self.column_ids = list(range(self.matrix.shape[1])) + self.target_column_id = self.matrix.shape[1] + self.target_column = target_column + self.categorical_target = categorical_target + self.khiops_types = { + column_id: get_khiops_type(self.matrix.dtype) + for column_id in self.column_ids + } + self.n_samples = self.matrix.shape[0] + + def __repr__(self): + dtype_str = str(self.matrix.dtype) + return ( + f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " + f"dtype={dtype_str}; target={self.target_column_id}>" + ) + + def create_khiops_dictionary(self): + """Creates a Khiops dictionary representing this sparse table + + Adds metadata to each sparse variable + + Returns + ------- + `.Dictionary`: + The Khiops Dictionary object describing this table's schema + + """ + + # create dictionary as usual + dictionary = super().create_khiops_dictionary() + + # create variable block for containing the sparse variables + variable_block = VariableBlock() + variable_block.name = "SparseVariables" + + # For each variable, add metadata, named `VarKey` + variable_names = [variable.name for variable in dictionary.variables] + target_column_variable_name = self.get_khiops_variable_name( + self.target_column_id + ) + for i, 
variable_name in enumerate(variable_names, 1): + if variable_name != target_column_variable_name: + variable = dictionary.remove_variable(variable_name) + variable.meta_data.add_value("VarKey", i) + variable_block.add_variable(variable) + dictionary.add_variable_block(variable_block) + + return dictionary + + def _get_all_column_ids(self): + n_columns = len(self.column_ids) + if self.target_column is not None: + n_columns += 1 + return list(range(n_columns)) + + def get_khiops_variable_name(self, column_id): + """Return the khiops variable name associated to a column id""" + assert column_id == self.target_column_id or column_id in self.column_ids + if isinstance(column_id, str): + variable_name = column_id + else: + assert isinstance(column_id, (np.int64, int)) + variable_name = f"Var{column_id}" + return variable_name + + def _write_sparse_block(self, row_index, stream, target=None): + assert row_index in range( + self.matrix.shape[0] + ), "'row_index' must be coherent with the shape of the sparse matrix" + if target is not None: + assert target in self.target_column, "'target' must be in the target column" + stream.write(f"{target}\t") + row = self.matrix.getrow(row_index) + # Empty row in the sparse matrix: use the first variable as missing data + # TODO: remove this part once Khiops bug + # https://github.com/KhiopsML/khiops/issues/235 is solved + if row.data.size == 0: + for variable_index in self.column_ids: + stream.write(f"{variable_index + 1}: ") + break + # Non-empty row in the sparse matrix: get non-missing data + else: + # Variable indices are not always sorted in `row.indices` + # Khiops needs variable indices to be sorted + sorted_indices = np.sort(row.indices, axis=-1, kind="mergesort") + sorted_data = row.data[sorted_indices.argsort()] + for variable_index, variable_value in zip(sorted_indices, sorted_data): + stream.write(f"{variable_index + 1}:{variable_value} ") + stream.write("\n") + + def create_table_file_for_khiops(self, output_dir, sort=True): + # Create the output table resource object + output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") + + # Write the sparse matrix to an internal table file + with io.StringIO() as output_sparse_matrix_stream: + if self.target_column is not None: + target_column_name = self.get_khiops_variable_name( + self.target_column_id + ) + output_sparse_matrix_stream.write( + f"{target_column_name}\tSparseVariables\n" + ) + for target, row_index in zip( + self.target_column, range(self.matrix.shape[0]) + ): + self._write_sparse_block( + row_index, output_sparse_matrix_stream, target=target + ) + else: + output_sparse_matrix_stream.write("SparseVariables\n") + for row_index in range(self.matrix.shape[0]): + self._write_sparse_block(row_index, output_sparse_matrix_stream) + fs.write( + output_table_path, + output_sparse_matrix_stream.getvalue().encode("utf-8"), + ) + + return output_table_path + + class FileTable(DatasetTable): """A table representing a delimited text file diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index 0fdeeeba..8d071998 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd +import scipy.sparse as sp from numpy.testing import assert_equal from pandas.testing import assert_frame_equal from sklearn import datasets @@ -521,8 +522,8 @@ def test_created_file_from_dataframe_monotable(self): ) def test_created_file_from_numpy_array_monotable(self): - """Test consistency of the created data file with the 
input dataframe""" - # Create a monotable dahaset from a numpy array + """Test consistency of the created data file with the input numpy array""" + # Create a monotable dataset from a numpy array iris = datasets.load_iris() spec = {"tables": {"iris": (iris.data, None)}} dataset = Dataset(spec, y=iris.target, categorical_target=True) @@ -541,6 +542,74 @@ def test_created_file_from_numpy_array_monotable(self): ), ) + def _create_test_sparse_matrix_with_target(self): + # Create sparse array that also contains missing data-only rows + sparse_array = np.eye(N=100, k=2) + np.eye(N=100, k=5) + + # Create scipy sparse (CSR) matrix from the sparse array + sparse_matrix = sp.csr_matrix(sparse_array) + + # Create targets: -1 for left-sided values; +1 for right-sided values, + # 0 for missing-data-only rows + target_array = np.array(50 * [-1] + 45 * [1] + 5 * [0]) + return sparse_matrix, target_array + + def _load_khiops_sparse_file(self, stream): + # skip header + next(stream) + target_vector = [] + feature_matrix = [] + for line in stream: + target, features = line.split(b"\t") + feature_row = np.zeros(100) + for feature in features.strip().split(b" "): + feature_index, feature_value = feature.split(b":") + try: + feature_value = float(feature_value) + # missing value, whence empty string + except ValueError: + feature_value = 0.0 + feature_row[int(feature_index) - 1] = feature_value + feature_matrix.append(feature_row) + target_vector.append(float(target)) + target_array = np.array(target_vector) + sparse_matrix = sp.csr_matrix(feature_matrix) + return sparse_matrix, target_array + + def test_created_file_from_sparse_matrix_monotable(self): + """Test consistency of the created data file with the input sparse matrix""" + + # Load input sparse matrix and target array + ( + input_sparse_matrix, + input_target, + ) = self._create_test_sparse_matrix_with_target() + + # Create monotable dataset from the sparse matrix + dataset = Dataset( + X=input_sparse_matrix, y=input_target, categorical_target=True + ) + # Create and load the intermediary Khiops file + created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + with open(created_table_path, "rb") as created_table_stream: + sparse_matrix, target_array = self._load_khiops_sparse_file( + created_table_stream + ) + + # Check that the arrays are equal + assert_equal( + np.concatenate( + ( + sparse_matrix.toarray(), + target_array.reshape(-1, 1), + ), + axis=1, + ), + np.concatenate( + (input_sparse_matrix.toarray(), input_target.reshape(-1, 1)), axis=1 + ), + ) + def test_created_file_from_data_file_monotable(self): """Test consistency of the created data file with the input data file diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 963abb54..0024f1b3 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -2271,15 +2271,13 @@ def test_sklearn_check_estimator(self): for estimator, check in check_estimator( khiops_estimator, generate_only=True ): - # Skip: - # - sparse data tests (not yet supported) - # - some checks for KhiopsEncoder as they yield "empty" deployed tables - # - To be implemented manually + # Skip some checks for KhiopsEncoder as they yield "empty" + # deployed tables; they need to be implemented manually check_name = check.func.__name__ - if check_name == "check_estimator_sparse_data" or ( - check_name in ["check_fit_score_takes_y", "check_fit_idempotent"] - and isinstance(estimator, KhiopsEncoder) - ): + if check_name in [ + "check_fit_score_takes_y", + "check_fit_idempotent", + ] and 
isinstance(estimator, KhiopsEncoder): continue with self.subTest( sklearn_check_name=check_name, sklearn_check_kwargs=check.keywords From a11fe422f0010d81446c38c7061da049d32bf2d6 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 10 Apr 2024 18:13:23 +0200 Subject: [PATCH 09/37] Add sparse sklearn KhiopsClassifier sample related_to #42 --- doc/convert_samples.py | 2 + doc/samples/samples_sklearn.rst | 52 ++++++++++++++++++++++++ khiops/samples/samples_sklearn.ipynb | 60 ++++++++++++++++++++++++++++ khiops/samples/samples_sklearn.py | 53 ++++++++++++++++++++++++ 4 files changed, 167 insertions(+) diff --git a/doc/convert_samples.py b/doc/convert_samples.py index d4e472aa..ae204a8a 100644 --- a/doc/convert_samples.py +++ b/doc/convert_samples.py @@ -30,6 +30,8 @@ def create_boilerplate_code(script_name): "from sklearn.compose import ColumnTransformer\n", "from sklearn.experimental import enable_hist_gradient_boosting\n", "from sklearn.ensemble import HistGradientBoostingClassifier\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.feature_extraction.text import HashingVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import OneHotEncoder\n", diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 68d96603..2afed2c9 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -44,6 +44,8 @@ preamble: from sklearn.compose import ColumnTransformer from sklearn.experimental import enable_hist_gradient_boosting from sklearn.ensemble import HistGradientBoostingClassifier + from sklearn.datasets import fetch_20newsgroups + from sklearn.feature_extraction.text import HashingVectorizer from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder @@ -105,6 +107,56 @@ Samples print(f"Test accuracy = {test_accuracy}") print(f"Test auc = {test_auc}") +.. autofunction:: khiops_classifier_sparse +.. 
code-block:: python + + def khiops_classifier_sparse(): + + # Load 3 classes of the 20newsgroups dataset + categories = ["comp.graphics", "sci.space", "misc.forsale"] + data_train, y_train = fetch_20newsgroups( + subset="train", + categories=categories, + return_X_y=True, + ) + data_test, y_test = fetch_20newsgroups( + subset="test", + categories=categories, + return_X_y=True, + ) + + # Extract features from the training data using a sparse vectorizer + vectorizer = HashingVectorizer(n_features=2**10, stop_words="english") + X_train = vectorizer.fit_transform(data_train) + + # Extract features from the test data using the same vectorizer + X_test = vectorizer.transform(data_test) + + # Create the classifier object + khc = KhiopsClassifier() + + # Train the classifier + khc.fit(X_train, y_train) + + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[0:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[0:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr") + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") + .. autofunction:: khiops_classifier_multiclass .. code-block:: python diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 3d5e58a0..dd541edb 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -26,6 +26,8 @@ "from sklearn.compose import ColumnTransformer\n", "from sklearn.experimental import enable_hist_gradient_boosting\n", "from sklearn.ensemble import HistGradientBoostingClassifier\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.feature_extraction.text import HashingVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import OneHotEncoder\n", @@ -93,6 +95,64 @@ "khiops_classifier()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def khiops_classifier_sparse():\n", + " \"\"\"Trains a `.KhiopsClassifier` on a monotable sparse matrix\"\"\"\n", + "\n", + " # Load 3 classes of the 20newsgroups dataset\n", + " categories = [\"comp.graphics\", \"sci.space\", \"misc.forsale\"]\n", + " data_train, y_train = fetch_20newsgroups(\n", + " subset=\"train\",\n", + " categories=categories,\n", + " return_X_y=True,\n", + " )\n", + " data_test, y_test = fetch_20newsgroups(\n", + " subset=\"test\",\n", + " categories=categories,\n", + " return_X_y=True,\n", + " )\n", + "\n", + " # Extract features from the training data using a sparse vectorizer\n", + " vectorizer = HashingVectorizer(n_features=2**10, stop_words=\"english\")\n", + " X_train = vectorizer.fit_transform(data_train)\n", + "\n", + " # Extract features from the test data using the same vectorizer\n", + " X_test = vectorizer.transform(data_test)\n", + "\n", + " # Create the classifier object\n", + " khc = KhiopsClassifier()\n", + "\n", + " # Train the classifier\n", + " khc.fit(X_train, y_train)\n", + "\n", + " # Predict the classes on the test dataset\n", + " y_test_pred = khc.predict(X_test)\n", + " print(\"Predicted 
classes (first 10):\")\n", + " print(y_test_pred[0:10])\n", + " print(\"---\")\n", + "\n", + " # Predict the class probabilities on the test dataset\n", + " y_test_probas = khc.predict_proba(X_test)\n", + " print(f\"Class order: {khc.classes_}\")\n", + " print(\"Predicted class probabilities (first 10):\")\n", + " print(y_test_probas[0:10])\n", + " print(\"---\")\n", + "\n", + " # Evaluate accuracy and auc metrics on the test dataset\n", + " test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + " test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class=\"ovr\")\n", + " print(f\"Test accuracy = {test_accuracy}\")\n", + " print(f\"Test auc = {test_auc}\")\n", + "\n", + "#Run sample\n", + "khiops_classifier_sparse()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 189d8554..112bd95f 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -23,6 +23,9 @@ from sklearn.ensemble import HistGradientBoostingClassifier # isort: on +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import HashingVectorizer + # pylint: enable=unused-import from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline @@ -88,6 +91,55 @@ def khiops_classifier(): print(f"Test auc = {test_auc}") +def khiops_classifier_sparse(): + """Trains a `.KhiopsClassifier` on a monotable sparse matrix""" + + # Load 3 classes of the 20newsgroups dataset + categories = ["comp.graphics", "sci.space", "misc.forsale"] + data_train, y_train = fetch_20newsgroups( + subset="train", + categories=categories, + return_X_y=True, + ) + data_test, y_test = fetch_20newsgroups( + subset="test", + categories=categories, + return_X_y=True, + ) + + # Extract features from the training data using a sparse vectorizer + vectorizer = HashingVectorizer(n_features=2**10, stop_words="english") + X_train = vectorizer.fit_transform(data_train) + + # Extract features from the test data using the same vectorizer + X_test = vectorizer.transform(data_test) + + # Create the classifier object + khc = KhiopsClassifier() + + # Train the classifier + khc.fit(X_train, y_train) + + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[0:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[0:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr") + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") + + def khiops_classifier_multiclass(): """Trains a multiclass `.KhiopsClassifier` on a monotable dataframe""" # Load the dataset into a pandas dataframe @@ -761,6 +813,7 @@ def khiops_classifier_multitable_star_file(): exported_samples = [ khiops_classifier, + khiops_classifier_sparse, khiops_classifier_multiclass, khiops_classifier_multitable_star, khiops_classifier_multitable_snowflake, From e6ba8222ffa1bdbeaeb6be45c7f9e9b10c909aef Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Fri, 5 Apr 2024 18:03:25 +0200 Subject: [PATCH 10/37] Support mapping-based dataset input 
with sparse matrices closes #42 --- khiops/sklearn/tables.py | 27 +++++++++++++++++++++++---- tests/test_dataset_class.py | 34 ++++++++++++++++++++++++++++++++++ tests/test_dataset_errors.py | 4 ++-- 3 files changed, 59 insertions(+), 6 deletions(-) diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index 9bccad55..a20631fe 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -437,6 +437,16 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): self.secondary_tables.append( PandasTable(table_name, table_source, key=table_key) ) + # Case of sparse matrices + elif isinstance(main_table_source, sp.spmatrix): + self.main_table = SparseTable( + main_table_name, + main_table_source, + key=main_table_key, + target_column=y, + categorical_target=categorical_target, + ) + self.secondary_tables = [] # Case of numpyarray else: self.main_table = NumpyTable( @@ -596,14 +606,14 @@ def _check_input_mapping(self, X, y=None): f"must have size 2 not {len(table_input)}" ) table_source, table_key = table_input - if not isinstance(table_source, (pd.DataFrame, str)) and not hasattr( - table_source, "__array__" - ): + if not isinstance( + table_source, (pd.DataFrame, sp.spmatrix, str) + ) and not hasattr(table_source, "__array__"): raise TypeError( type_error_message( f"Table source at X['tables']['{table_name}']", table_source, - "array-like", + "array-like or scipy.sparse.spmatrix", str, ) ) @@ -718,6 +728,15 @@ def _check_input_mapping(self, X, y=None): type_error_message("y", y, pd.Series, pd.DataFrame) + " (X's tables are of type pandas.DataFrame)" ) + if ( + isinstance(main_table_source, sp.spmatrix) + or hasattr(main_table_source, "__array__") + ) and not hasattr(y, "__array__"): + raise TypeError( + type_error_message("y", y, "array-like") + + " (X's tables are of type numpy.ndarray" + + " or scipy.sparse.spmatrix)" + ) if isinstance(main_table_source, str) and not isinstance(y, str): raise TypeError( type_error_message("y", y, str) diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index 8d071998..ae095c86 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -610,6 +610,40 @@ def test_created_file_from_sparse_matrix_monotable(self): ), ) + def test_created_file_from_sparse_matrix_monotable_specification(self): + """Test consistency of the created data file with the input sparse matrix""" + + # Load input sparse matrix and target array + ( + input_sparse_matrix, + input_target, + ) = self._create_test_sparse_matrix_with_target() + + # Create monotable dataset from input mapping with the sparse matrix + spec = {"tables": {"example_sparse_matrix": (input_sparse_matrix, None)}} + dataset = Dataset(spec, y=input_target, categorical_target=True) + + # Create and load the intermediary Khiops file + created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + with open(created_table_path, "rb") as created_table_stream: + sparse_matrix, target_array = self._load_khiops_sparse_file( + created_table_stream + ) + + # Check that the arrays are equal + assert_equal( + np.concatenate( + ( + sparse_matrix.toarray(), + target_array.reshape(-1, 1), + ), + axis=1, + ), + np.concatenate( + (input_sparse_matrix.toarray(), input_target.reshape(-1, 1)), axis=1 + ), + ) + def test_created_file_from_data_file_monotable(self): """Test consistency of the created data file with the input data file diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py index 473f8aa6..3cd09f40 100644 --- 
a/tests/test_dataset_errors.py
+++ b/tests/test_dataset_errors.py
@@ -579,14 +579,14 @@ def test_dict_spec_table_input_tuple_must_have_size_2(self):
         expected_msg = "Table input tuple at X['tables']['D'] must have size 2 not 4"
         self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
 
-    def test_dict_spec_source_table_type_must_be_array_like_or_str(self):
+    def test_dict_spec_source_table_type_must_be_adequate(self):
         """Test Dataset raising TypeError when a table entry is not str nor DataFrame"""
         bad_spec, y = self.create_fixture_dataset_spec()
         bad_spec["tables"]["D"] = (AnotherType(), bad_spec["tables"]["D"][-1])
         expected_msg = type_error_message(
             "Table source at X['tables']['D']",
             bad_spec["tables"]["D"][0],
-            "array-like",
+            "array-like or scipy.sparse.spmatrix",
             str,
         )
         self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)

From 99d00197d3b2ca5307f9c7ee89e7c8f1c6f815f2 Mon Sep 17 00:00:00 2001
From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com>
Date: Thu, 11 Apr 2024 19:56:51 +0200
Subject: [PATCH 11/37] Fix metric name search in performance report

---
 khiops/core/analysis_results.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/khiops/core/analysis_results.py b/khiops/core/analysis_results.py
index ea018a27..8cbb4cf4 100644
--- a/khiops/core/analysis_results.py
+++ b/khiops/core/analysis_results.py
@@ -2841,14 +2841,13 @@ def get_metric(self, metric_name):
         """
         # Search the lower cased metric name in the list, report error if not found
         lowercase_metric_name = metric_name.lower()
-        metric = None
-        for name in self.get_metric_names():
-            if lowercase_metric_name == name:
-                metric = getattr(self, lowercase_metric_name)
-        if metric is None:
+        metric_found = lowercase_metric_name in self.get_metric_names()
+        if metric_found:
+            metric = getattr(self, lowercase_metric_name)
+        else:
             metric_list_msg = ",".join(self.get_metric_names())
             raise ValueError(
-                f"Invalid metric: '{metric_name}'. Choose among {metric_list_msg}"
+                f"Invalid metric: '{metric_name}'. Choose among {metric_list_msg}."
             )
         return metric

From 5cc1d79b054c9c916df005f363ddcee6482b2b8f Mon Sep 17 00:00:00 2001
From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com>
Date: Fri, 12 Apr 2024 09:47:20 +0200
Subject: [PATCH 12/37] Fix interval number check in DataGridDimension

Change the minimal size of an interval partition from 2 to 1.
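After this change, a numerical data grid dimension may be described by a
single interval. The new IrisU2D reference report exercises this case: the
constant-like variable Dummy2 has the one-interval partition
[[0.00390625, 1]]. A minimal standalone sketch of the relaxed check
(illustrative only; the real validation lives in the DataGridDimension
parsing code patched below, and the KhiopsJSONError here stands in for the
one raised by khiops.core):

    # Illustrative sketch of the relaxed partition-length validation.
    # The real check is in khiops/core/analysis_results.py; this local
    # KhiopsJSONError only mimics the exception used there.
    class KhiopsJSONError(Exception):
        """A malformed Khiops JSON report was encountered."""

    def check_interval_partition(json_partition):
        # A one-interval partition such as [[0.00390625, 1]] is now
        # accepted; only an empty partition list is rejected.
        if len(json_partition) < 1:
            raise KhiopsJSONError(
                "'partition' for interval must have length at least 1"
            )

    check_interval_partition([[0.00390625, 1]])             # single interval: ok
    check_interval_partition([[0.95, 1.25], [1.25, 1.75]])  # two intervals: ok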
--- khiops/core/analysis_results.py | 4 +- tests/resources/README.md | 4 +- .../ref_json_reports/IrisU2D.khj | 2714 +++++++++++++++++ .../analysis_results/ref_reports/IrisU2D.txt | 1159 +++++++ tests/test_core.py | 1 + 5 files changed, 3878 insertions(+), 4 deletions(-) create mode 100644 tests/resources/analysis_results/ref_json_reports/IrisU2D.khj create mode 100644 tests/resources/analysis_results/ref_reports/IrisU2D.txt diff --git a/khiops/core/analysis_results.py b/khiops/core/analysis_results.py index 8cbb4cf4..bf980730 100644 --- a/khiops/core/analysis_results.py +++ b/khiops/core/analysis_results.py @@ -2178,9 +2178,9 @@ def __init__(self, json_data=None): # Numerical partition if self.partition_type == "Intervals": # Check the length of the partition - if len(json_partition) < 2: + if len(json_partition) < 1: raise KhiopsJSONError( - "'partition' for interval must have length at least 2" + "'partition' for interval must have length at least 1" ) # Initialize intervals diff --git a/tests/resources/README.md b/tests/resources/README.md index dba20a96..0c7ea1fd 100644 --- a/tests/resources/README.md +++ b/tests/resources/README.md @@ -1,6 +1,6 @@ Test Resources -------------- -The file `reference_paths.txt` contain the origin and destination of reports and dictionary files. The -origin is a path relative to the `LearningTest` directory that contain the Khiops tests. The +The file `reference_paths.txt` contain the origin and destination of reports and dictionary files. +The origin is a path relative to the `LearningTest` directory that contain the Khiops tests. The destination paths are relative to this directory. diff --git a/tests/resources/analysis_results/ref_json_reports/IrisU2D.khj b/tests/resources/analysis_results/ref_json_reports/IrisU2D.khj new file mode 100644 index 00000000..73582195 --- /dev/null +++ b/tests/resources/analysis_results/ref_json_reports/IrisU2D.khj @@ -0,0 +1,2714 @@ +{ + "tool": "Khiops", + "version": "10.5.0-a1", + "shortDescription": "", + "preparationReport": { + "reportType": "Preparation", + "summary": { + "dictionary": "Iris", + "variables": { + "types": [ + "Categorical", + "Numerical" + ], + "numbers": [ + 4, + 8 + ] + }, + "database": "..\/..\/..\/datasets\/Iris\/Iris.txt", + "samplePercentage": 70, + "samplingMode": "Include sample", + "selectionVariable": "", + "selectionValue": "", + "instances": 105, + "learningTask": "Unsupervised analysis", + "evaluatedVariables": 12, + "nativeVariables": 5, + "constructedVariables": 7, + "featureEngineering": { + "maxNumberOfConstructedVariables": 0, + "maxNumberOfTextFeatures": 0, + "maxNumberOfTrees": 0, + "maxNumberOfVariablePairs": 100 + }, + "discretization": "MODL", + "valueGrouping": "MODL" + }, + "variablesStatistics": [ + { + "rank": "R01", + "name": "Class", + "type": "Categorical", + "parts": 3, + "values": 3, + "mode": "Iris-setosa", + "modeFrequency": 38, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805 + }, + { + "rank": "R02", + "name": "Class1", + "type": "Categorical", + "parts": 2, + "values": 2, + "mode": "", + "modeFrequency": 67, + "missingNumber": 67, + "sparseMissingNumber": 0, + "constructionCost": 3.17805, + "derivationRule": "IfC(EQc(Class, \"Iris-setosa\"), \"setosa\", \"\")" + }, + { + "rank": "R03", + "name": "Class2", + "type": "Categorical", + "parts": 2, + "values": 2, + "mode": "", + "modeFrequency": 73, + "missingNumber": 73, + "sparseMissingNumber": 0, + "constructionCost": 3.17805, + "derivationRule": "IfC(EQc(Class, \"Iris-versicolor\"), 
\"versicolor\", \"\")" + }, + { + "rank": "R04", + "name": "Dummy1", + "type": "Numerical", + "parts": 1, + "values": 1, + "min": 0, + "max": 0, + "mean": 0, + "stdDev": 0, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805, + "derivationRule": "Copy(0)" + }, + { + "rank": "R05", + "name": "Dummy2", + "type": "Numerical", + "parts": 1, + "values": 105, + "min": 0.005121241265, + "max": 0.9859650261, + "mean": 0.5173966838, + "stdDev": 0.2650019122, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805, + "derivationRule": "Random()" + }, + { + "rank": "R06", + "name": "LowerPetalLength", + "type": "Numerical", + "parts": 4, + "values": 10, + "min": 1, + "max": 3, + "mean": 2.446666667, + "stdDev": 0.7433600251, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805, + "derivationRule": "If(LE(PetalLength, 3), PetalLength, 3)" + }, + { + "rank": "R07", + "name": "PetalLength", + "type": "Numerical", + "parts": 5, + "values": 36, + "min": 1, + "max": 6.9, + "mean": 3.686666667, + "stdDev": 1.80132579, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805 + }, + { + "rank": "R08", + "name": "PetalWidth", + "type": "Numerical", + "parts": 5, + "values": 21, + "min": 0.1, + "max": 2.5, + "mean": 1.175238095, + "stdDev": 0.7880996979, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805 + }, + { + "rank": "R09", + "name": "SPetalLength", + "type": "Categorical", + "parts": 5, + "values": 5, + "mode": "1", + "modeFrequency": 38, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805, + "derivationRule": "AsCategorical(Floor(PetalLength))" + }, + { + "rank": "R10", + "name": "SepalLength", + "type": "Numerical", + "parts": 2, + "values": 31, + "min": 4.3, + "max": 7.7, + "mean": 5.827619048, + "stdDev": 0.8375127846, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805 + }, + { + "rank": "R11", + "name": "SepalWidth", + "type": "Numerical", + "parts": 3, + "values": 23, + "min": 2, + "max": 4.4, + "mean": 3.081904762, + "stdDev": 0.4284592446, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805 + }, + { + "rank": "R12", + "name": "UpperPetalWidth", + "type": "Numerical", + "parts": 2, + "values": 11, + "min": 1.5, + "max": 2.5, + "mean": 1.692380952, + "stdDev": 0.2962287527, + "missingNumber": 0, + "sparseMissingNumber": 0, + "constructionCost": 3.17805, + "derivationRule": "If(GE(PetalWidth, 1.5), PetalWidth, 1.5)" + } + ], + "variablesDetailedStatistics": { + "R01": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-setosa"], + ["Iris-virginica"], + ["Iris-versicolor"] + ], + "defaultGroupIndex": 2 + } + ], + "frequencies": [38,35,32] + }, + "inputValues": { + "values": ["Iris-setosa","Iris-virginica","Iris-versicolor"], + "frequencies": [38,35,32] + } + }, + "R02": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + } + ], + "frequencies": [67,38] + }, + "inputValues": { + "values": ["","setosa"], + "frequencies": [67,38] + } + }, + "R03": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class2", + "type": "Categorical", + "partitionType": "Value groups", + "partition": 
[ + [""], + ["versicolor"] + ], + "defaultGroupIndex": 1 + } + ], + "frequencies": [73,32] + }, + "inputValues": { + "values": ["","versicolor"], + "frequencies": [73,32] + } + }, + "R05": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Dummy2", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.00390625,1] + ] + } + ], + "frequencies": [105] + }, + "modlHistograms": { + "histogramNumber": 1, + "interpretableHistogramNumber": 1, + "truncationEpsilon": 0, + "removedSingularIntervalNumber": 0, + "granularities": [0], + "intervalNumbers": [1], + "peakIntervalNumbers": [0], + "spikeIntervalNumbers": [0], + "emptyIntervalNumbers": [0], + "levels": [0], + "informationRates": [0], + "histograms": [ + { + "bounds": [0.00390625,1], + "frequencies": [105] + } + ] + } + }, + "R06": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "LowerPetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.95,1.25], + [1.25,1.75], + [1.75,2.95], + [2.95,3.05] + ] + } + ], + "frequencies": [4,32,2,67] + }, + "modlHistograms": { + "histogramNumber": 5, + "interpretableHistogramNumber": 4, + "truncationEpsilon": 0.1, + "removedSingularIntervalNumber": 0, + "granularities": [0,2,3,5,28], + "intervalNumbers": [1,3,5,4,12], + "peakIntervalNumbers": [0,0,1,1,5], + "spikeIntervalNumbers": [0,0,0,0,5], + "emptyIntervalNumbers": [0,0,1,0,4], + "levels": [0,0.01356921313,0.04765003824,0.06838973265,0.6853858332], + "informationRates": [0,19.84100917,69.67425722,100,1002.176506], + "histograms": [ + { + "bounds": [0.95,3.05], + "frequencies": [105] + }, + { + "bounds": [0.95,1.65,2.45,3.05], + "frequencies": [32,6,67] + }, + { + "bounds": [0.95,1.25,1.65,2.05,2.85,3.05], + "frequencies": [4,28,6,0,67] + }, + { + "bounds": [0.95,1.25,1.75,2.95,3.05], + "frequencies": [4,32,2,67] + }, + { + "bounds": [0.9375,1.299999997,1.300000012,1.399999991,1.400000006,1.499999985,1.5,1.599999994,1.600000009,1.699999988,1.700000003,2.999999985,3], + "frequencies": [4,4,0,9,0,10,0,5,0,4,2,67] + } + ] + } + }, + "R07": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.95,1.25], + [1.25,1.75], + [1.75,3.85], + [3.85,6.15], + [6.15,6.95] + ] + } + ], + "frequencies": [4,32,7,59,3] + }, + "modlHistograms": { + "histogramNumber": 5, + "interpretableHistogramNumber": 4, + "truncationEpsilon": 0.1, + "removedSingularIntervalNumber": 0, + "granularities": [0,3,5,7,29], + "intervalNumbers": [1,3,7,5,41], + "peakIntervalNumbers": [0,0,2,2,20], + "spikeIntervalNumbers": [0,0,0,0,20], + "emptyIntervalNumbers": [0,0,0,0,14], + "levels": [0,0.005142168327,0.006968300144,0.00863976387,0.2871300831], + "informationRates": [0,59.51746372,80.6538263,100,3323.355678], + "histograms": [ + { + "bounds": [0.95,6.95], + "frequencies": [105] + }, + { + "bounds": [0.95,1.65,3.25,6.95], + "frequencies": [32,7,66] + }, + { + "bounds": [0.95,1.25,1.65,2.05,3.25,4.45,5.65,6.95], + "frequencies": [4,28,6,1,17,37,12] + }, + { + "bounds": [0.95,1.25,1.75,3.85,6.15,6.95], + "frequencies": [4,32,7,59,3] + }, + { + "bounds": 
[0.875,1.299999997,1.300000012,1.399999991,1.400000006,1.499999985,1.5,1.599999994,1.600000009,1.699999988,1.700000003,1.899999991,1.900000006,3.499999985,3.5,3.899999991,3.900000006,3.999999985,4,4.099999994,4.100000009,4.499999985,4.5,4.699999988,4.700000003,4.799999997,4.800000012,4.899999991,4.900000006,4.999999985,5,5.099999994,5.100000009,5.499999985,5.5,5.599999994,5.600000009,6.099999994,6.100000009,6.699999988,6.700000003,7], + "frequencies": [4,4,0,9,0,10,0,5,0,4,0,2,2,2,1,3,0,4,0,3,3,5,0,4,0,3,0,3,0,3,0,5,5,3,0,6,6,3,0,2,1] + } + ] + } + }, + "R08": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.05,0.15], + [0.15,0.25], + [0.25,0.45], + [0.45,0.85], + [0.85,2.55] + ] + } + ], + "frequencies": [6,20,11,1,67] + }, + "modlHistograms": { + "histogramNumber": 5, + "interpretableHistogramNumber": 4, + "truncationEpsilon": 0.1, + "removedSingularIntervalNumber": 0, + "granularities": [0,1,2,3,30], + "intervalNumbers": [1,2,2,5,28], + "peakIntervalNumbers": [0,0,0,1,13], + "spikeIntervalNumbers": [0,0,0,1,13], + "emptyIntervalNumbers": [0,0,0,0,9], + "levels": [0,0.003138504791,0.003358793296,0.004913113283,0.5003837405], + "informationRates": [0,63.88016335,68.36384798,100,10184.65709], + "histograms": [ + { + "bounds": [0.05,2.55], + "frequencies": [105] + }, + { + "bounds": [0.05,0.45,2.55], + "frequencies": [37,68] + }, + { + "bounds": [0.05,0.25,2.55], + "frequencies": [26,79] + }, + { + "bounds": [0.05,0.15,0.25,0.45,0.85,2.55], + "frequencies": [6,20,11,1,67] + }, + { + "bounds": [0.09999999963,0.1000000001,0.1999999993,0.2000000002,0.2999999989,0.3000000007,0.3999999985,0.4000000004,0.9999999963,1,1.299999997,1.300000004,1.399999999,1.400000006,1.499999993,1.5,1.599999994,1.600000001,1.799999997,1.800000004,1.999999993,2,2.099999994,2.100000009,2.199999988,2.200000003,2.299999997,2.300000012,2.5], + "frequencies": [6,0,20,0,5,0,6,1,5,4,9,0,6,0,5,0,3,1,9,2,5,0,4,0,3,0,8,3] + } + ] + } + }, + "R09": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["1"], + ["5"], + ["4"], + ["3"], + ["6"] + ], + "defaultGroupIndex": 4 + } + ], + "frequencies": [38,27,25,8,7] + }, + "inputValues": { + "values": ["1","5","4","3","6"], + "frequencies": [38,27,25,8,7] + } + }, + "R10": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "SepalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [4.25,7], + [7,7.75] + ] + } + ], + "frequencies": [97,8] + }, + "modlHistograms": { + "histogramNumber": 2, + "interpretableHistogramNumber": 2, + "truncationEpsilon": 0, + "removedSingularIntervalNumber": 0, + "granularities": [0,2], + "intervalNumbers": [1,2], + "peakIntervalNumbers": [0,0], + "spikeIntervalNumbers": [0,0], + "emptyIntervalNumbers": [0,0], + "levels": [0,0.0003815098758], + "informationRates": [0,100], + "histograms": [ + { + "bounds": [4.25,7.75], + "frequencies": [105] + }, + { + "bounds": [4.25,7,7.75], + "frequencies": [97,8] + } + ] + } + }, + "R11": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "SepalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1.95,2.45], + [2.45,3.25], + [3.25,4.45] + ] + } + ], + "frequencies": [5,72,28] + }, + "modlHistograms": { + "histogramNumber": 4, + "interpretableHistogramNumber": 3, + 
"truncationEpsilon": 0.1, + "removedSingularIntervalNumber": 0, + "granularities": [0,1,2,29], + "intervalNumbers": [1,2,3,23], + "peakIntervalNumbers": [0,0,1,11], + "spikeIntervalNumbers": [0,0,0,11], + "emptyIntervalNumbers": [0,0,0,8], + "levels": [0,0.002413792626,0.009713296272,0.471063928], + "informationRates": [0,24.85039639,100,4849.681455], + "histograms": [ + { + "bounds": [1.95,4.45], + "frequencies": [105] + }, + { + "bounds": [1.95,3.25,4.45], + "frequencies": [77,28] + }, + { + "bounds": [1.95,2.45,3.25,4.45], + "frequencies": [5,72,28] + }, + { + "bounds": [1.9921875,2.499999985,2.5,2.599999994,2.600000009,2.699999988,2.700000003,2.799999997,2.800000012,2.899999991,2.900000006,2.999999985,3,3.099999994,3.100000009,3.199999988,3.200000003,3.399999991,3.400000006,3.499999985,3.5,3.799999997,3.800000012,4.40625], + "frequencies": [5,6,0,3,0,5,0,11,0,6,0,20,0,10,0,11,2,10,0,3,3,4,6] + } + ] + } + }, + "R12": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "UpperPetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1.45,1.55], + [1.55,2.55] + ] + } + ], + "frequencies": [67,38] + }, + "modlHistograms": { + "histogramNumber": 4, + "interpretableHistogramNumber": 3, + "truncationEpsilon": 0.1, + "removedSingularIntervalNumber": 0, + "granularities": [0,1,5,28], + "intervalNumbers": [1,2,2,14], + "peakIntervalNumbers": [0,0,0,6], + "spikeIntervalNumbers": [0,0,0,6], + "emptyIntervalNumbers": [0,0,0,4], + "levels": [0,0.02500256671,0.03972305577,0.6316260053], + "informationRates": [0,62.94220379,100,1590.074059], + "histograms": [ + { + "bounds": [1.45,2.55], + "frequencies": [105] + }, + { + "bounds": [1.45,1.65,2.55], + "frequencies": [70,35] + }, + { + "bounds": [1.45,1.55,2.55], + "frequencies": [67,38] + }, + { + "bounds": [1.499999985,1.5,1.599999994,1.600000009,1.799999997,1.800000012,1.999999985,2,2.099999994,2.100000009,2.199999988,2.200000003,2.299999997,2.300000012,2.5], + "frequencies": [67,0,3,1,9,2,5,0,4,0,3,0,8,3] + } + ] + } + } + } + }, + "bivariatePreparationReport": { + "reportType": "BivariatePreparation", + "summary": { + "dictionary": "Iris", + "variables": { + "types": [ + "Categorical", + "Numerical" + ], + "numbers": [ + 4, + 8 + ] + }, + "database": "..\/..\/..\/datasets\/Iris\/Iris.txt", + "samplePercentage": 70, + "samplingMode": "Include sample", + "selectionVariable": "", + "selectionValue": "", + "instances": 105, + "learningTask": "Unsupervised analysis", + "evaluatedVariablePairs": 55, + "informativeVariablePairs": 38 + }, + "variablesPairsStatistics": [ + { + "rank": "R01", + "name1": "Class", + "name2": "Class1", + "level": 0.286471, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 2, + "constructionCost": 6.71557, + "preparationCost": 18.9311, + "dataCost": 110.25 + }, + { + "rank": "R02", + "name1": "Class", + "name2": "Class2", + "level": 0.270234, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 2, + "constructionCost": 6.71557, + "preparationCost": 19.0156, + "dataCost": 110.25 + }, + { + "rank": "R03", + "name1": "Class", + "name2": "SPetalLength", + "level": 0.258511, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 5, + "constructionCost": 6.71557, + "preparationCost": 41.7647, + "dataCost": 157.188 + }, + { + "rank": "R04", + "name1": "Class1", + "name2": "SPetalLength", + "level": 0.231831, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 2, + "constructionCost": 6.71557, + "preparationCost": 27.2099, + "dataCost": 142.253 + }, + { + 
"rank": "R05", + "name1": "PetalLength", + "name2": "SPetalLength", + "level": 0.151582, + "variables": 2, + "parts1": 5, + "parts2": 5, + "cells": 5, + "constructionCost": 6.71557, + "preparationCost": 69.091, + "dataCost": 386.913 + }, + { + "rank": "R06", + "name1": "Class2", + "name2": "SPetalLength", + "level": 0.142436, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 4, + "constructionCost": 6.71557, + "preparationCost": 27.7273, + "dataCost": 158.704 + }, + { + "rank": "R07", + "name1": "Class", + "name2": "PetalWidth", + "level": 0.14197, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 5, + "constructionCost": 6.71557, + "preparationCost": 31.1679, + "dataCost": 396.708 + }, + { + "rank": "R08", + "name1": "Class", + "name2": "PetalLength", + "level": 0.136908, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 5, + "constructionCost": 6.71557, + "preparationCost": 31.1679, + "dataCost": 399.272 + }, + { + "rank": "R09", + "name1": "Class1", + "name2": "LowerPetalLength", + "level": 0.111506, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 2, + "constructionCost": 6.71557, + "preparationCost": 13.7255, + "dataCost": 386.913 + }, + { + "rank": "R10", + "name1": "Class1", + "name2": "PetalLength", + "level": 0.111506, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 2, + "constructionCost": 6.71557, + "preparationCost": 13.7255, + "dataCost": 386.913 + }, + { + "rank": "R11", + "name1": "Class1", + "name2": "PetalWidth", + "level": 0.111506, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 2, + "constructionCost": 6.71557, + "preparationCost": 13.7255, + "dataCost": 386.913 + }, + { + "rank": "R12", + "name1": "PetalWidth", + "name2": "SPetalLength", + "level": 0.109807, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 5, + "constructionCost": 6.71557, + "preparationCost": 40.5555, + "dataCost": 438.232 + }, + { + "rank": "R13", + "name1": "Class", + "name2": "LowerPetalLength", + "level": 0.0982915, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 2, + "constructionCost": 6.71557, + "preparationCost": 19.0436, + "dataCost": 430.955 + }, + { + "rank": "R14", + "name1": "LowerPetalLength", + "name2": "SPetalLength", + "level": 0.0887331, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 2, + "constructionCost": 6.71557, + "preparationCost": 27.3225, + "dataCost": 462.959 + }, + { + "rank": "R15", + "name1": "PetalLength", + "name2": "PetalWidth", + "level": 0.0785935, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 4, + "constructionCost": 6.71557, + "preparationCost": 29.9587, + "dataCost": 676.972 + }, + { + "rank": "R16", + "name1": "Class", + "name2": "UpperPetalWidth", + "level": 0.0721164, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 4, + "constructionCost": 6.71557, + "preparationCost": 19.0868, + "dataCost": 444.17 + }, + { + "rank": "R17", + "name1": "PetalWidth", + "name2": "UpperPetalWidth", + "level": 0.0703191, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 3, + "constructionCost": 6.71557, + "preparationCost": 29.9587, + "dataCost": 683.381 + }, + { + "rank": "R18", + "name1": "LowerPetalLength", + "name2": "PetalLength", + "level": 0.0701201, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 3, + "constructionCost": 6.71557, + "preparationCost": 29.9587, + "dataCost": 683.535 + }, + { + "rank": "R19", + "name1": "Class2", + "name2": "PetalWidth", + "level": 0.0662843, + "variables": 2, + "parts1": 2, + "parts2": 3, + "cells": 5, + 
"constructionCost": 6.71557, + "preparationCost": 20.8147, + "dataCost": 396.708 + }, + { + "rank": "R20", + "name1": "SPetalLength", + "name2": "SepalLength", + "level": 0.0654694, + "variables": 2, + "parts1": 3, + "parts2": 4, + "cells": 5, + "constructionCost": 6.71557, + "preparationCost": 49.4973, + "dataCost": 453.472 + }, + { + "rank": "R21", + "name1": "Class2", + "name2": "PetalLength", + "level": 0.0606416, + "variables": 2, + "parts1": 2, + "parts2": 3, + "cells": 5, + "constructionCost": 6.71557, + "preparationCost": 20.8147, + "dataCost": 399.272 + }, + { + "rank": "R22", + "name1": "LowerPetalLength", + "name2": "PetalWidth", + "level": 0.0598398, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 2, + "constructionCost": 6.71557, + "preparationCost": 13.838, + "dataCost": 707.618 + }, + { + "rank": "R23", + "name1": "Class", + "name2": "SepalLength", + "level": 0.059526, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 7, + "constructionCost": 6.71557, + "preparationCost": 31.1679, + "dataCost": 438.466 + }, + { + "rank": "R24", + "name1": "Class1", + "name2": "Class2", + "level": 0.0559199, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 3, + "constructionCost": 6.71557, + "preparationCost": 13.6129, + "dataCost": 110.25 + }, + { + "rank": "R25", + "name1": "Class1", + "name2": "SepalLength", + "level": 0.0531576, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 4, + "constructionCost": 6.71557, + "preparationCost": 13.7255, + "dataCost": 413.664 + }, + { + "rank": "R26", + "name1": "SPetalLength", + "name2": "UpperPetalWidth", + "level": 0.0466723, + "variables": 2, + "parts1": 3, + "parts2": 2, + "cells": 5, + "constructionCost": 6.71557, + "preparationCost": 31.8478, + "dataCost": 481.373 + }, + { + "rank": "R27", + "name1": "PetalLength", + "name2": "SepalLength", + "level": 0.0407398, + "variables": 2, + "parts1": 4, + "parts2": 4, + "cells": 8, + "constructionCost": 6.71557, + "preparationCost": 47.7303, + "dataCost": 688.519 + }, + { + "rank": "R28", + "name1": "PetalLength", + "name2": "UpperPetalWidth", + "level": 0.0401281, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 3, + "constructionCost": 6.71557, + "preparationCost": 13.838, + "dataCost": 722.885 + }, + { + "rank": "R29", + "name1": "PetalWidth", + "name2": "SepalLength", + "level": 0.0303985, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 8, + "constructionCost": 6.71557, + "preparationCost": 29.9587, + "dataCost": 714.3 + }, + { + "rank": "R30", + "name1": "LowerPetalLength", + "name2": "SepalLength", + "level": 0.0253003, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 4, + "constructionCost": 6.71557, + "preparationCost": 13.838, + "dataCost": 734.369 + }, + { + "rank": "R31", + "name1": "Class1", + "name2": "UpperPetalWidth", + "level": 0.0166012, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 3, + "constructionCost": 6.71557, + "preparationCost": 13.7255, + "dataCost": 430.424 + }, + { + "rank": "R32", + "name1": "SepalLength", + "name2": "UpperPetalWidth", + "level": 0.0164148, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 4, + "constructionCost": 6.71557, + "preparationCost": 13.838, + "dataCost": 741.251 + }, + { + "rank": "R33", + "name1": "Class1", + "name2": "SepalWidth", + "level": 0.00749643, + "variables": 2, + "parts1": 2, + "parts2": 3, + "cells": 5, + "constructionCost": 6.71557, + "preparationCost": 20.8147, + "dataCost": 427.509 + }, + { + "rank": "R34", + "name1": "Class2", + "name2": 
"LowerPetalLength", + "level": 0.0065114, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 3, + "constructionCost": 6.71557, + "preparationCost": 13.7255, + "dataCost": 430.955 + }, + { + "rank": "R35", + "name1": "Class", + "name2": "SepalWidth", + "level": 0.00543684, + "variables": 2, + "parts1": 3, + "parts2": 2, + "cells": 6, + "constructionCost": 6.71557, + "preparationCost": 22.1365, + "dataCost": 474.893 + }, + { + "rank": "R36", + "name1": "LowerPetalLength", + "name2": "UpperPetalWidth", + "level": 0.00366071, + "variables": 2, + "parts1": 2, + "parts2": 2, + "cells": 3, + "constructionCost": 6.71557, + "preparationCost": 13.838, + "dataCost": 751.129 + }, + { + "rank": "R37", + "name1": "PetalWidth", + "name2": "SepalWidth", + "level": 0.00221737, + "variables": 2, + "parts1": 3, + "parts2": 2, + "cells": 6, + "constructionCost": 6.71557, + "preparationCost": 20.9273, + "dataCost": 745.158 + }, + { + "rank": "R38", + "name1": "SPetalLength", + "name2": "SepalWidth", + "level": 0.00143264, + "variables": 2, + "parts1": 3, + "parts2": 3, + "cells": 9, + "constructionCost": 6.71557, + "preparationCost": 40.2319, + "dataCost": 497.662 + }, + { + "rank": "R39", + "name1": "Class", + "name2": "Dummy2", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 8.64312, + "dataCost": 497.163 + }, + { + "rank": "R40", + "name1": "Class1", + "name2": "Dummy2", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 4.66344, + "dataCost": 453.12 + }, + { + "rank": "R41", + "name1": "Class2", + "name2": "Dummy2", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 4.66344, + "dataCost": 448.998 + }, + { + "rank": "R42", + "name1": "Class2", + "name2": "SepalLength", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 4.66344, + "dataCost": 448.998 + }, + { + "rank": "R43", + "name1": "Class2", + "name2": "SepalWidth", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 4.66344, + "dataCost": 448.998 + }, + { + "rank": "R44", + "name1": "Class2", + "name2": "UpperPetalWidth", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 4.66344, + "dataCost": 448.998 + }, + { + "rank": "R45", + "name1": "Dummy2", + "name2": "LowerPetalLength", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + }, + { + "rank": "R46", + "name1": "Dummy2", + "name2": "PetalLength", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + }, + { + "rank": "R47", + "name1": "Dummy2", + "name2": "PetalWidth", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + }, + { + "rank": "R48", + "name1": "Dummy2", + "name2": "SPetalLength", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 15.5317, + "dataCost": 529.166 + }, + { + "rank": "R49", + "name1": "Dummy2", + "name2": "SepalLength", + 
"level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + }, + { + "rank": "R50", + "name1": "Dummy2", + "name2": "SepalWidth", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + }, + { + "rank": "R51", + "name1": "Dummy2", + "name2": "UpperPetalWidth", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + }, + { + "rank": "R52", + "name1": "LowerPetalLength", + "name2": "SepalWidth", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + }, + { + "rank": "R53", + "name1": "PetalLength", + "name2": "SepalWidth", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + }, + { + "rank": "R54", + "name1": "SepalLength", + "name2": "SepalWidth", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + }, + { + "rank": "R55", + "name1": "SepalWidth", + "name2": "UpperPetalWidth", + "level": 0, + "variables": 0, + "parts1": 1, + "parts2": 1, + "cells": 1, + "constructionCost": 0.693147, + "preparationCost": 0, + "dataCost": 773.825 + } + ], + "variablesPairsDetailedStatistics": { + "R01": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-virginica","Iris-versicolor"], + ["Iris-setosa"] + ], + "defaultGroupIndex": 0 + }, + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + } + ], + "cellIds": ["C1","C4"], + "cellPartIndexes": [ + [0,0], + [1,1] + ], + "cellFrequencies": [67,38] + } + }, + "R02": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-setosa","Iris-virginica"], + ["Iris-versicolor"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "Class2", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["versicolor"] + ], + "defaultGroupIndex": 1 + } + ], + "cellIds": ["C1","C4"], + "cellPartIndexes": [ + [0,0], + [1,1] + ], + "cellFrequencies": [73,32] + } + }, + "R03": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-setosa"], + ["Iris-virginica"], + ["Iris-versicolor"] + ], + "defaultGroupIndex": 2 + }, + { + "variable": "SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["1"], + ["5","6"], + ["4","3"] + ], + "defaultGroupIndex": 1 + } + ], + "cellIds": ["C1","C5","C6","C8","C9"], + "cellPartIndexes": [ + [0,0], + [1,1], + [2,1], + [1,2], + [2,2] + ], + "cellFrequencies": [38,32,2,3,30] + } + }, + "R04": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": 
"SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["5","4","3","6"], + ["1"] + ], + "defaultGroupIndex": 0 + } + ], + "cellIds": ["C1","C4"], + "cellPartIndexes": [ + [0,0], + [1,1] + ], + "cellFrequencies": [67,38] + } + }, + "R05": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,3.95], + [3.95,4.95], + [4.95,5.95], + [5.95,6.9] + ] + }, + { + "variable": "SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["1"], + ["5"], + ["4"], + ["3"], + ["6"] + ], + "defaultGroupIndex": 4 + } + ], + "cellIds": ["C1","C9","C13","C17","C25"], + "cellPartIndexes": [ + [0,0], + [3,1], + [2,2], + [1,3], + [4,4] + ], + "cellFrequencies": [38,27,25,8,7] + } + }, + "R06": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class2", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["versicolor"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["1","5","6"], + ["4","3"] + ], + "defaultGroupIndex": 0 + } + ], + "cellIds": ["C1","C2","C3","C4"], + "cellPartIndexes": [ + [0,0], + [1,0], + [0,1], + [1,1] + ], + "cellFrequencies": [70,2,3,30] + } + }, + "R07": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-setosa"], + ["Iris-virginica"], + ["Iris-versicolor"] + ], + "defaultGroupIndex": 2 + }, + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.1,0.75], + [0.75,1.75], + [1.75,2.5] + ] + } + ], + "cellIds": ["C1","C5","C6","C8","C9"], + "cellPartIndexes": [ + [0,0], + [1,1], + [2,1], + [1,2], + [2,2] + ], + "cellFrequencies": [38,2,31,33,1] + } + }, + "R08": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-setosa"], + ["Iris-virginica"], + ["Iris-versicolor"] + ], + "defaultGroupIndex": 2 + }, + { + "variable": "PetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,4.85], + [4.85,6.9] + ] + } + ], + "cellIds": ["C1","C5","C6","C8","C9"], + "cellPartIndexes": [ + [0,0], + [1,1], + [2,1], + [1,2], + [2,2] + ], + "cellFrequencies": [38,1,29,34,3] + } + }, + "R09": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "LowerPetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,3] + ] + } + ], + "cellIds": ["C2","C3"], + "cellPartIndexes": [ + [1,0], + [0,1] + ], + "cellFrequencies": [38,67] + } + }, + "R10": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "PetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,6.9] + ] + } + ], + "cellIds": ["C2","C3"], + "cellPartIndexes": [ + [1,0], + [0,1] + ], + "cellFrequencies": 
[38,67] + } + }, + "R11": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.1,0.75], + [0.75,2.5] + ] + } + ], + "cellIds": ["C2","C3"], + "cellPartIndexes": [ + [1,0], + [0,1] + ], + "cellFrequencies": [38,67] + } + }, + "R12": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.1,0.75], + [0.75,1.65], + [1.65,2.5] + ] + }, + { + "variable": "SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["1"], + ["5","6"], + ["4","3"] + ], + "defaultGroupIndex": 1 + } + ], + "cellIds": ["C1","C5","C6","C8","C9"], + "cellPartIndexes": [ + [0,0], + [1,1], + [2,1], + [1,2], + [2,2] + ], + "cellFrequencies": [38,3,31,29,4] + } + }, + "R13": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-virginica","Iris-versicolor"], + ["Iris-setosa"] + ], + "defaultGroupIndex": 0 + }, + { + "variable": "LowerPetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,3] + ] + } + ], + "cellIds": ["C2","C3"], + "cellPartIndexes": [ + [1,0], + [0,1] + ], + "cellFrequencies": [38,67] + } + }, + "R14": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "LowerPetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,3] + ] + }, + { + "variable": "SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["5","4","3","6"], + ["1"] + ], + "defaultGroupIndex": 0 + } + ], + "cellIds": ["C2","C3"], + "cellPartIndexes": [ + [1,0], + [0,1] + ], + "cellFrequencies": [67,38] + } + }, + "R15": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,4.75], + [4.75,6.9] + ] + }, + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.1,0.75], + [0.75,1.65], + [1.65,2.5] + ] + } + ], + "cellIds": ["C1","C5","C6","C9"], + "cellPartIndexes": [ + [0,0], + [1,1], + [2,1], + [2,2] + ], + "cellFrequencies": [38,27,5,35] + } + }, + "R16": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-setosa","Iris-versicolor"], + ["Iris-virginica"] + ], + "defaultGroupIndex": 0 + }, + { + "variable": "UpperPetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1.5,1.75], + [1.75,2.5] + ] + } + ], + "cellIds": ["C1","C2","C3","C4"], + "cellPartIndexes": [ + [0,0], + [1,0], + [0,1], + [1,1] + ], + "cellFrequencies": [69,2,1,33] + } + }, + "R17": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.1,1.55], + [1.55,2.05], + [2.05,2.5] + ] + }, + { + "variable": "UpperPetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1.5,1.55], + [1.55,2.05], + [2.05,2.5] + ] + } + ], 
+ "cellIds": ["C1","C5","C9"], + "cellPartIndexes": [ + [0,0], + [1,1], + [2,2] + ], + "cellFrequencies": [67,20,18] + } + }, + "R18": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "LowerPetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,1.45], + [1.45,2.4], + [2.4,3] + ] + }, + { + "variable": "PetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,1.45], + [1.45,2.4], + [2.4,6.9] + ] + } + ], + "cellIds": ["C1","C5","C9"], + "cellPartIndexes": [ + [0,0], + [1,1], + [2,2] + ], + "cellFrequencies": [17,21,67] + } + }, + "R19": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class2", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["versicolor"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.1,0.75], + [0.75,1.75], + [1.75,2.5] + ] + } + ], + "cellIds": ["C1","C3","C4","C5","C6"], + "cellPartIndexes": [ + [0,0], + [0,1], + [1,1], + [0,2], + [1,2] + ], + "cellFrequencies": [38,2,31,33,1] + } + }, + "R20": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["5","4"], + ["1","3"], + ["6"] + ], + "defaultGroupIndex": 2 + }, + { + "variable": "SepalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [4.3,5.35], + [5.35,5.85], + [5.85,7.15], + [7.15,7.7] + ] + } + ], + "cellIds": ["C2","C4","C5","C7","C12"], + "cellPartIndexes": [ + [1,0], + [0,1], + [1,1], + [0,2], + [2,3] + ], + "cellFrequencies": [34,10,12,42,7] + } + }, + "R21": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class2", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["versicolor"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "PetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,4.85], + [4.85,6.9] + ] + } + ], + "cellIds": ["C1","C3","C4","C5","C6"], + "cellPartIndexes": [ + [0,0], + [0,1], + [1,1], + [0,2], + [1,2] + ], + "cellFrequencies": [38,1,29,34,3] + } + }, + "R22": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "LowerPetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,3] + ] + }, + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.1,0.75], + [0.75,2.5] + ] + } + ], + "cellIds": ["C1","C4"], + "cellPartIndexes": [ + [0,0], + [1,1] + ], + "cellFrequencies": [38,67] + } + }, + "R23": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-setosa"], + ["Iris-virginica"], + ["Iris-versicolor"] + ], + "defaultGroupIndex": 2 + }, + { + "variable": "SepalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [4.3,5.45], + [5.45,6.15], + [6.15,7.7] + ] + } + ], + "cellIds": ["C1","C3","C4","C5","C6","C8","C9"], + "cellPartIndexes": [ + [0,0], + [2,0], + [0,1], + [1,1], + [2,1], + [1,2], + [2,2] + ], + "cellFrequencies": [34,5,4,5,19,30,8] + } + }, + "R24": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value 
groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "Class2", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["versicolor"] + ], + "defaultGroupIndex": 1 + } + ], + "cellIds": ["C1","C2","C3"], + "cellPartIndexes": [ + [0,0], + [1,0], + [0,1] + ], + "cellFrequencies": [35,38,32] + } + }, + "R25": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "SepalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [4.3,5.45], + [5.45,7.7] + ] + } + ], + "cellIds": ["C1","C2","C3","C4"], + "cellPartIndexes": [ + [0,0], + [1,0], + [0,1], + [1,1] + ], + "cellFrequencies": [5,34,62,4] + } + }, + "R26": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["1","3"], + ["5","6"], + ["4"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "UpperPetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1.5,1.55], + [1.55,2.5] + ] + } + ], + "cellIds": ["C1","C2","C3","C5","C6"], + "cellPartIndexes": [ + [0,0], + [1,0], + [2,0], + [1,1], + [2,1] + ], + "cellFrequencies": [46,2,19,32,6] + } + }, + "R27": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,3.55], + [3.55,4.6], + [4.6,5.95], + [5.95,6.9] + ] + }, + { + "variable": "SepalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [4.3,5.45], + [5.45,5.85], + [5.85,7.15], + [7.15,7.7] + ] + } + ], + "cellIds": ["C1","C2","C5","C6","C7","C10","C11","C16"], + "cellPartIndexes": [ + [0,0], + [1,0], + [0,1], + [1,1], + [2,1], + [1,2], + [2,2], + [3,3] + ], + "cellFrequencies": [37,2,5,10,2,7,35,7] + } + }, + "R28": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,4.75], + [4.75,6.9] + ] + }, + { + "variable": "UpperPetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1.5,1.65], + [1.65,2.5] + ] + } + ], + "cellIds": ["C1","C2","C4"], + "cellPartIndexes": [ + [0,0], + [1,0], + [1,1] + ], + "cellFrequencies": [65,5,35] + } + }, + "R29": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.1,0.75], + [0.75,1.35], + [1.35,2.5] + ] + }, + { + "variable": "SepalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [4.3,5.45], + [5.45,5.85], + [5.85,7.7] + ] + } + ], + "cellIds": ["C1","C2","C3","C4","C5","C6","C8","C9"], + "cellPartIndexes": [ + [0,0], + [1,0], + [2,0], + [0,1], + [1,1], + [2,1], + [1,2], + [2,2] + ], + "cellFrequencies": [34,3,2,4,10,3,5,44] + } + }, + "R30": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "LowerPetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,3] + ] + }, + { + "variable": "SepalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [4.3,5.45], + [5.45,7.7] + ] + } + ], + "cellIds": ["C1","C2","C3","C4"], + 
"cellPartIndexes": [ + [0,0], + [1,0], + [0,1], + [1,1] + ], + "cellFrequencies": [34,5,4,62] + } + }, + "R31": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "UpperPetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1.5,1.55], + [1.55,2.5] + ] + } + ], + "cellIds": ["C1","C2","C3"], + "cellPartIndexes": [ + [0,0], + [1,0], + [0,1] + ], + "cellFrequencies": [29,38,38] + } + }, + "R32": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "SepalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [4.3,5.85], + [5.85,7.7] + ] + }, + { + "variable": "UpperPetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1.5,1.55], + [1.55,2.5] + ] + } + ], + "cellIds": ["C1","C2","C3","C4"], + "cellPartIndexes": [ + [0,0], + [1,0], + [0,1], + [1,1] + ], + "cellFrequencies": [54,13,2,36] + } + }, + "R33": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class1", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["setosa"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "SepalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [2,2.85], + [2.85,3.35], + [3.35,4.4] + ] + } + ], + "cellIds": ["C1","C3","C4","C5","C6"], + "cellPartIndexes": [ + [0,0], + [0,1], + [1,1], + [0,2], + [1,2] + ], + "cellFrequencies": [30,32,17,5,21] + } + }, + "R34": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class2", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + [""], + ["versicolor"] + ], + "defaultGroupIndex": 1 + }, + { + "variable": "LowerPetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,3] + ] + } + ], + "cellIds": ["C1","C3","C4"], + "cellPartIndexes": [ + [0,0], + [0,1], + [1,1] + ], + "cellFrequencies": [38,35,32] + } + }, + "R35": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "Class", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["Iris-setosa"], + ["Iris-virginica"], + ["Iris-versicolor"] + ], + "defaultGroupIndex": 2 + }, + { + "variable": "SepalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [2,2.95], + [2.95,4.4] + ] + } + ], + "cellIds": ["C1","C2","C3","C4","C5","C6"], + "cellPartIndexes": [ + [0,0], + [1,0], + [2,0], + [0,1], + [1,1], + [2,1] + ], + "cellFrequencies": [1,13,22,37,22,10] + } + }, + "R36": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "LowerPetalLength", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1,2.4], + [2.4,3] + ] + }, + { + "variable": "UpperPetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [1.5,1.55], + [1.55,2.5] + ] + } + ], + "cellIds": ["C1","C2","C4"], + "cellPartIndexes": [ + [0,0], + [1,0], + [1,1] + ], + "cellFrequencies": [38,29,38] + } + }, + "R37": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "PetalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [0.1,0.75], + [0.75,1.45], + [1.45,2.5] + ] + }, + { + "variable": "SepalWidth", + "type": "Numerical", + "partitionType": "Intervals", + 
"partition": [ + [2,2.95], + [2.95,4.4] + ] + } + ], + "cellIds": ["C1","C2","C3","C4","C5","C6"], + "cellPartIndexes": [ + [0,0], + [1,0], + [2,0], + [0,1], + [1,1], + [2,1] + ], + "cellFrequencies": [1,21,14,37,3,29] + } + }, + "R38": { + "dataGrid": { + "isSupervised": false, + "dimensions": [ + { + "variable": "SPetalLength", + "type": "Categorical", + "partitionType": "Value groups", + "partition": [ + ["4","3","6"], + ["1"], + ["5"] + ], + "defaultGroupIndex": 0 + }, + { + "variable": "SepalWidth", + "type": "Numerical", + "partitionType": "Intervals", + "partition": [ + [2,2.95], + [2.95,3.25], + [3.25,4.4] + ] + } + ], + "cellIds": ["C1","C2","C3","C4","C5","C6","C7","C8","C9"], + "cellPartIndexes": [ + [0,0], + [1,0], + [2,0], + [0,1], + [1,1], + [2,1], + [0,2], + [1,2], + [2,2] + ], + "cellFrequencies": [26,1,9,10,15,16,4,22,2] + } + } + } + }, + "khiops_encoding": "ascii" +} diff --git a/tests/resources/analysis_results/ref_reports/IrisU2D.txt b/tests/resources/analysis_results/ref_reports/IrisU2D.txt new file mode 100644 index 00000000..7600a3c3 --- /dev/null +++ b/tests/resources/analysis_results/ref_reports/IrisU2D.txt @@ -0,0 +1,1159 @@ +Tool Khiops +Version 10.5.0-a1 +Short description + + +Report Preparation + +Dictionary Iris +Variables + Categorical 4 + Numerical 8 + Total 12 +Database ../../../datasets/Iris/Iris.txt +Sample percentage 70 +Sampling mode Include sample +Selection variable +Selection value +Instances 105 +Learning task Unsupervised analysis +Evaluated variables 12 +Informative variables 0 +Max number of constructed variables 0 +Max number of trees 0 +Max number of variable pairs 100 +Discretization MODL +Value grouping MODL + +Variable statistics +Rank Name Type Level Target parts Parts Values Min Max Mean StdDev Missing number Mode Mode frequency Construction cost Preparation cost Data cost Derivation rule +R01 Class Categorical 3 3 Iris-setosa 38 3.17805 +R02 Class1 Categorical 2 2 67 3.17805 IfC(EQc(Class, "Iris-setosa"), "setosa", "") +R03 Class2 Categorical 2 2 73 3.17805 IfC(EQc(Class, "Iris-versicolor"), "versicolor", "") +R04 Dummy1 Numerical 1 1 0 0 0 0 0 3.17805 Copy(0) +R05 Dummy2 Numerical 1 105 0.005121241265 0.9859650261 0.5173966838 0.2650019122 0 3.17805 Random() +R06 LowerPetalLength Numerical 4 10 1 3 2.446666667 0.7433600251 0 3.17805 If(LE(PetalLength, 3), PetalLength, 3) +R07 PetalLength Numerical 5 36 1 6.9 3.686666667 1.80132579 0 3.17805 +R08 PetalWidth Numerical 5 21 0.1 2.5 1.175238095 0.7880996979 0 3.17805 +R09 SPetalLength Categorical 5 5 1 38 3.17805 AsCategorical(Floor(PetalLength)) +R10 SepalLength Numerical 2 31 4.3 7.7 5.827619048 0.8375127846 0 3.17805 +R11 SepalWidth Numerical 3 23 2 4.4 3.081904762 0.4284592446 0 3.17805 +R12 UpperPetalWidth Numerical 2 11 1.5 2.5 1.692380952 0.2962287527 0 3.17805 If(GE(PetalWidth, 1.5), PetalWidth, 1.5) + +Detailed variable statistics + +Rank R01 Class Categorical + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-setosa} Iris-setosa + {Iris-virginica} Iris-virginica + {Iris-versicolor} Iris-versicolor * +Cells +Value group Frequency +{Iris-setosa} 38 +{Iris-virginica} 35 +{Iris-versicolor} 32 + +Input values + Iris-setosa 38 + Iris-virginica 35 + Iris-versicolor 32 + +Rank R02 Class1 Categorical + +Data grid Unsupervised +Dimensions +Class1 Categorical Value groups + {} + {setosa} setosa * +Cells +Value group Frequency +{} 67 +{setosa} 38 + +Input values + 67 + setosa 38 + +Rank R03 Class2 Categorical + +Data grid Unsupervised +Dimensions +Class2 
Categorical Value groups + {} + {versicolor} versicolor * +Cells +Value group Frequency +{} 73 +{versicolor} 32 + +Input values + 73 + versicolor 32 + +Rank R05 Dummy2 Numerical + +Data grid Unsupervised +Dimensions +Dummy2 Numerical Intervals + ]-inf;+inf[ 0.00390625 1 +Cells +Interval Frequency +]-inf;+inf[ 105 + +Rank R06 LowerPetalLength Numerical + +Data grid Unsupervised +Dimensions +LowerPetalLength Numerical Intervals + ]-inf;1.25] 0.95 1.25 + ]1.25;1.75] 1.25 1.75 + ]1.75;2.95] 1.75 2.95 + ]2.95;+inf[ 2.95 3.05 +Cells +Interval Frequency +]-inf;1.25] 4 +]1.25;1.75] 32 +]1.75;2.95] 2 +]2.95;+inf[ 67 + +Rank R07 PetalLength Numerical + +Data grid Unsupervised +Dimensions +PetalLength Numerical Intervals + ]-inf;1.25] 0.95 1.25 + ]1.25;1.75] 1.25 1.75 + ]1.75;3.85] 1.75 3.85 + ]3.85;6.15] 3.85 6.15 + ]6.15;+inf[ 6.15 6.95 +Cells +Interval Frequency +]-inf;1.25] 4 +]1.25;1.75] 32 +]1.75;3.85] 7 +]3.85;6.15] 59 +]6.15;+inf[ 3 + +Rank R08 PetalWidth Numerical + +Data grid Unsupervised +Dimensions +PetalWidth Numerical Intervals + ]-inf;0.15] 0.05 0.15 + ]0.15;0.25] 0.15 0.25 + ]0.25;0.45] 0.25 0.45 + ]0.45;0.85] 0.45 0.85 + ]0.85;+inf[ 0.85 2.55 +Cells +Interval Frequency +]-inf;0.15] 6 +]0.15;0.25] 20 +]0.25;0.45] 11 +]0.45;0.85] 1 +]0.85;+inf[ 67 + +Rank R09 SPetalLength Categorical + +Data grid Unsupervised +Dimensions +SPetalLength Categorical Value groups + {1} 1 + {5} 5 + {4} 4 + {3} 3 + {6} 6 * +Cells +Value group Frequency +{1} 38 +{5} 27 +{4} 25 +{3} 8 +{6} 7 + +Input values + 1 38 + 5 27 + 4 25 + 3 8 + 6 7 + +Rank R10 SepalLength Numerical + +Data grid Unsupervised +Dimensions +SepalLength Numerical Intervals + ]-inf;7] 4.25 7 + ]7;+inf[ 7 7.75 +Cells +Interval Frequency +]-inf;7] 97 +]7;+inf[ 8 + +Rank R11 SepalWidth Numerical + +Data grid Unsupervised +Dimensions +SepalWidth Numerical Intervals + ]-inf;2.45] 1.95 2.45 + ]2.45;3.25] 2.45 3.25 + ]3.25;+inf[ 3.25 4.45 +Cells +Interval Frequency +]-inf;2.45] 5 +]2.45;3.25] 72 +]3.25;+inf[ 28 + +Rank R12 UpperPetalWidth Numerical + +Data grid Unsupervised +Dimensions +UpperPetalWidth Numerical Intervals + ]-inf;1.55] 1.45 1.55 + ]1.55;+inf[ 1.55 2.55 +Cells +Interval Frequency +]-inf;1.55] 67 +]1.55;+inf[ 38 + + +Report Bivariate preparation + +Dictionary Iris +Variables + Categorical 4 + Numerical 8 + Total 12 +Database ../../../datasets/Iris/Iris.txt +Sample percentage 70 +Sampling mode Include sample +Selection variable +Selection value +Instances 105 +Learning task Unsupervised analysis +Evaluated variable pairs 55 +Informative variable pairs 38 + +Variable pair statistics +Rank Name 1 Name 2 Level Variables Parts 1 Parts 2 Cells Construction cost Preparation cost Data cost +R01 Class Class1 0.286471 2 2 2 2 6.71557 18.9311 110.25 +R02 Class Class2 0.270234 2 2 2 2 6.71557 19.0156 110.25 +R03 Class SPetalLength 0.258511 2 3 3 5 6.71557 41.7647 157.188 +R04 Class1 SPetalLength 0.231831 2 2 2 2 6.71557 27.2099 142.253 +R05 PetalLength SPetalLength 0.151582 2 5 5 5 6.71557 69.091 386.913 +R06 Class2 SPetalLength 0.142436 2 2 2 4 6.71557 27.7273 158.704 +R07 Class PetalWidth 0.14197 2 3 3 5 6.71557 31.1679 396.708 +R08 Class PetalLength 0.136908 2 3 3 5 6.71557 31.1679 399.272 +R09 Class1 LowerPetalLength 0.111506 2 2 2 2 6.71557 13.7255 386.913 +R10 Class1 PetalLength 0.111506 2 2 2 2 6.71557 13.7255 386.913 +R11 Class1 PetalWidth 0.111506 2 2 2 2 6.71557 13.7255 386.913 +R12 PetalWidth SPetalLength 0.109807 2 3 3 5 6.71557 40.5555 438.232 +R13 Class LowerPetalLength 0.0982915 2 2 2 2 6.71557 19.0436 430.955 +R14 
LowerPetalLength SPetalLength 0.0887331 2 2 2 2 6.71557 27.3225 462.959 +R15 PetalLength PetalWidth 0.0785935 2 3 3 4 6.71557 29.9587 676.972 +R16 Class UpperPetalWidth 0.0721164 2 2 2 4 6.71557 19.0868 444.17 +R17 PetalWidth UpperPetalWidth 0.0703191 2 3 3 3 6.71557 29.9587 683.381 +R18 LowerPetalLength PetalLength 0.0701201 2 3 3 3 6.71557 29.9587 683.535 +R19 Class2 PetalWidth 0.0662843 2 2 3 5 6.71557 20.8147 396.708 +R20 SPetalLength SepalLength 0.0654694 2 3 4 5 6.71557 49.4973 453.472 +R21 Class2 PetalLength 0.0606416 2 2 3 5 6.71557 20.8147 399.272 +R22 LowerPetalLength PetalWidth 0.0598398 2 2 2 2 6.71557 13.838 707.618 +R23 Class SepalLength 0.059526 2 3 3 7 6.71557 31.1679 438.466 +R24 Class1 Class2 0.0559199 2 2 2 3 6.71557 13.6129 110.25 +R25 Class1 SepalLength 0.0531576 2 2 2 4 6.71557 13.7255 413.664 +R26 SPetalLength UpperPetalWidth 0.0466723 2 3 2 5 6.71557 31.8478 481.373 +R27 PetalLength SepalLength 0.0407398 2 4 4 8 6.71557 47.7303 688.519 +R28 PetalLength UpperPetalWidth 0.0401281 2 2 2 3 6.71557 13.838 722.885 +R29 PetalWidth SepalLength 0.0303985 2 3 3 8 6.71557 29.9587 714.3 +R30 LowerPetalLength SepalLength 0.0253003 2 2 2 4 6.71557 13.838 734.369 +R31 Class1 UpperPetalWidth 0.0166012 2 2 2 3 6.71557 13.7255 430.424 +R32 SepalLength UpperPetalWidth 0.0164148 2 2 2 4 6.71557 13.838 741.251 +R33 Class1 SepalWidth 0.00749643 2 2 3 5 6.71557 20.8147 427.509 +R34 Class2 LowerPetalLength 0.0065114 2 2 2 3 6.71557 13.7255 430.955 +R35 Class SepalWidth 0.00543684 2 3 2 6 6.71557 22.1365 474.893 +R36 LowerPetalLength UpperPetalWidth 0.00366071 2 2 2 3 6.71557 13.838 751.129 +R37 PetalWidth SepalWidth 0.00221737 2 3 2 6 6.71557 20.9273 745.158 +R38 SPetalLength SepalWidth 0.00143264 2 3 3 9 6.71557 40.2319 497.662 +R39 Class Dummy2 0 0 1 1 1 0.693147 8.64312 497.163 +R40 Class1 Dummy2 0 0 1 1 1 0.693147 4.66344 453.12 +R41 Class2 Dummy2 0 0 1 1 1 0.693147 4.66344 448.998 +R42 Class2 SepalLength 0 0 1 1 1 0.693147 4.66344 448.998 +R43 Class2 SepalWidth 0 0 1 1 1 0.693147 4.66344 448.998 +R44 Class2 UpperPetalWidth 0 0 1 1 1 0.693147 4.66344 448.998 +R45 Dummy2 LowerPetalLength 0 0 1 1 1 0.693147 0 773.825 +R46 Dummy2 PetalLength 0 0 1 1 1 0.693147 0 773.825 +R47 Dummy2 PetalWidth 0 0 1 1 1 0.693147 0 773.825 +R48 Dummy2 SPetalLength 0 0 1 1 1 0.693147 15.5317 529.166 +R49 Dummy2 SepalLength 0 0 1 1 1 0.693147 0 773.825 +R50 Dummy2 SepalWidth 0 0 1 1 1 0.693147 0 773.825 +R51 Dummy2 UpperPetalWidth 0 0 1 1 1 0.693147 0 773.825 +R52 LowerPetalLength SepalWidth 0 0 1 1 1 0.693147 0 773.825 +R53 PetalLength SepalWidth 0 0 1 1 1 0.693147 0 773.825 +R54 SepalLength SepalWidth 0 0 1 1 1 0.693147 0 773.825 +R55 SepalWidth UpperPetalWidth 0 0 1 1 1 0.693147 0 773.825 + +Detailed variable pair statistics + +Rank R01 + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-virginica, Iris-versicolor} Iris-virginica Iris-versicolor * + {Iris-setosa} Iris-setosa +Class1 Categorical Value groups + {} + {setosa} setosa * +Cells +Cell id Class Class1 Frequency +C1 {Iris-virginica, Iris-versicolor} {} 67 +C4 {Iris-setosa} {setosa} 38 +Confusion matrix + {Iris-virginica, Iris-versicolor} {Iris-setosa} +{} 67 0 +{setosa} 0 38 + +Rank R02 + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-setosa, Iris-virginica} Iris-setosa Iris-virginica + {Iris-versicolor} Iris-versicolor * +Class2 Categorical Value groups + {} + {versicolor} versicolor * +Cells +Cell id Class Class2 Frequency +C1 {Iris-setosa, Iris-virginica} {} 73 +C4 {Iris-versicolor} 
{versicolor} 32 +Confusion matrix + {Iris-setosa, Iris-virginica} {Iris-versicolor} +{} 73 0 +{versicolor} 0 32 + +Rank R03 + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-setosa} Iris-setosa + {Iris-virginica} Iris-virginica + {Iris-versicolor} Iris-versicolor * +SPetalLength Categorical Value groups + {1} 1 + {5, 6} 5 6 * + {4, 3} 4 3 +Cells +Cell id Class SPetalLength Frequency +C1 {Iris-setosa} {1} 38 +C5 {Iris-virginica} {5, 6} 32 +C6 {Iris-versicolor} {5, 6} 2 +C8 {Iris-virginica} {4, 3} 3 +C9 {Iris-versicolor} {4, 3} 30 +Confusion matrix + {Iris-setosa} {Iris-virginica} {Iris-versicolor} +{1} 38 0 0 +{5, 6} 0 32 2 +{4, 3} 0 3 30 + +Rank R04 + +Data grid Unsupervised +Dimensions +Class1 Categorical Value groups + {} + {setosa} setosa * +SPetalLength Categorical Value groups + {5, 4, 3, ...} 5 4 3 6 * + {1} 1 +Cells +Cell id Class1 SPetalLength Frequency +C1 {} {5, 4, 3, ...} 67 +C4 {setosa} {1} 38 +Confusion matrix + {} {setosa} +{5, 4, 3, ...} 67 0 +{1} 0 38 + +Rank R05 + +Data grid Unsupervised +Dimensions +PetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;3.95] 2.4 3.95 + ]3.95;4.95] 3.95 4.95 + ]4.95;5.95] 4.95 5.95 + ]5.95;+inf[ 5.95 6.9 +SPetalLength Categorical Value groups + {1} 1 + {5} 5 + {4} 4 + {3} 3 + {6} 6 * +Cells +Cell id PetalLength SPetalLength Frequency +C1 ]-inf;2.4] {1} 38 +C9 ]4.95;5.95] {5} 27 +C13 ]3.95;4.95] {4} 25 +C17 ]2.4;3.95] {3} 8 +C25 ]5.95;+inf[ {6} 7 +Confusion matrix + ]-inf;2.4] ]2.4;3.95] ]3.95;4.95] ]4.95;5.95] ]5.95;+inf[ +{1} 38 0 0 0 0 +{5} 0 0 0 27 0 +{4} 0 0 25 0 0 +{3} 0 8 0 0 0 +{6} 0 0 0 0 7 + +Rank R06 + +Data grid Unsupervised +Dimensions +Class2 Categorical Value groups + {} + {versicolor} versicolor * +SPetalLength Categorical Value groups + {1, 5, 6} 1 5 6 * + {4, 3} 4 3 +Cells +Cell id Class2 SPetalLength Frequency +C1 {} {1, 5, 6} 70 +C2 {versicolor} {1, 5, 6} 2 +C3 {} {4, 3} 3 +C4 {versicolor} {4, 3} 30 +Confusion matrix + {} {versicolor} +{1, 5, 6} 70 2 +{4, 3} 3 30 + +Rank R07 + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-setosa} Iris-setosa + {Iris-virginica} Iris-virginica + {Iris-versicolor} Iris-versicolor * +PetalWidth Numerical Intervals + ]-inf;0.75] 0.1 0.75 + ]0.75;1.75] 0.75 1.75 + ]1.75;+inf[ 1.75 2.5 +Cells +Cell id Class PetalWidth Frequency +C1 {Iris-setosa} ]-inf;0.75] 38 +C5 {Iris-virginica} ]0.75;1.75] 2 +C6 {Iris-versicolor} ]0.75;1.75] 31 +C8 {Iris-virginica} ]1.75;+inf[ 33 +C9 {Iris-versicolor} ]1.75;+inf[ 1 +Confusion matrix + {Iris-setosa} {Iris-virginica} {Iris-versicolor} +]-inf;0.75] 38 0 0 +]0.75;1.75] 0 2 31 +]1.75;+inf[ 0 33 1 + +Rank R08 + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-setosa} Iris-setosa + {Iris-virginica} Iris-virginica + {Iris-versicolor} Iris-versicolor * +PetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;4.85] 2.4 4.85 + ]4.85;+inf[ 4.85 6.9 +Cells +Cell id Class PetalLength Frequency +C1 {Iris-setosa} ]-inf;2.4] 38 +C5 {Iris-virginica} ]2.4;4.85] 1 +C6 {Iris-versicolor} ]2.4;4.85] 29 +C8 {Iris-virginica} ]4.85;+inf[ 34 +C9 {Iris-versicolor} ]4.85;+inf[ 3 +Confusion matrix + {Iris-setosa} {Iris-virginica} {Iris-versicolor} +]-inf;2.4] 38 0 0 +]2.4;4.85] 0 1 29 +]4.85;+inf[ 0 34 3 + +Rank R09 + +Data grid Unsupervised +Dimensions +Class1 Categorical Value groups + {} + {setosa} setosa * +LowerPetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;+inf[ 2.4 3 +Cells +Cell id Class1 LowerPetalLength Frequency +C2 {setosa} ]-inf;2.4] 38 +C3 {} ]2.4;+inf[ 67 +Confusion matrix 
+ {} {setosa} +]-inf;2.4] 0 38 +]2.4;+inf[ 67 0 + +Rank R10 + +Data grid Unsupervised +Dimensions +Class1 Categorical Value groups + {} + {setosa} setosa * +PetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;+inf[ 2.4 6.9 +Cells +Cell id Class1 PetalLength Frequency +C2 {setosa} ]-inf;2.4] 38 +C3 {} ]2.4;+inf[ 67 +Confusion matrix + {} {setosa} +]-inf;2.4] 0 38 +]2.4;+inf[ 67 0 + +Rank R11 + +Data grid Unsupervised +Dimensions +Class1 Categorical Value groups + {} + {setosa} setosa * +PetalWidth Numerical Intervals + ]-inf;0.75] 0.1 0.75 + ]0.75;+inf[ 0.75 2.5 +Cells +Cell id Class1 PetalWidth Frequency +C2 {setosa} ]-inf;0.75] 38 +C3 {} ]0.75;+inf[ 67 +Confusion matrix + {} {setosa} +]-inf;0.75] 0 38 +]0.75;+inf[ 67 0 + +Rank R12 + +Data grid Unsupervised +Dimensions +PetalWidth Numerical Intervals + ]-inf;0.75] 0.1 0.75 + ]0.75;1.65] 0.75 1.65 + ]1.65;+inf[ 1.65 2.5 +SPetalLength Categorical Value groups + {1} 1 + {5, 6} 5 6 * + {4, 3} 4 3 +Cells +Cell id PetalWidth SPetalLength Frequency +C1 ]-inf;0.75] {1} 38 +C5 ]0.75;1.65] {5, 6} 3 +C6 ]1.65;+inf[ {5, 6} 31 +C8 ]0.75;1.65] {4, 3} 29 +C9 ]1.65;+inf[ {4, 3} 4 +Confusion matrix + ]-inf;0.75] ]0.75;1.65] ]1.65;+inf[ +{1} 38 0 0 +{5, 6} 0 3 31 +{4, 3} 0 29 4 + +Rank R13 + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-virginica, Iris-versicolor} Iris-virginica Iris-versicolor * + {Iris-setosa} Iris-setosa +LowerPetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;+inf[ 2.4 3 +Cells +Cell id Class LowerPetalLength Frequency +C2 {Iris-setosa} ]-inf;2.4] 38 +C3 {Iris-virginica, Iris-versicolor} ]2.4;+inf[ 67 +Confusion matrix + {Iris-virginica, Iris-versicolor} {Iris-setosa} +]-inf;2.4] 0 38 +]2.4;+inf[ 67 0 + +Rank R14 + +Data grid Unsupervised +Dimensions +LowerPetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;+inf[ 2.4 3 +SPetalLength Categorical Value groups + {5, 4, 3, ...} 5 4 3 6 * + {1} 1 +Cells +Cell id LowerPetalLength SPetalLength Frequency +C2 ]2.4;+inf[ {5, 4, 3, ...} 67 +C3 ]-inf;2.4] {1} 38 +Confusion matrix + ]-inf;2.4] ]2.4;+inf[ +{5, 4, 3, ...} 0 67 +{1} 38 0 + +Rank R15 + +Data grid Unsupervised +Dimensions +PetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;4.75] 2.4 4.75 + ]4.75;+inf[ 4.75 6.9 +PetalWidth Numerical Intervals + ]-inf;0.75] 0.1 0.75 + ]0.75;1.65] 0.75 1.65 + ]1.65;+inf[ 1.65 2.5 +Cells +Cell id PetalLength PetalWidth Frequency +C1 ]-inf;2.4] ]-inf;0.75] 38 +C5 ]2.4;4.75] ]0.75;1.65] 27 +C6 ]4.75;+inf[ ]0.75;1.65] 5 +C9 ]4.75;+inf[ ]1.65;+inf[ 35 +Confusion matrix + ]-inf;2.4] ]2.4;4.75] ]4.75;+inf[ +]-inf;0.75] 38 0 0 +]0.75;1.65] 0 27 5 +]1.65;+inf[ 0 0 35 + +Rank R16 + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-setosa, Iris-versicolor} Iris-setosa Iris-versicolor * + {Iris-virginica} Iris-virginica +UpperPetalWidth Numerical Intervals + ]-inf;1.75] 1.5 1.75 + ]1.75;+inf[ 1.75 2.5 +Cells +Cell id Class UpperPetalWidth Frequency +C1 {Iris-setosa, Iris-versicolor} ]-inf;1.75] 69 +C2 {Iris-virginica} ]-inf;1.75] 2 +C3 {Iris-setosa, Iris-versicolor} ]1.75;+inf[ 1 +C4 {Iris-virginica} ]1.75;+inf[ 33 +Confusion matrix + {Iris-setosa, Iris-versicolor} {Iris-virginica} +]-inf;1.75] 69 2 +]1.75;+inf[ 1 33 + +Rank R17 + +Data grid Unsupervised +Dimensions +PetalWidth Numerical Intervals + ]-inf;1.55] 0.1 1.55 + ]1.55;2.05] 1.55 2.05 + ]2.05;+inf[ 2.05 2.5 +UpperPetalWidth Numerical Intervals + ]-inf;1.55] 1.5 1.55 + ]1.55;2.05] 1.55 2.05 + ]2.05;+inf[ 2.05 2.5 +Cells +Cell id PetalWidth UpperPetalWidth Frequency +C1 ]-inf;1.55] 
]-inf;1.55] 67 +C5 ]1.55;2.05] ]1.55;2.05] 20 +C9 ]2.05;+inf[ ]2.05;+inf[ 18 +Confusion matrix + ]-inf;1.55] ]1.55;2.05] ]2.05;+inf[ +]-inf;1.55] 67 0 0 +]1.55;2.05] 0 20 0 +]2.05;+inf[ 0 0 18 + +Rank R18 + +Data grid Unsupervised +Dimensions +LowerPetalLength Numerical Intervals + ]-inf;1.45] 1 1.45 + ]1.45;2.4] 1.45 2.4 + ]2.4;+inf[ 2.4 3 +PetalLength Numerical Intervals + ]-inf;1.45] 1 1.45 + ]1.45;2.4] 1.45 2.4 + ]2.4;+inf[ 2.4 6.9 +Cells +Cell id LowerPetalLength PetalLength Frequency +C1 ]-inf;1.45] ]-inf;1.45] 17 +C5 ]1.45;2.4] ]1.45;2.4] 21 +C9 ]2.4;+inf[ ]2.4;+inf[ 67 +Confusion matrix + ]-inf;1.45] ]1.45;2.4] ]2.4;+inf[ +]-inf;1.45] 17 0 0 +]1.45;2.4] 0 21 0 +]2.4;+inf[ 0 0 67 + +Rank R19 + +Data grid Unsupervised +Dimensions +Class2 Categorical Value groups + {} + {versicolor} versicolor * +PetalWidth Numerical Intervals + ]-inf;0.75] 0.1 0.75 + ]0.75;1.75] 0.75 1.75 + ]1.75;+inf[ 1.75 2.5 +Cells +Cell id Class2 PetalWidth Frequency +C1 {} ]-inf;0.75] 38 +C3 {} ]0.75;1.75] 2 +C4 {versicolor} ]0.75;1.75] 31 +C5 {} ]1.75;+inf[ 33 +C6 {versicolor} ]1.75;+inf[ 1 +Confusion matrix + {} {versicolor} +]-inf;0.75] 38 0 +]0.75;1.75] 2 31 +]1.75;+inf[ 33 1 + +Rank R20 + +Data grid Unsupervised +Dimensions +SPetalLength Categorical Value groups + {5, 4} 5 4 + {1, 3} 1 3 + {6} 6 * +SepalLength Numerical Intervals + ]-inf;5.35] 4.3 5.35 + ]5.35;5.85] 5.35 5.85 + ]5.85;7.15] 5.85 7.15 + ]7.15;+inf[ 7.15 7.7 +Cells +Cell id SPetalLength SepalLength Frequency +C2 {1, 3} ]-inf;5.35] 34 +C4 {5, 4} ]5.35;5.85] 10 +C5 {1, 3} ]5.35;5.85] 12 +C7 {5, 4} ]5.85;7.15] 42 +C12 {6} ]7.15;+inf[ 7 +Confusion matrix + {5, 4} {1, 3} {6} +]-inf;5.35] 0 34 0 +]5.35;5.85] 10 12 0 +]5.85;7.15] 42 0 0 +]7.15;+inf[ 0 0 7 + +Rank R21 + +Data grid Unsupervised +Dimensions +Class2 Categorical Value groups + {} + {versicolor} versicolor * +PetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;4.85] 2.4 4.85 + ]4.85;+inf[ 4.85 6.9 +Cells +Cell id Class2 PetalLength Frequency +C1 {} ]-inf;2.4] 38 +C3 {} ]2.4;4.85] 1 +C4 {versicolor} ]2.4;4.85] 29 +C5 {} ]4.85;+inf[ 34 +C6 {versicolor} ]4.85;+inf[ 3 +Confusion matrix + {} {versicolor} +]-inf;2.4] 38 0 +]2.4;4.85] 1 29 +]4.85;+inf[ 34 3 + +Rank R22 + +Data grid Unsupervised +Dimensions +LowerPetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;+inf[ 2.4 3 +PetalWidth Numerical Intervals + ]-inf;0.75] 0.1 0.75 + ]0.75;+inf[ 0.75 2.5 +Cells +Cell id LowerPetalLength PetalWidth Frequency +C1 ]-inf;2.4] ]-inf;0.75] 38 +C4 ]2.4;+inf[ ]0.75;+inf[ 67 +Confusion matrix + ]-inf;2.4] ]2.4;+inf[ +]-inf;0.75] 38 0 +]0.75;+inf[ 0 67 + +Rank R23 + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-setosa} Iris-setosa + {Iris-virginica} Iris-virginica + {Iris-versicolor} Iris-versicolor * +SepalLength Numerical Intervals + ]-inf;5.45] 4.3 5.45 + ]5.45;6.15] 5.45 6.15 + ]6.15;+inf[ 6.15 7.7 +Cells +Cell id Class SepalLength Frequency +C1 {Iris-setosa} ]-inf;5.45] 34 +C3 {Iris-versicolor} ]-inf;5.45] 5 +C4 {Iris-setosa} ]5.45;6.15] 4 +C5 {Iris-virginica} ]5.45;6.15] 5 +C6 {Iris-versicolor} ]5.45;6.15] 19 +C8 {Iris-virginica} ]6.15;+inf[ 30 +C9 {Iris-versicolor} ]6.15;+inf[ 8 +Confusion matrix + {Iris-setosa} {Iris-virginica} {Iris-versicolor} +]-inf;5.45] 34 0 5 +]5.45;6.15] 4 5 19 +]6.15;+inf[ 0 30 8 + +Rank R24 + +Data grid Unsupervised +Dimensions +Class1 Categorical Value groups + {} + {setosa} setosa * +Class2 Categorical Value groups + {} + {versicolor} versicolor * +Cells +Cell id Class1 Class2 Frequency +C1 {} {} 35 +C2 {setosa} {} 38 +C3 {} 
{versicolor} 32 +Confusion matrix + {} {setosa} +{} 35 38 +{versicolor} 32 0 + +Rank R25 + +Data grid Unsupervised +Dimensions +Class1 Categorical Value groups + {} + {setosa} setosa * +SepalLength Numerical Intervals + ]-inf;5.45] 4.3 5.45 + ]5.45;+inf[ 5.45 7.7 +Cells +Cell id Class1 SepalLength Frequency +C1 {} ]-inf;5.45] 5 +C2 {setosa} ]-inf;5.45] 34 +C3 {} ]5.45;+inf[ 62 +C4 {setosa} ]5.45;+inf[ 4 +Confusion matrix + {} {setosa} +]-inf;5.45] 5 34 +]5.45;+inf[ 62 4 + +Rank R26 + +Data grid Unsupervised +Dimensions +SPetalLength Categorical Value groups + {1, 3} 1 3 + {5, 6} 5 6 * + {4} 4 +UpperPetalWidth Numerical Intervals + ]-inf;1.55] 1.5 1.55 + ]1.55;+inf[ 1.55 2.5 +Cells +Cell id SPetalLength UpperPetalWidth Frequency +C1 {1, 3} ]-inf;1.55] 46 +C2 {5, 6} ]-inf;1.55] 2 +C3 {4} ]-inf;1.55] 19 +C5 {5, 6} ]1.55;+inf[ 32 +C6 {4} ]1.55;+inf[ 6 +Confusion matrix + {1, 3} {5, 6} {4} +]-inf;1.55] 46 2 19 +]1.55;+inf[ 0 32 6 + +Rank R27 + +Data grid Unsupervised +Dimensions +PetalLength Numerical Intervals + ]-inf;3.55] 1 3.55 + ]3.55;4.6] 3.55 4.6 + ]4.6;5.95] 4.6 5.95 + ]5.95;+inf[ 5.95 6.9 +SepalLength Numerical Intervals + ]-inf;5.45] 4.3 5.45 + ]5.45;5.85] 5.45 5.85 + ]5.85;7.15] 5.85 7.15 + ]7.15;+inf[ 7.15 7.7 +Cells +Cell id PetalLength SepalLength Frequency +C1 ]-inf;3.55] ]-inf;5.45] 37 +C2 ]3.55;4.6] ]-inf;5.45] 2 +C5 ]-inf;3.55] ]5.45;5.85] 5 +C6 ]3.55;4.6] ]5.45;5.85] 10 +C7 ]4.6;5.95] ]5.45;5.85] 2 +C10 ]3.55;4.6] ]5.85;7.15] 7 +C11 ]4.6;5.95] ]5.85;7.15] 35 +C16 ]5.95;+inf[ ]7.15;+inf[ 7 +Confusion matrix + ]-inf;3.55] ]3.55;4.6] ]4.6;5.95] ]5.95;+inf[ +]-inf;5.45] 37 2 0 0 +]5.45;5.85] 5 10 2 0 +]5.85;7.15] 0 7 35 0 +]7.15;+inf[ 0 0 0 7 + +Rank R28 + +Data grid Unsupervised +Dimensions +PetalLength Numerical Intervals + ]-inf;4.75] 1 4.75 + ]4.75;+inf[ 4.75 6.9 +UpperPetalWidth Numerical Intervals + ]-inf;1.65] 1.5 1.65 + ]1.65;+inf[ 1.65 2.5 +Cells +Cell id PetalLength UpperPetalWidth Frequency +C1 ]-inf;4.75] ]-inf;1.65] 65 +C2 ]4.75;+inf[ ]-inf;1.65] 5 +C4 ]4.75;+inf[ ]1.65;+inf[ 35 +Confusion matrix + ]-inf;4.75] ]4.75;+inf[ +]-inf;1.65] 65 5 +]1.65;+inf[ 0 35 + +Rank R29 + +Data grid Unsupervised +Dimensions +PetalWidth Numerical Intervals + ]-inf;0.75] 0.1 0.75 + ]0.75;1.35] 0.75 1.35 + ]1.35;+inf[ 1.35 2.5 +SepalLength Numerical Intervals + ]-inf;5.45] 4.3 5.45 + ]5.45;5.85] 5.45 5.85 + ]5.85;+inf[ 5.85 7.7 +Cells +Cell id PetalWidth SepalLength Frequency +C1 ]-inf;0.75] ]-inf;5.45] 34 +C2 ]0.75;1.35] ]-inf;5.45] 3 +C3 ]1.35;+inf[ ]-inf;5.45] 2 +C4 ]-inf;0.75] ]5.45;5.85] 4 +C5 ]0.75;1.35] ]5.45;5.85] 10 +C6 ]1.35;+inf[ ]5.45;5.85] 3 +C8 ]0.75;1.35] ]5.85;+inf[ 5 +C9 ]1.35;+inf[ ]5.85;+inf[ 44 +Confusion matrix + ]-inf;0.75] ]0.75;1.35] ]1.35;+inf[ +]-inf;5.45] 34 3 2 +]5.45;5.85] 4 10 3 +]5.85;+inf[ 0 5 44 + +Rank R30 + +Data grid Unsupervised +Dimensions +LowerPetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;+inf[ 2.4 3 +SepalLength Numerical Intervals + ]-inf;5.45] 4.3 5.45 + ]5.45;+inf[ 5.45 7.7 +Cells +Cell id LowerPetalLength SepalLength Frequency +C1 ]-inf;2.4] ]-inf;5.45] 34 +C2 ]2.4;+inf[ ]-inf;5.45] 5 +C3 ]-inf;2.4] ]5.45;+inf[ 4 +C4 ]2.4;+inf[ ]5.45;+inf[ 62 +Confusion matrix + ]-inf;2.4] ]2.4;+inf[ +]-inf;5.45] 34 5 +]5.45;+inf[ 4 62 + +Rank R31 + +Data grid Unsupervised +Dimensions +Class1 Categorical Value groups + {} + {setosa} setosa * +UpperPetalWidth Numerical Intervals + ]-inf;1.55] 1.5 1.55 + ]1.55;+inf[ 1.55 2.5 +Cells +Cell id Class1 UpperPetalWidth Frequency +C1 {} ]-inf;1.55] 29 +C2 {setosa} ]-inf;1.55] 38 +C3 {} ]1.55;+inf[ 38 
+Confusion matrix + {} {setosa} +]-inf;1.55] 29 38 +]1.55;+inf[ 38 0 + +Rank R32 + +Data grid Unsupervised +Dimensions +SepalLength Numerical Intervals + ]-inf;5.85] 4.3 5.85 + ]5.85;+inf[ 5.85 7.7 +UpperPetalWidth Numerical Intervals + ]-inf;1.55] 1.5 1.55 + ]1.55;+inf[ 1.55 2.5 +Cells +Cell id SepalLength UpperPetalWidth Frequency +C1 ]-inf;5.85] ]-inf;1.55] 54 +C2 ]5.85;+inf[ ]-inf;1.55] 13 +C3 ]-inf;5.85] ]1.55;+inf[ 2 +C4 ]5.85;+inf[ ]1.55;+inf[ 36 +Confusion matrix + ]-inf;5.85] ]5.85;+inf[ +]-inf;1.55] 54 13 +]1.55;+inf[ 2 36 + +Rank R33 + +Data grid Unsupervised +Dimensions +Class1 Categorical Value groups + {} + {setosa} setosa * +SepalWidth Numerical Intervals + ]-inf;2.85] 2 2.85 + ]2.85;3.35] 2.85 3.35 + ]3.35;+inf[ 3.35 4.4 +Cells +Cell id Class1 SepalWidth Frequency +C1 {} ]-inf;2.85] 30 +C3 {} ]2.85;3.35] 32 +C4 {setosa} ]2.85;3.35] 17 +C5 {} ]3.35;+inf[ 5 +C6 {setosa} ]3.35;+inf[ 21 +Confusion matrix + {} {setosa} +]-inf;2.85] 30 0 +]2.85;3.35] 32 17 +]3.35;+inf[ 5 21 + +Rank R34 + +Data grid Unsupervised +Dimensions +Class2 Categorical Value groups + {} + {versicolor} versicolor * +LowerPetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;+inf[ 2.4 3 +Cells +Cell id Class2 LowerPetalLength Frequency +C1 {} ]-inf;2.4] 38 +C3 {} ]2.4;+inf[ 35 +C4 {versicolor} ]2.4;+inf[ 32 +Confusion matrix + {} {versicolor} +]-inf;2.4] 38 0 +]2.4;+inf[ 35 32 + +Rank R35 + +Data grid Unsupervised +Dimensions +Class Categorical Value groups + {Iris-setosa} Iris-setosa + {Iris-virginica} Iris-virginica + {Iris-versicolor} Iris-versicolor * +SepalWidth Numerical Intervals + ]-inf;2.95] 2 2.95 + ]2.95;+inf[ 2.95 4.4 +Cells +Cell id Class SepalWidth Frequency +C1 {Iris-setosa} ]-inf;2.95] 1 +C2 {Iris-virginica} ]-inf;2.95] 13 +C3 {Iris-versicolor} ]-inf;2.95] 22 +C4 {Iris-setosa} ]2.95;+inf[ 37 +C5 {Iris-virginica} ]2.95;+inf[ 22 +C6 {Iris-versicolor} ]2.95;+inf[ 10 +Confusion matrix + {Iris-setosa} {Iris-virginica} {Iris-versicolor} +]-inf;2.95] 1 13 22 +]2.95;+inf[ 37 22 10 + +Rank R36 + +Data grid Unsupervised +Dimensions +LowerPetalLength Numerical Intervals + ]-inf;2.4] 1 2.4 + ]2.4;+inf[ 2.4 3 +UpperPetalWidth Numerical Intervals + ]-inf;1.55] 1.5 1.55 + ]1.55;+inf[ 1.55 2.5 +Cells +Cell id LowerPetalLength UpperPetalWidth Frequency +C1 ]-inf;2.4] ]-inf;1.55] 38 +C2 ]2.4;+inf[ ]-inf;1.55] 29 +C4 ]2.4;+inf[ ]1.55;+inf[ 38 +Confusion matrix + ]-inf;2.4] ]2.4;+inf[ +]-inf;1.55] 38 29 +]1.55;+inf[ 0 38 + +Rank R37 + +Data grid Unsupervised +Dimensions +PetalWidth Numerical Intervals + ]-inf;0.75] 0.1 0.75 + ]0.75;1.45] 0.75 1.45 + ]1.45;+inf[ 1.45 2.5 +SepalWidth Numerical Intervals + ]-inf;2.95] 2 2.95 + ]2.95;+inf[ 2.95 4.4 +Cells +Cell id PetalWidth SepalWidth Frequency +C1 ]-inf;0.75] ]-inf;2.95] 1 +C2 ]0.75;1.45] ]-inf;2.95] 21 +C3 ]1.45;+inf[ ]-inf;2.95] 14 +C4 ]-inf;0.75] ]2.95;+inf[ 37 +C5 ]0.75;1.45] ]2.95;+inf[ 3 +C6 ]1.45;+inf[ ]2.95;+inf[ 29 +Confusion matrix + ]-inf;0.75] ]0.75;1.45] ]1.45;+inf[ +]-inf;2.95] 1 21 14 +]2.95;+inf[ 37 3 29 + +Rank R38 + +Data grid Unsupervised +Dimensions +SPetalLength Categorical Value groups + {4, 3, 6} 4 3 6 * + {1} 1 + {5} 5 +SepalWidth Numerical Intervals + ]-inf;2.95] 2 2.95 + ]2.95;3.25] 2.95 3.25 + ]3.25;+inf[ 3.25 4.4 +Cells +Cell id SPetalLength SepalWidth Frequency +C1 {4, 3, 6} ]-inf;2.95] 26 +C2 {1} ]-inf;2.95] 1 +C3 {5} ]-inf;2.95] 9 +C4 {4, 3, 6} ]2.95;3.25] 10 +C5 {1} ]2.95;3.25] 15 +C6 {5} ]2.95;3.25] 16 +C7 {4, 3, 6} ]3.25;+inf[ 4 +C8 {1} ]3.25;+inf[ 22 +C9 {5} ]3.25;+inf[ 2 +Confusion matrix + {4, 3, 6} {1} {5} +]-inf;2.95] 26 
1 9 +]2.95;3.25] 10 15 16 +]3.25;+inf[ 4 22 2 diff --git a/tests/test_core.py b/tests/test_core.py index 1bab6354..92ded09c 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -70,6 +70,7 @@ def test_analysis_results(self): "IrisMAPLegacy", "IrisR", "IrisU", + "IrisU2D", "LargeSpiral", "Latin", "LatinGreek", From 75e9a89ffe755a85e14607a0f09d406302276c14 Mon Sep 17 00:00:00 2001 From: Thierry RAMORASOAVINA Date: Mon, 18 Mar 2024 15:42:25 +0100 Subject: [PATCH 13/37] Install conda in the `khiopspydev` docker image for all supported Python versions - the purpose is to run the unit tests on the supported python versions from 3.8 to 3.12 - miniconda3 was chosen to keep the docker image light - the conda envs follow the name pattern py$version (where version is the python3 version) - package installation and command execution within a conda env are performed without activating it (this avoids annoying little issues) --- .github/workflows/dev-docker.yml | 6 +++ .github/workflows/unit-tests.yml | 39 ++++++++++++------- packaging/docker/khiopspydev/Dockerfile.rocky | 18 ++++++++- .../docker/khiopspydev/Dockerfile.ubuntu | 16 +++++++- 4 files changed, 62 insertions(+), 17 deletions(-) diff --git a/.github/workflows/dev-docker.yml b/.github/workflows/dev-docker.yml index 41265080..522662fd 100644 --- a/.github/workflows/dev-docker.yml +++ b/.github/workflows/dev-docker.yml @@ -3,6 +3,7 @@ name: Dev Docker env: DEFAULT_KHIOPS_REVISION: main DEFAULT_SERVER_REVISION: main + DEFAULT_PYTHON_VERSIONS: 3.8 3.9 3.10 3.11 3.12 on: pull_request: paths: [packaging/docker/khiopspydev/Dockerfile.*, .github/workflows/dev-docker.yml] @@ -20,6 +21,10 @@ on: type: boolean default: true description: Push to GH Registry + python-versions: + type: string + default: 3.8 3.9 3.10 3.11 3.12 + description: Python versions supported by khiops-python concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true @@ -60,6 +65,7 @@ jobs: "KHIOPS_REVISION=${{ env.KHIOPS_REVISION }}" "KHIOPSDEV_OS=${{ matrix.khiopsdev-os }}" "SERVER_REVISION=${{ env.SERVER_REVISION }}" + "PYTHON_VERSIONS=${{ inputs.python-versions || env.DEFAULT_PYTHON_VERSIONS }}" tags: ghcr.io/khiopsml/khiops-python/khiopspydev-${{ matrix.khiopsdev-os }}:latest # Push only on manual request push: ${{ inputs.push || false }} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 296ee95a..9254cd82 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -20,6 +20,10 @@ concurrency: jobs: run: runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] container: image: ghcr.io/khiopsml/khiops-python/khiopspydev-ubuntu22.04:latest credentials: @@ -55,14 +59,19 @@ jobs: - name: Setup and Install Test Requirements if: success() || failure() run: | - mkdir -p -m u+rwx ${{ github.workspace }}/reports - pip install unittest-xml-reporting - pip install -r test-requirements.txt + mkdir -p -m u+rwx reports/py${{ matrix.python-version }} + # install within the conda environment without activating it + /root/miniconda3/bin/conda install -y -n py${{ matrix.python-version }} -c conda-forge unittest-xml-reporting + /root/miniconda3/bin/conda install -y -n py${{ matrix.python-version }} --file test-requirements.txt - name: Install khiops-python dependencies if: success() || failure() run: | - python setup.py egg_info - pip install `grep -v "^\[" khiops.egg-info/requires.txt` + # The following git
command is required, + # as the Git repository is in a directory the current user does not own; + # otherwise, Python versioneer fails to compute the current version correctly + git config --global --add safe.directory $(realpath .) + /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }} python setup.py egg_info + /root/miniconda3/bin/conda install -y -n py${{ matrix.python-version }} `grep -v "^\[" khiops.egg-info/requires.txt` rm -rf khiops.egg-info - name: Prepare Unit Tests Environment if: github.ref != 'dev' && github.ref != 'main' && ! inputs.run-long-tests @@ -77,23 +86,23 @@ jobs: # This is needed so that the Git tag is parsed and the khiops-python # version is retrieved git config --global --add safe.directory $(realpath .) - coverage run -m xmlrunner -o "reports" -v - coverage report -m - coverage xml -o "reports/py-coverage.xml" + /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }} coverage run -m xmlrunner -o "reports/py${{ matrix.python-version }}" -v + /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }} coverage report -m + /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }} coverage xml -o "reports/py${{ matrix.python-version }}/py-coverage.xml" - name: Display Unit Test Reports uses: dorny/test-reporter@v1 with: - name: Unit Tests - path: reports/TEST-tests.*.*.xml + name: Unit Tests ${{ matrix.python-version }} + path: reports/py${{ matrix.python-version }}/TEST-tests.*.*.xml reporter: java-junit path-replace-backslashes: 'true' # Necessary for windows paths - name: Upload Test Reports as Artifacts uses: actions/upload-artifact@v4 with: - name: test-reports + name: test-reports-${{ matrix.python-version }} path: |- - reports/TEST-tests.*.*.xml - reports/py-coverage.xml + reports/py${{ matrix.python-version }}/TEST-tests.*.*.xml + reports/py${{ matrix.python-version }}/py-coverage.xml tests/resources/scenario_generation/*/ref/*._kh tests/resources/scenario_generation/*/output/*._kh tests/resources/*/output_reports/*.txt @@ -128,6 +137,10 @@ jobs: fetch-depth: 0 - name: Install khiops-python dev dependencies run: | + # The following git command is required, + # as the Git repository is in a directory the current user does not own; + # otherwise, Python versioneer fails to compute the current version correctly + git config --global --add safe.directory $(realpath .)
python setup.py egg_info pip install `grep -v "^\[" khiops.egg-info/requires.txt` rm -rf khiops.egg-info diff --git a/packaging/docker/khiopspydev/Dockerfile.rocky b/packaging/docker/khiopspydev/Dockerfile.rocky index df4d3dc9..7eef216e 100644 --- a/packaging/docker/khiopspydev/Dockerfile.rocky +++ b/packaging/docker/khiopspydev/Dockerfile.rocky @@ -8,8 +8,9 @@ LABEL description="Container for the development of khiops-python" # Reuse KHIOPSDEV_OS from previous stage ARG KHIOPSDEV_OS ARG KHIOPS_REVISION - -# Install dev tools; build and install Khiops; set mpich as the default MPI +# - Install dev tools and miniconda3 (for the unit tests) +# - Build and install Khiops +# - Set mpich as the default MPI RUN true \ && useradd -rm -d /home/rocky -s /bin/bash -g root -u 1000 rocky \ # Install git (for khiops-python version calculation), pandoc and pip \ @@ -18,6 +19,7 @@ RUN true \ && dnf install --enablerepo=devel -y \ git \ pandoc \ + wget \ # Install Python 3.11 if on Rocky 8 \ && if [ "$KHIOPSDEV_OS" = "rocky8" ]; then \ dnf install -y \ @@ -55,11 +57,23 @@ RUN true \ alternatives --install /usr/bin/python python /usr/bin/python3 1 \ && alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 ; \ fi \ + # Install miniconda3 to have multiple Python versions via Conda \ + && mkdir -p /root/miniconda3 && cd /root/miniconda3 \ + && wget https://repo.anaconda.com/miniconda/Miniconda3-py312_24.1.2-0-Linux-x86_64.sh -O ./Miniconda3-py312_24.1.2-0-Linux-x86_64.sh \ + && echo "b978856ec3c826eb495b60e3fffe621f670c101150ebcbdeede4f961f22dc438 Miniconda3-py312_24.1.2-0-Linux-x86_64.sh" | sha256sum --check \ + && bash ./Miniconda3-py312_24.1.2-0-Linux-x86_64.sh -b -u -p /root/miniconda3 \ + && rm -rf /root/miniconda3/Miniconda3-py312_24.1.2-0-Linux-x86_64.sh \ # Clean build files \ && dnf clean all \ && rm -rf ./khiops \ && true +# set up all the supported Python environments under conda (for the unit tests) +# relying on a variable containing all the versions +ARG PYTHON_VERSIONS +RUN for version in ${PYTHON_VERSIONS}; \ + do /root/miniconda3/bin/conda create -y -n py${version} python=${version}; done + RUN mkdir -p /scripts COPY ./run_service.sh /scripts/run_service.sh RUN chmod +x /scripts/run_service.sh diff --git a/packaging/docker/khiopspydev/Dockerfile.ubuntu b/packaging/docker/khiopspydev/Dockerfile.ubuntu index 00f395cf..493f5f5d 100644 --- a/packaging/docker/khiopspydev/Dockerfile.ubuntu +++ b/packaging/docker/khiopspydev/Dockerfile.ubuntu @@ -5,12 +5,12 @@ FROM ghcr.io/khiopsml/khiops/khiopsdev-${KHIOPSDEV_OS}:latest AS khiopsdev LABEL maintainer="khiops.team@orange.com" LABEL description="Container for the development of khiops-python" -# Install dev tools; build and install Khiops +# Install dev tools and miniconda3 (for the unit tests); build and install Khiops ARG KHIOPS_REVISION RUN true \ # Install git (for khiops-python version calculation) and pip \ && apt-get -y update \ - && apt-get -y --no-install-recommends install git python3-pip zip pandoc \ + && apt-get -y --no-install-recommends install git python3-pip zip pandoc wget \ # Obtain the Khiops sources \ && git clone https://github.com/khiopsml/khiops.git \ && cd khiops \ @@ -24,6 +24,12 @@ RUN true \ && cd .. 
\ # Set python to python3 \ && update-alternatives --install /usr/bin/python python /usr/bin/python3 1 \ + # Install miniconda3 to have multiple Python versions via Conda \ + && mkdir -p /root/miniconda3 && cd /root/miniconda3 \ + && wget https://repo.anaconda.com/miniconda/Miniconda3-py312_24.1.2-0-Linux-x86_64.sh -O ./Miniconda3-py312_24.1.2-0-Linux-x86_64.sh \ + && echo "b978856ec3c826eb495b60e3fffe621f670c101150ebcbdeede4f961f22dc438 Miniconda3-py312_24.1.2-0-Linux-x86_64.sh" | sha256sum --check \ + && bash ./Miniconda3-py312_24.1.2-0-Linux-x86_64.sh -b -u -p /root/miniconda3 \ + && rm -rf /root/miniconda3/Miniconda3-py312_24.1.2-0-Linux-x86_64.sh \ # Make sure that MPI is mpich \ && update-alternatives --set mpirun /usr/bin/mpirun.mpich \ # Clean build files \ @@ -32,6 +38,12 @@ RUN true \ && rm -rf ./khiops \ && true +# set up all the supported Python environments under conda (for the unit tests) +# relying on a variable containing all the versions +ARG PYTHON_VERSIONS +RUN for version in ${PYTHON_VERSIONS}; \ + do /root/miniconda3/bin/conda create -y -n py${version} python=${version}; done + RUN mkdir -p /scripts COPY ./run_service.sh /scripts/run_service.sh RUN chmod +x /scripts/run_service.sh && \ From 3c39334f7b23dcb3cd05482ea69c5611f1f1742c Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 16 Apr 2024 11:25:53 +0200 Subject: [PATCH 14/37] Fix Python 3.8 test failures In Python 3.8, `scipy.sparse.lil_matrix` rows contain arrays of Python lists. If empty row, it contains a single-element array with an empty Python list element; its numpy array data has size 1. This patch tests directly on the size of the row and flattens the rows to ensure a homogeneous treatment. --- khiops/sklearn/tables.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index a20631fe..aa2c44b2 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -9,7 +9,7 @@ import io import warnings from abc import ABC, abstractmethod -from collections.abc import Mapping, Sequence +from collections.abc import Iterable, Mapping, Sequence import numpy as np import pandas as pd @@ -1409,6 +1409,14 @@ def get_khiops_variable_name(self, column_id): variable_name = f"Var{column_id}" return variable_name + def _flatten(self, iterable): + if isinstance(iterable, Iterable): + for iterand in iterable: + if isinstance(iterand, Iterable): + yield from self._flatten(iterand) + else: + yield iterand + def _write_sparse_block(self, row_index, stream, target=None): assert row_index in range( self.matrix.shape[0] @@ -1420,7 +1428,7 @@ def _write_sparse_block(self, row_index, stream, target=None): # Empty row in the sparse matrix: use the first variable as missing data # TODO: remove this part once Khiops bug # https://github.com/KhiopsML/khiops/issues/235 is solved - if row.data.size == 0: + if row.size == 0: for variable_index in self.column_ids: stream.write(f"{variable_index + 1}: ") break @@ -1428,8 +1436,19 @@ def _write_sparse_block(self, row_index, stream, target=None): else: # Variable indices are not always sorted in `row.indices` # Khiops needs variable indices to be sorted - sorted_indices = np.sort(row.indices, axis=-1, kind="mergesort") - sorted_data = row.data[sorted_indices.argsort()] + sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort") + + # Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API + # is not homogeneous with other sparse 
matrices: it stores + # opaque Python lists as elements + # Thus: + # - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then + # row.data is np.array([list([...])]) + # - else, row.data is np.array([...]) + # TODO: remove this flattening once Python 3.8 support is dropped + sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[ + sorted_indices.argsort() + ] for variable_index, variable_value in zip(sorted_indices, sorted_data): stream.write(f"{variable_index + 1}:{variable_value} ") stream.write("\n") From 2bbeb1fca5ca530d80c5446ee21d3a9f04510e10 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Wed, 17 Apr 2024 16:28:28 +0200 Subject: [PATCH 15/37] Print download URL in kh-download-datasets --- khiops/tools.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/khiops/tools.py b/khiops/tools.py index 3880e3e3..58d05e87 100644 --- a/khiops/tools.py +++ b/khiops/tools.py @@ -117,7 +117,7 @@ def kh_download_datasets_entry_point(): def download_datasets( - force_overwrite=False, version="10.1.1", _called_from_shell=False + force_overwrite=False, version="10.2.0", _called_from_shell=False ): """Downloads the Khiops sample datasets for a given version @@ -129,7 +129,7 @@ def download_datasets( ========== force_overwrite : bool, default ``False`` If ``True`` it always overwrites the local samples directory even if it exists. - version : str, default "10.1.1" + version : str, default "10.2.0" The version of the samples datasets. """ # Note: The hidden parameter _called_from_shell is just to change the user messages. @@ -159,6 +159,7 @@ def download_datasets( ) # Download the sample zip file and extract it to the home dataset dir + print(f"Downloading samples from {samples_zip_url}") with tempfile.NamedTemporaryFile() as temp_zip_file, urllib.request.urlopen( samples_zip_url ) as zip_request: From 6336cf7394463b39e56dd1f2ab0674e406403d98 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Wed, 17 Apr 2024 16:47:51 +0200 Subject: [PATCH 16/37] Fix download samples dataset version in CI --- .github/workflows/conda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 6da1f3df..fbcef45c 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -109,7 +109,7 @@ jobs: - name: Download Sample Datasets run: | kh-download-datasets \ - --version ${{ inputs.khiops-core-version || env.DEFAULT_SAMPLES_VERSION }} + --version ${{ inputs.khiops-samples-version || env.DEFAULT_SAMPLES_VERSION }} - name: Test Conda Package Installation on Samples run: | kh-samples core -i train_predictor -e From e83eab1f64f375c30efd1581c2bce0331e517e39 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Mon, 22 Apr 2024 10:20:09 +0200 Subject: [PATCH 17/37] Make samples dir check be executed only on access --- khiops/core/internals/runner.py | 119 +++++++++++++++++--------------- 1 file changed, 62 insertions(+), 57 deletions(-) diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index 83181012..846469e3 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -44,6 +44,55 @@ def _isdir_without_all_perms(dir_path): ) + +def get_dir_status(a_dir): + """Returns the status of a local or remote directory + + For a local directory, a real check is performed.
A remote directory is detected + but not checked. + """ + if fs.is_local_resource(a_dir): + # Remove initial slash on windows systems + # urllib's url2pathname does not work properly + a_dir_res = fs.create_resource(os.path.normpath(a_dir)) + a_dir_path = a_dir_res.uri_info.path + if platform.system() == "Windows": + if a_dir_path.startswith("/"): + a_dir_path = a_dir_path[1:] + + if not os.path.exists(a_dir_path): + status = "non-existent" + elif not os.path.isdir(a_dir_path): + status = "not-a-dir" + else: + status = "ok" + else: + status = "remote-path" + + assert status in ["non-existent", "not-a-dir", "ok", "remote-path"] + return status + + +def check_samples_dir(samples_dir): + # Warn if there are problems with the samples_dir + samples_dir_status = get_dir_status(samples_dir) + download_msg = ( + "Execute the kh-download-datasets script or " + "the khiops.tools.download_datasets function to download them." + ) + if samples_dir_status == "non-existent": + warnings.warn( + "Sample datasets location does not exist " + f"({samples_dir}). {download_msg}", + stacklevel=3, + ) + elif samples_dir_status == "not-a-dir": + warnings.warn( + "Sample datasets location is not a directory " + f"({samples_dir}). {download_msg}", + stacklevel=3, + ) + + def _extract_path_from_uri(uri): res = fs.create_resource(uri) if platform.system() == "Windows": @@ -69,30 +118,6 @@ def _extract_path_from_uri(uri): return path -def _dir_status(a_dir): - """Returns the status of a local or remote directory""" - if fs.is_local_resource(a_dir): - # Remove initial slash on windows systems - # urllib's url2pathname does not work properly - a_dir_res = fs.create_resource(os.path.normpath(a_dir)) - a_dir_path = a_dir_res.uri_info.path - if platform.system() == "Windows": - if a_dir_path.startswith("/"): - a_dir_path = a_dir_path[1:] - - if not os.path.exists(a_dir_path): - status = "non-existent" - elif not os.path.isdir(a_dir_path): - status = "not-a-dir" - else: - status = "ok" - else: - status = "remote-path" - - assert status in ["non-existent", "not-a-dir", "ok", "remote-path"] - return status - - def _get_system_cpu_cores(): """Portably obtains the number of cpu cores (no hyperthreading)""" # Set the cpu info command and arguments for each platform @@ -969,6 +994,7 @@ def __init__(self): self._khiops_bin_dir = None self._khiops_version = None self._samples_dir = None + self._samples_dir_checked = False # Call parent constructor super().__init__() @@ -1013,9 +1039,8 @@ def _start_khiops_environment_initialization(self): else: self.khiops_temp_dir = "" - # Initialize and check the default samples dir + # Initialize the default samples dir self._initialize_default_samples_dir() - self._check_samples_dir() def _initialize_mpi_command_args(self): """Creates the mpiexec call arguments for each platform""" @@ -1188,10 +1213,12 @@ def _initialize_default_samples_dir(self): ) else: public_samples_dir = None - if public_samples_dir is not None and _dir_status(public_samples_dir) in [ - "ok", - "remote", - ]: + + ok_statuses = ["ok", "remote"] + if ( + public_samples_dir is not None + and get_dir_status(public_samples_dir) in ok_statuses + ): self._samples_dir = public_samples_dir else: self._samples_dir = str(home_samples_dir) @@ -1203,32 +1230,6 @@ def _initialize_default_samples_dir(self): assert self._samples_dir is not None - def _check_samples_dir(self, samples_dir=None): - # Check the runners samples_dir if samples_dir is not specified - if samples_dir is None: - samples_dir_to_check = self._samples_dir - else: - 
samples_dir_to_check = samples_dir - - # Warn if there are problems with the samples_dir - samples_dir_status = _dir_status(samples_dir_to_check) - download_msg = ( - "Execute the kh-download-datasets script or " - "the khiops.tools.download_datasets function to download them." - ) - if samples_dir_status == "non-existent": - warnings.warn( - "Sample datasets location does not exist " - f"({samples_dir_to_check}). {download_msg}", - stacklevel=3, - ) - elif samples_dir_status == "not-a-dir": - warnings.warn( - "Sample datasets location is not a directory " - f"({samples_dir_to_check}). {download_msg}", - stacklevel=3, - ) - def _finish_khiops_environment_initialization(self): # Initialize Khiops binary directory self._initialize_khiops_bin_dir() @@ -1428,10 +1429,14 @@ def _tool_path(self, tool_name): def _set_samples_dir(self, samples_dir): """Checks and sets the samples directory""" - self._check_samples_dir(samples_dir) + check_samples_dir(samples_dir) super()._set_samples_dir(samples_dir) def _get_samples_dir(self): + # Check the samples dir once (the check emits only warnings) + if not self._samples_dir_checked: + check_samples_dir(self._samples_dir) + self._samples_dir_checked = True return self._samples_dir def raw_run(self, tool_name, command_line_args=None, use_mpi=True, trace=False): From a1ba662d8dce1694938ab7e24b08f040303da489 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Mon, 29 Apr 2024 15:08:57 +0200 Subject: [PATCH 18/37] Add sample datasets check to the release checklist --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5e639bdb..98e43bcf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -261,6 +261,7 @@ Checklist: - Update the API Docs if necessary - Update `CHANGELOG.md` - Update the default `khiops-core` version in [.github/workflows/conda.yml] + - Update the default value for `version` in the `download_datasets` function in [khiops/tools.py] - Git manipulations - Update your local repo and save your work: - `git stash # if necessary` From ec8a3b255982de42001e8a14c86bc3a973b5dcd9 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Thu, 6 Jun 2024 09:59:43 +0200 Subject: [PATCH 19/37] Simplify error reporting Also: Modify an integration test to restore the runner's state when there are unexpected errors. --- khiops/core/internals/runner.py | 114 +++++++++++++----------------- tests/test_khiops_integrations.py | 70 ++++++++++-------- 2 files changed, 89 insertions(+), 95 deletions(-) diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index 846469e3..9f1bba75 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -741,74 +741,60 @@ def _report_exit_status( ): """Reports the exit status of a Khiops execution""" # Note: - # We report stdout and stderr in both branches below because we use a log file - # and thus normally Khiops doesn't write anything to these streams. In - # practice MPI and the remote filesystems plugins may write to them to report - # anomalies.
- - # If the execution was correct, warn and report: - # - the stdout if it was not empty - # - the stderr if it was not empty - # - any warnings found in the log - if return_code == 0: - # Add Khiops log warnings to the warning message if any - warning_msg = "" - _, _, warning_messages = self._collect_errors(log_file_path) - if warning_messages: - warning_msg += "Warnings in log:\n" + "".join(warning_messages) - - # Add stdout to the warning message if non empty - if stdout: - if warning_msg: - warning_msg += "\n" - warning_msg += f"Contents of stdout:\n{stdout}" - - # Add stderr to the warning message if non empty - if stderr: - if warning_msg: - warning_msg += "\n" - warning_msg += f"Contents of stderr:\n{stderr}" - - # Report the message if there were any - if warning_msg: - warning_msg = ( - "Khiops ended correctly but there were minor issues: " + warning_msg - ) - warnings.warn(warning_msg.rstrip(), stacklevel=4) - # If the execution was incorrect raise an exception reporting: + # We report stdout and stderr below because we use a log file and thus + # normally Khiops doesn't write anything to these streams. In practice MPI and + # the remote filesystems plugins may write to them to report anomalies. + + # Report messages: # - The warnings in the log # - The errors and/or fatal errors in the log # - The stdout if not empty # - The stderr if not empty - else: - # Collect errors and warnings - errors, fatal_errors, warning_messages = self._collect_errors(log_file_path) - - # Create the message reporting the errors - error_msg = "" - if warning_messages: - error_msg += "Warnings in log:\n" + "".join(warning_messages) - if errors: - if error_msg: - error_msg += "\n" - error_msg += "Errors in log:\n" + "".join(errors) - if fatal_errors: - if error_msg: - error_msg += "\n" - error_msg += "Fatal errors in log:\n" + "".join(fatal_errors) - if stdout: - if error_msg: - error_msg += "\n" - error_msg += f"Contents of stdout:\n{stdout}" - if stderr: - if error_msg: - error_msg += "\n" - error_msg += f"Contents of stderr:\n{stderr}" - - # Raise an exception with the errors - raise KhiopsRuntimeError( - f"{tool_name} ended with return code {return_code}\n{error_msg}" - ) + # + # If there were any errors (fatal or not) or the return code is non-zero the + # reporting is via an exception. Otherwise we show the message as a warning. 
+ # + + # Create the message reporting the errors and warnings + error_msg = "" + errors, fatal_errors, warning_messages = self._collect_errors(log_file_path) + if warning_messages: + error_msg += "Warnings in log:\n" + "".join(warning_messages) + if errors: + if error_msg: + error_msg += "\n" + error_msg += "Errors in log:\n" + "".join(errors) + if fatal_errors: + if error_msg: + error_msg += "\n" + error_msg += "Fatal errors in log:\n" + "".join(fatal_errors) + + # Add stdout to the warning message if non empty + if stdout: + if error_msg: + error_msg += "\n" + error_msg += f"Contents of stdout:\n{stdout}" + + # Add stderr to the warning message if non empty + if stderr: + if error_msg: + error_msg += "\n" + error_msg += f"Contents of stderr:\n{stderr}" + + # Report the message to the user if there were any + if error_msg: + # Raise an exception if there were errors + if errors or fatal_errors or return_code != 0: + raise KhiopsRuntimeError( + f"{tool_name} execution had errors (return code {return_code}):\n" + f"{error_msg}" + ) + # Otherwise show the message as a warning + else: + error_msg = ( + f"Khiops ended correctly but there were minor issues:\n{error_msg}" + ) + warnings.warn(error_msg.rstrip()) def _collect_errors(self, log_file_path): # Collect errors any errors found in the log diff --git a/tests/test_khiops_integrations.py b/tests/test_khiops_integrations.py index abc5d404..20d62094 100644 --- a/tests/test_khiops_integrations.py +++ b/tests/test_khiops_integrations.py @@ -138,38 +138,46 @@ def test_runner_with_custom_khiops_binary_directory(self): # Get default runner default_runner = kh.get_runner() - # Create a fresh local runner and initialize its default Khiops binary dir - runner = KhiopsLocalRunner() - runner._initialize_khiops_bin_dir() - - # Get runner's default Khiops binary directory - default_bin_dir = runner.khiops_bin_dir + # Test in a try block to restore the runner if there are unexpected errors + try: + # Create a fresh local runner and initialize its default Khiops binary dir + runner = KhiopsLocalRunner() + runner._initialize_khiops_bin_dir() + + # Get runner's default Khiops binary directory + default_bin_dir = runner.khiops_bin_dir + + # Create temporary directory + with tempfile.TemporaryDirectory() as tmp_khiops_bin_dir: + # Copy Khiops binaries into the temporary directory + for binary_file in os.listdir(default_bin_dir): + if binary_file.startswith("MODL"): + shutil.copy( + os.path.join(default_bin_dir, binary_file), + os.path.join(tmp_khiops_bin_dir, binary_file), + ) + + # Change runner's Khiops binary directory to the temporary directory + runner.khiops_bin_dir = tmp_khiops_bin_dir + + # Set current runner to the fresh runner + kh.set_runner(runner) + + # Test the core API works + # Call check_database (could be any other method) + with self.assertRaises(kh.KhiopsRuntimeError) as cm: + kh.check_database("a.kdic", "dict_name", "data.txt") + + # Test that MODL executable can be found and launched + # Note: The return code is not specified to support older khiops + # versions that returned 2 instead of 0 in this case. 
+ self.assertIn( + "khiops execution had errors (return code ", str(cm.exception) + ) - # Create temporary directory - with tempfile.TemporaryDirectory() as tmp_khiops_bin_dir: - # Copy Khiops binaries into the temporary directory - for binary_file in os.listdir(default_bin_dir): - if binary_file.startswith("MODL"): - shutil.copy( - os.path.join(default_bin_dir, binary_file), - os.path.join(tmp_khiops_bin_dir, binary_file), - ) - - # Change runner's Khiops binary directory to the temporary directory - runner.khiops_bin_dir = tmp_khiops_bin_dir - - # Set current runner to the fresh runner - kh.set_runner(runner) - - # Test the core API works - # Call check_database (could be any other method) - with self.assertRaises(kh.KhiopsRuntimeError) as cm: - kh.check_database("a.kdic", "dict_name", "data.txt") - # Test that MODL executable can be found and launched - self.assertIn("khiops ended with return code 2", str(cm.exception)) - - # Set current runner to the default runner - kh.set_runner(default_runner) + # Always set back to the default runner + finally: + kh.set_runner(default_runner) class KhiopsMultitableFitTests(unittest.TestCase): From 63aebdf3ca897da2fa0af4b2a13952e913fc5c08 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:48:21 +0200 Subject: [PATCH 20/37] Admit periods in pre-release version token --- khiops/core/internals/version.py | 23 ++++++++++++++++------- tests/test_core.py | 8 +++++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/khiops/core/internals/version.py b/khiops/core/internals/version.py index 59843701..e4303762 100644 --- a/khiops/core/internals/version.py +++ b/khiops/core/internals/version.py @@ -36,12 +36,12 @@ def __init__(self, version_str): self._version_str = version_str # Remove the "v" prefix if present - raw_parts = re.sub("^v", "", self._version_str).split(".") + raw_parts = re.sub("^v", "", self._version_str).split(".", maxsplit=2) # Check the Khiops version format: MAJOR.MINOR.PATCH[-PRE_RELEASE] - if len(raw_parts) != 3: + if len(raw_parts) < 3: self._raise_init_error( - "Version must have the format " "MAJOR.MINOR.PATCH[-PRE_RELEASE]", + "Version must have the format MAJOR.MINOR.PATCH[-PRE_RELEASE]", version_str, ) self._major, self._minor, patch_and_pre_release = raw_parts @@ -73,6 +73,11 @@ def __init__(self, version_str): "PATCH-PRE_RELEASE version part must contain a single '-'", version_str, ) + if patch_and_pre_release.count(".") > 1: + self._raise_init_error( + "PATCH-PRE_RELEASE version part must contain at most a single '.'", + version_str, + ) self._patch, _pre_release = patch_and_pre_release.split("-") # Store only the patch version part if there are only digits @@ -93,7 +98,11 @@ def __init__(self, version_str): ) # Store the rest of the prelease (if any) and check it is a number - self._pre_release_increment = _pre_release.replace(self._pre_release_id, "") + # We accept not having a "." in the pre-release increment for backward + # compatibility. + self._pre_release_increment = _pre_release.replace( + self._pre_release_id, "" + ).replace(".", "") if _is_simple_number(self._pre_release_increment): self._pre_release_increment = int(self._pre_release_increment) else: @@ -102,7 +111,7 @@ def __init__(self, version_str): ) def _raise_init_error(self, msg, version_str): - raise ValueError(f"{msg}. Version string: {version_str}.") + raise ValueError(f"{msg}. 
Version string: '{version_str}'.") @property def major(self): @@ -123,12 +132,12 @@ def patch(self): def pre_release(self): """str : The version's pre-release tag - Returns: either 'a', 'b' or 'rc' followed by a number or None. + Returns: either 'a', 'b' or 'rc' followed by '.' and a number or None. """ if self._pre_release_id is None: return None else: - return f"{self._pre_release_id}{self._pre_release_increment}" + return f"{self._pre_release_id}.{self._pre_release_increment}" def __repr__(self): return self._version_str diff --git a/tests/test_core.py b/tests/test_core.py index 92ded09c..1e6a9e13 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2160,6 +2160,7 @@ def test_version_comparisons(self): "9.0.1", "9.5.1-a1", "9.5.1-a2", + "9.5.1-a.3", "9.5.1", "10.0.0", "10.0.1", @@ -2193,11 +2194,16 @@ def test_version_comparisons(self): def test_invalid_versions(self): """Test invalid versions""" for version in [ + "a.b.c-4", + "...", + ".0.4", "ver10.0.0", "10", "10.0", - "10i.4.0", + "10.4.0-5.4," "10i.4.0", "10.4b.3", + "10.4.1-b..2", + "10.4.1.-b.", "10.2.@", "10.@.2", "10.1.2b", From aea667fbe88a097b9223b24cf561d5c38979188e Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:03:02 +0200 Subject: [PATCH 21/37] Update Docker dev containers - use OpenMPI on native installations - get Khiops binaries either as native packages or as Conda packages related_to #183 --- packaging/docker/khiopspydev/Dockerfile.rocky | 36 ++++++++++--------- .../docker/khiopspydev/Dockerfile.ubuntu | 32 +++++++++-------- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/packaging/docker/khiopspydev/Dockerfile.rocky b/packaging/docker/khiopspydev/Dockerfile.rocky index 7eef216e..6001b806 100644 --- a/packaging/docker/khiopspydev/Dockerfile.rocky +++ b/packaging/docker/khiopspydev/Dockerfile.rocky @@ -31,23 +31,21 @@ RUN true \ python3-setuptools \ python3-pip ; \ fi \ - # Obtain the Khiops sources \ - && git clone https://github.com/khiopsml/khiops.git \ - && cd khiops \ - && git checkout ${KHIOPS_REVISION} \ - # Make sure that MPI is mpich \ + # Get Linux distribution codename \ + && if [ -f /etc/os-release ]; then . /etc/os-release; fi \ + && IFS='.' read -ra VERSION <<< "$VERSION_ID" \ + && ROCKY_VERSION=${VERSION[0]} \ + # Obtain the Khiops native package \ + && KHIOPS_PKG_FILE=$KHIOPS_REVISION/khiops-core-openmpi-$(echo ${KHIOPS_REVISION} | tr '-' '_')-1.el$ROCKY_VERSION.x86_64.rpm \ + && wget -O KHIOPS_CORE.rpm "https://github.com/KhiopsML/khiops/releases/download/${KHIOPS_PKG_FILE}" \ + # Install the Khiops native package \ + && dnf install KHIOPS_CORE.rpm -y \ + && rm -f KHIOPS_CORE.rpm \ + # Make sure that MPI is openmpi \ && source /etc/profile.d/modules.sh \ && module unload mpi \ - # Hard-code MPICH module name \ - && module load mpi/mpich-x86_64 \ - # Build Khiops \ - # Note: We build the JARs and KNI because the `cmake --install` command below doesn't work \ - && cmake --preset linux-gcc-release -DTESTING=OFF -DBUILD_JARS=ON -DCMAKE_INSTALL_PREFIX= \ - && cmake --build --preset linux-gcc-release --parallel \ - --target MODL${MPI_SUFFIX} MODL_Coclustering${MPI_SUFFIX} \ - KhiopsNativeInterface norm_jar khiops_jar \ - && cmake --install ./build/linux-gcc-release \ - && cd .. 
\ + # Hard-code OpenMPI module name \ + && module load mpi/openmpi-x86_64 \ # Set python to python3.11 and pip to Pip 3.11 on Rocky 8 \ # Set python to python3 on Rocky 9 \ && if [ "$KHIOPSDEV_OS" = "rocky8" ]; then \ @@ -71,8 +69,12 @@ RUN true \ # set up all the supported Python environments under conda (for the unit tests) # relying on a variable containing all the versions ARG PYTHON_VERSIONS -RUN for version in ${PYTHON_VERSIONS}; \ - do /root/miniconda3/bin/conda create -y -n py${version} python=${version}; done +RUN /bin/bash -c 'for version in ${PYTHON_VERSIONS}; \ +do \ + /root/miniconda3/bin/conda create -y -n py${version} python=${version}; \ + /root/miniconda3/bin/conda create -y -n py${version}_conda python=${version}; \ + /root/miniconda3/bin/conda install -y -n py${version}_conda -c conda-forge -c khiops-dev khiops-core=$(echo ${KHIOPS_REVISION} | tr -d "-") ; \ +done' RUN mkdir -p /scripts COPY ./run_service.sh /scripts/run_service.sh diff --git a/packaging/docker/khiopspydev/Dockerfile.ubuntu b/packaging/docker/khiopspydev/Dockerfile.ubuntu index 493f5f5d..81f4556f 100644 --- a/packaging/docker/khiopspydev/Dockerfile.ubuntu +++ b/packaging/docker/khiopspydev/Dockerfile.ubuntu @@ -11,17 +11,15 @@ RUN true \ # Install git (for khiops-python version calculation) and pip \ && apt-get -y update \ && apt-get -y --no-install-recommends install git python3-pip zip pandoc wget \ - # Obtain the Khiops sources \ - && git clone https://github.com/khiopsml/khiops.git \ - && cd khiops \ - && git checkout ${KHIOPS_REVISION} \ - # Build and install khiops \ - # Note: We build the JARs and KNI because the `cmake --install` command below doesn't work \ - && cmake --preset linux-gcc-release -DTESTING=OFF -DBUILD_JARS=ON -DCMAKE_INSTALL_PREFIX= \ - && cmake --build --preset linux-gcc-release --parallel \ - --target MODL MODL_Coclustering KhiopsNativeInterface norm_jar khiops_jar \ - && cmake --install ./build/linux-gcc-release \ - && cd .. \ + # Get Linux distribution codename \ + && if [ -f /etc/os-release ]; then . 
/etc/os-release; fi \ + # Obtain the Khiops native package \ + && KHIOPS_PKG_FILE=$KHIOPS_REVISION/khiops-core-openmpi_$KHIOPS_REVISION-1-$VERSION_CODENAME.amd64.deb \ + && wget -O KHIOPS_CORE.deb "https://github.com/KhiopsML/khiops/releases/download/${KHIOPS_PKG_FILE}" \ + # Install the Khiops native package \ + && dpkg -i --force-all KHIOPS_CORE.deb \ + && apt-get -f -y install \ + && rm -f KHIOPS_CORE.deb \ # Set python to python3 \ && update-alternatives --install /usr/bin/python python /usr/bin/python3 1 \ # Install miniconda3 to have multiple Python versions via Conda \ @@ -30,8 +28,8 @@ RUN true \ && echo "b978856ec3c826eb495b60e3fffe621f670c101150ebcbdeede4f961f22dc438 Miniconda3-py312_24.1.2-0-Linux-x86_64.sh" | sha256sum --check \ && bash ./Miniconda3-py312_24.1.2-0-Linux-x86_64.sh -b -u -p /root/miniconda3 \ && rm -rf /root/miniconda3/Miniconda3-py312_24.1.2-0-Linux-x86_64.sh \ - # Make sure that MPI is mpich \ - && update-alternatives --set mpirun /usr/bin/mpirun.mpich \ + # Make sure that MPI is openmpi \ + && update-alternatives --set mpirun /usr/bin/mpirun.openmpi \ # Clean build files \ && rm -fr /var/lib/apt/lists/* \ && apt-get clean \ @@ -41,8 +39,12 @@ RUN true \ # set up all the supported Python environments under conda (for the unit tests) # relying on a variable containing all the versions ARG PYTHON_VERSIONS -RUN for version in ${PYTHON_VERSIONS}; \ - do /root/miniconda3/bin/conda create -y -n py${version} python=${version}; done +RUN /bin/bash -c 'for version in ${PYTHON_VERSIONS}; \ +do \ + /root/miniconda3/bin/conda create -y -n py${version} python=${version}; \ + /root/miniconda3/bin/conda create -y -n py${version}_conda python=${version}; \ + /root/miniconda3/bin/conda install -y -n py${version}_conda -c conda-forge -c khiops-dev khiops-core=$(echo ${KHIOPS_REVISION} | tr -d "-") ; \ +done' RUN mkdir -p /scripts COPY ./run_service.sh /scripts/run_service.sh From f4fdcba0b02e3602a9a3517539b07d7f468f6149 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:05:24 +0200 Subject: [PATCH 22/37] Update Khiops binaries package version to 10.2.2b3 in the CI The update is also propagated to the `khiops-core` Conda dependency to facilitate Conda package manufacturing and testing workflow. related_to #183 --- .github/workflows/conda.yml | 6 +++--- .github/workflows/dev-docker.yml | 2 +- packaging/conda/meta.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index fbcef45c..67f78266 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -4,12 +4,12 @@ env: DEFAULT_SAMPLES_VERSION: 10.2.0 # Note: The default Khiops version must never be an alpha release as they are # ephemeral. To test alpha versions run the workflow manually. - DEFAULT_KHIOPS_CORE_VERSION: 10.2.1 + DEFAULT_KHIOPS_CORE_VERSION: 10.2.2b.3 on: workflow_dispatch: inputs: khiops-core-version: - default: 10.2.1 + default: 10.2.2b.3 description: khiops-core version for testing khiops-samples-version: default: 10.2.0 @@ -45,7 +45,7 @@ jobs: miniconda-version: latest python-version: '3.12' - name: Install Dependency Requirements for Building Conda Packages - run: conda install conda-build + run: conda install -y conda-build - name: Build the Conda Package # Note: The "khiops-dev" conda channel is needed to retrieve the "khiops-core" package. # The "test" part of the conda recipe needs this package. 
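
Note: the khiops-core version token above differs between ecosystems on purpose. The upstream Git tag and the native packages put a '-' before the pre-release part (10.2.2-b.3), while a Conda version cannot contain '-' (see the meta.yaml comment in PATCH 34), hence 10.2.2b.3 here. A minimal Python sketch of the correspondence, assuming the KhiopsVersion parsing rules introduced in PATCH 20 and the import path shown in that diff:

    from khiops.core.internals.version import KhiopsVersion

    git_tag = "10.2.2-b.3"                 # Git tag / native package form
    conda_form = git_tag.replace("-", "")  # -> "10.2.2b.3", Conda forbids '-'

    version = KhiopsVersion(git_tag)
    print(version.major, version.minor, version.patch)  # 10 2 2
    print(version.pre_release)                          # b.3
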
diff --git a/.github/workflows/dev-docker.yml b/.github/workflows/dev-docker.yml index 522662fd..8c17d4a3 100644 --- a/.github/workflows/dev-docker.yml +++ b/.github/workflows/dev-docker.yml @@ -1,7 +1,7 @@ --- name: Dev Docker env: - DEFAULT_KHIOPS_REVISION: main + DEFAULT_KHIOPS_REVISION: 10.2.2-b.3 DEFAULT_SERVER_REVISION: main DEFAULT_PYTHON_VERSIONS: 3.8 3.9 3.10 3.11 3.12 on: diff --git a/packaging/conda/meta.yaml b/packaging/conda/meta.yaml index 0e62f11e..40050dbb 100644 --- a/packaging/conda/meta.yaml +++ b/packaging/conda/meta.yaml @@ -25,7 +25,7 @@ requirements: - python run: - python - - khiops-core >=10.0.0,<11.0.0 + - khiops-core >=10.2.2b.3,<11.0.0 - pandas >=0.25.3 - scikit-learn >=0.22.2 run_constrained: From d67e0dc02763c74a1156585b4b13debcea87e5e8 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 12 Jun 2024 19:26:52 +0200 Subject: [PATCH 23/37] Also test on the `khiops-core`-based Conda environments These Conda environments use the `khiops-core` Conda package instead of the native Khiops packages. related_to #183 --- .github/workflows/unit-tests.yml | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 9254cd82..555a45ed 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -60,9 +60,14 @@ jobs: if: success() || failure() run: | mkdir -p -m u+rwx reports/py${{ matrix.python-version }} - # install within the conda environment without activating it + mkdir -p -m u+rwx reports/py${{ matrix.python-version }}_conda + # install within the conda environments without activating them + # Native Khiops-based Conda environment /root/miniconda3/bin/conda install -y -n py${{ matrix.python-version }} -c conda-forge unittest-xml-reporting /root/miniconda3/bin/conda install -y -n py${{ matrix.python-version }} --file test-requirements.txt + # `khiops-core`-based Conda environment + /root/miniconda3/bin/conda install -y -n py${{ matrix.python-version }}_conda -c conda-forge unittest-xml-reporting + /root/miniconda3/bin/conda install -y -n py${{ matrix.python-version }}_conda --file test-requirements.txt - name: Install khiops-python dependencies if: success() || failure() run: | @@ -70,8 +75,12 @@ jobs: # as the Git repository is in a directory the current user does not own, # Python versioneer fails to compute the current version correctly otherwise git config --global --add safe.directory $(realpath .) + # Native Khiops-based Conda environment /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }} python setup.py egg_info /root/miniconda3/bin/conda install -y -n py${{ matrix.python-version }} `grep -v "^\[" khiops.egg-info/requires.txt` + # `khiops-core`-based Conda environment + /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }}_conda python setup.py egg_info + /root/miniconda3/bin/conda install -y -n py${{ matrix.python-version }}_conda `grep -v "^\[" khiops.egg-info/requires.txt` rm -rf khiops.egg-info - name: Prepare Unit Tests Environment if: github.ref != 'dev' && github.ref != 'main' && ! inputs.run-long-tests @@ -86,14 +95,21 @@ jobs: # This is needed so that the Git tag is parsed and the khiops-python # version is retrieved git config --global --add safe.directory $(realpath .) 
+          # Native Khiops-based Conda environments
           /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }} coverage run -m xmlrunner -o "reports/py${{ matrix.python-version }}" -v
           /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }} coverage report -m
           /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }} coverage xml -o "reports/py${{ matrix.python-version }}/py-coverage.xml"
+          # `khiops-core`-based Conda environments
+          /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }}_conda coverage run -m xmlrunner -o "reports/py${{ matrix.python-version }}_conda" -v
+          /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }}_conda coverage report -m
+          /root/miniconda3/bin/conda run --no-capture-output -n py${{ matrix.python-version }}_conda coverage xml -o "reports/py${{ matrix.python-version }}_conda/py-coverage.xml"
       - name: Display Unit Test Reports
         uses: dorny/test-reporter@v1
         with:
           name: Unit Tests ${{ matrix.python-version }}
-          path: reports/py${{ matrix.python-version }}/TEST-tests.*.*.xml
+          path: >-
+            reports/py${{ matrix.python-version }}/TEST-tests.*.*.xml,
+            reports/py${{ matrix.python-version }}_conda/TEST-tests.*.*.xml
           reporter: java-junit
           path-replace-backslashes: 'true'  # Necessary for windows paths
       - name: Upload Test Reports as Artifacts
@@ -103,6 +119,8 @@
           path: |-
             reports/py${{ matrix.python-version }}/TEST-tests.*.*.xml
             reports/py${{ matrix.python-version }}/py-coverage.xml
+            reports/py${{ matrix.python-version }}_conda/TEST-tests.*.*.xml
+            reports/py${{ matrix.python-version }}_conda/py-coverage.xml
             tests/resources/scenario_generation/*/ref/*._kh
             tests/resources/scenario_generation/*/output/*._kh
             tests/resources/*/output_reports/*.txt

From 4ecfdb2697eca9d22427b904b5d48e0d520692c1 Mon Sep 17 00:00:00 2001
From: Popescu V <136721202+popescu-v@users.noreply.github.com>
Date: Tue, 18 Jun 2024 16:14:13 +0200
Subject: [PATCH 24/37] Check inferred Conda env binary dir is really inside
 the Conda env

---
 khiops/core/internals/runner.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py
index 9f1bba75..14eadf4a 100644
--- a/khiops/core/internals/runner.py
+++ b/khiops/core/internals/runner.py
@@ -230,6 +230,33 @@ def _infer_env_bin_dir_for_conda_based_installations():
     return env_bin_dir
 
 
+def _check_conda_env_bin_dir(conda_env_bin_dir):
+    """Check that the inferred Conda environment binary directory really is one
+
+    A real Conda environment binary directory:
+    - should exist
+    - should not be directly under the root directory
+    - should coexist with the `conda-meta` directory under the same parent
+    """
+    conda_env_bin_dir_path = Path(conda_env_bin_dir)
+
+    # Conda env bin dir should end with `/bin`
+    assert conda_env_bin_dir_path.parts[-1] == "bin"
+
+    is_conda_env_bin_dir = False
+
+    # Conda env dir is not equal to its root dir
+    # Conda env bin dir exists, along with the `conda-meta` dir
+    conda_env_dir_path = conda_env_bin_dir_path.parent
+    if (
+        conda_env_dir_path != Path(conda_env_dir_path.root)
+        and conda_env_bin_dir_path.is_dir()
+        and conda_env_dir_path.joinpath("conda-meta").is_dir()
+    ):
+        is_conda_env_bin_dir = True
+    return is_conda_env_bin_dir
+
+
 def 
_infer_khiops_installation_method(trace=False): env_bin_dir = _infer_env_bin_dir_for_conda_based_installations() if trace: print(f"Environment binary dir: '{env_bin_dir}'") - if _modl_and_mpiexec_executables_exist(env_bin_dir): + if _check_conda_env_bin_dir( + env_bin_dir + ) and _modl_and_mpiexec_executables_exist(env_bin_dir): installation_method = "conda-based" else: installation_method = "binary+pip" From 87493a2fd8faf5c5c66001461d7d121447c42c8b Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 18 Jun 2024 16:17:08 +0200 Subject: [PATCH 25/37] Use OpenMPI on Linux system-wide installs - use KHIOPS_PROC_NUMBER or the number of system cores directly in the mpiexec command; thus, oversubscription is avoided for OpenMPI - do not use MPI for <= 2 CPUs (as one is master anyway) - reset mpiexec command according to `max_core` change in the `KhiopsLocalRunner` closes #183 --- .github/workflows/unit-tests.yml | 5 +- khiops/core/internals/runner.py | 122 +++++++++++++++++------------- tests/test_core.py | 24 +++++- tests/test_khiops_integrations.py | 44 +++++------ 4 files changed, 116 insertions(+), 79 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 555a45ed..29bad24f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -91,6 +91,9 @@ jobs: KHIOPS_DOCKER_RUNNER_URL: https://localhost:11000 KHIOPS_DOCKER_RUNNER_SHARED_DIR: /tmp/sandbox KHIOPS_RUNNER_SERVICE_PATH: /scripts/run_service.sh + # This is needed so that OpenMPI's mpiexec can be run as root + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 run: | # This is needed so that the Git tag is parsed and the khiops-python # version is retrieved @@ -170,5 +173,5 @@ jobs: # Make sure MPI support is not loaded through env modules # Note: As Docker container's shell is non-interactive, environment # modules are currently not initializing the shell anyway - if [[ -n "$MODULESHOME" ]]; then module unload mpi; fi + if [ -n "$MODULESHOME" ]; then module unload mpi; fi python -m unittest -v tests.test_khiops_integrations.KhiopsRunnerEnvironmentTests.test_runner_has_mpiexec_on_linux diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index 14eadf4a..e4417c38 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -181,12 +181,9 @@ def _compute_max_cores_from_proc_number(proc_number): # if KHIOPS_PROC_NUMBER is 0 we set max_cores to the system's core number if proc_number == 0: max_cores = _get_system_cpu_cores() - # if KHIOPS_PROC_NUMBER is 1 we just set max_cores to 1 (no MPI) - elif proc_number == 1: - max_cores = 1 - # Otherwise we set max_cores to KHIOPS_PROC_NUMBER - 1 + # Otherwise we set max_cores to KHIOPS_PROC_NUMBER else: - max_cores = proc_number - 1 + max_cores = proc_number return max_cores @@ -472,6 +469,9 @@ def max_cores(self): @max_cores.setter def max_cores(self, core_number): + self._set_max_cores(core_number) + + def _set_max_cores(self, core_number): self.general_options.max_cores = core_number self.general_options.check() @@ -1017,16 +1017,20 @@ def __init__(self): # Initialize Khiops environment self._start_khiops_environment_initialization() + def _set_max_cores(self, core_number): + super()._set_max_cores(core_number) + self._initialize_mpi_command_args() + def _start_khiops_environment_initialization(self): # Set the Khiops process number according to the `KHIOPS_PROC_NUMBER` env var if "KHIOPS_PROC_NUMBER" in os.environ: 
            self.max_cores = _compute_max_cores_from_proc_number(
                 int(os.environ["KHIOPS_PROC_NUMBER"])
             )
-        # If not defined, set it to the number of system cores + 1
+        # If not defined, set it to the number of system cores
         else:
             self.max_cores = _get_system_cpu_cores()
-            os.environ["KHIOPS_PROC_NUMBER"] = str(self.max_cores + 1)
+            os.environ["KHIOPS_PROC_NUMBER"] = str(self.max_cores)
 
         # Set the Khiops memory limit
         if "KHIOPS_MEMORY_LIMIT" in os.environ:
@@ -1054,6 +1058,16 @@ def _start_khiops_environment_initialization(self):
         else:
             self.khiops_temp_dir = ""
 
+        # Set the OpenMPI variable OMPI_MCA_plm_rsh_agent to the empty string if not set
+        # This avoids errors on systems without ssh (eg. simple Docker containers)
+        installation_method = _infer_khiops_installation_method()
+        if (
+            platform.system() == "Linux"
+            and installation_method == "binary+pip"
+            and "OMPI_MCA_plm_rsh_agent" not in os.environ
+        ):
+            os.environ["OMPI_MCA_plm_rsh_agent"] = ""
+
         # Initialize the default samples dir
         self._initialize_default_samples_dir()
 
@@ -1154,60 +1168,61 @@ def _initialize_mpi_command_args(self):
             self._set_mpi_command_args_with_mpiexec(mpiexec_path)
         # If MPI is still not found, then do not use MPI and warn the user
         else:
-            self._set_empty_mpi_command_args_and_raise_warning()
-
-    def _set_empty_mpi_command_args_and_raise_warning(self):
-        self.mpi_command_args = []
-        warnings.warn(
-            "mpiexec is not in PATH, Khiops will run with just one CPU. "
-            "We recommend you to reinstall khiops. "
-            "Go to https://khiops.org for more information."
-        )
+            self.mpi_command_args = []
+            warnings.warn(
+                "mpiexec is not in PATH, Khiops will run with just one CPU. "
+                "We recommend you to reinstall khiops. "
+                "Go to https://khiops.org for more information."
+            )
 
     def _set_mpi_command_args_with_mpiexec(self, mpiexec_path):
-        self.mpi_command_args = [mpiexec_path]
-        mpi_command_args = os.environ.get("KHIOPS_MPI_COMMAND_ARGS")
-        if mpi_command_args is not None:
-            self.mpi_command_args += shlex.split(mpi_command_args)
-        elif platform.system() == "Linux":
-            self.mpi_command_args += [
-                "-bind-to",
-                "hwthread",
-                "-map-by",
-                "core",
-                "-n",
-                str(self.max_cores + 1),
-            ]
-        elif platform.system() == "Darwin":
-            # Note: The '-host localhost' arguments for arm64
-            # may be removed when mpich > 4.1.2 is released
-            if platform.processor() == "arm":
+        assert mpiexec_path is not None
+        # User-specified MPI command args take precedence over automatic setting
+        if "KHIOPS_MPI_COMMAND_ARGS" in os.environ:
+            self.mpi_command_args = [mpiexec_path] + shlex.split(
+                os.environ["KHIOPS_MPI_COMMAND_ARGS"]
+            )
+        # With only 1 or 2 processes run sequentially (without MPI)
+        elif self.max_cores in (1, 2):
+            self.mpi_command_args = []
+            warnings.warn(
+                f"Too few cores: {self.max_cores}. "
+                "To efficiently run Khiops in parallel at least 3 processes "
+                "are needed. Khiops will run in a single process." 
+ ) + # Otherwise, build the mpiexec command arguments + else: + self.mpi_command_args = [mpiexec_path] + if platform.system() == "Windows": + self.mpi_command_args += [ + "-al", + "spr:P", + "-n", + str(self.max_cores), + "/priority", + "1", + ] + elif platform.system() == "Linux": + self.mpi_command_args += [ + "-bind-to", + "hwthread", + "-map-by", + "core", + "-n", + str(self.max_cores), + ] + elif platform.system() == "Darwin": self.mpi_command_args += [ "-host", "localhost", "-n", - str(self.max_cores + 1), + str(self.max_cores), ] else: - self.mpi_command_args = [ - mpiexec_path, - "-n", - str(self.max_cores + 1), - ] - elif platform.system() == "Windows": - self.mpi_command_args += [ - "-al", - "spr:P", - "-n", - str(self.max_cores + 1), - "/priority", - "1", - ] - else: - raise KhiopsEnvironmentError( - f"Unsupported OS {platform.system()}. " - "Check the supported OSes at https://khiops.org." - ) + raise KhiopsEnvironmentError( + f"Unsupported OS {platform.system()}. " + "Check the supported OSes at https://khiops.org." + ) def _initialize_default_samples_dir(self): """See class docstring""" @@ -1268,6 +1283,7 @@ def _initialize_khiops_bin_dir(self): # System-wide installations else: self._initialize_default_system_wide_khiops_bin_dir() + assert self.khiops_bin_dir is not None def _initialize_default_system_wide_khiops_bin_dir(self): # Warn if both KHIOPS_HOME and KhiopsHome are set diff --git a/tests/test_core.py b/tests/test_core.py index 1e6a9e13..a397d599 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2599,7 +2599,7 @@ def test_khiops_environment_variables_basic(self): "variable": "KHIOPS_PROC_NUMBER", "value": 2, "runner_field": "max_cores", - "expected_field_value": 1, + "expected_field_value": 2, }, { "variable": "KHIOPS_PROC_NUMBER", @@ -2661,6 +2661,24 @@ def test_khiops_environment_variables_basic(self): else: os.environ[fixture["variable"]] = old_value + def test_mpi_command_is_updated_on_max_cores_update(self): + """Test MPI command is updated on max_cores update""" + # Create a fresh runner and initialize its env + with MockedRunnerContext(create_mocked_raw_run(False, False, 0)) as runner: + pass + + # Update max_cores + max_cores_updated_value = 100 + runner.max_cores = max_cores_updated_value + + # Check MPI command arguments contain the updated max_cores + # The number of cores in the MPI command is the value after '-n' + mpi_command_args = runner.mpi_command_args + max_cores_in_mpi_command = int( + mpi_command_args[mpi_command_args.index("-n") + 1] + ) + self.assertEqual(max_cores_in_mpi_command, max_cores_updated_value) + def test_undefined_khiops_proc_number_env_var(self): """Test default value for KHIOPS_PROC_NUMBER env var @@ -2677,8 +2695,8 @@ def test_undefined_khiops_proc_number_env_var(self): pass # Define default `KHIOPS_PROC_NUMBER` and check the `maxcores` attribute # is set accordingly - default_khiops_proc_number = _get_system_cpu_cores() + 1 - self.assertEqual(runner.max_cores, default_khiops_proc_number - 1) + default_khiops_proc_number = _get_system_cpu_cores() + self.assertEqual(runner.max_cores, default_khiops_proc_number) # Check default environment variable value is added self.assertTrue("KHIOPS_PROC_NUMBER" in os.environ) diff --git a/tests/test_khiops_integrations.py b/tests/test_khiops_integrations.py index 20d62094..7b7c30c8 100644 --- a/tests/test_khiops_integrations.py +++ b/tests/test_khiops_integrations.py @@ -35,7 +35,7 @@ def test_runner_has_mpiexec_on_linux(self): # Check package is installed on supported platform: # 
Check /etc/os-release for Linux version
         linux_distribution = None
-        mpich_found = None
+        openmpi_found = None
         with open(
             os.path.join(os.sep, "etc", "os-release"), encoding="ascii"
         ) as os_release_info:
@@ -44,48 +44,48 @@ def test_runner_has_mpiexec_on_linux(self):
                     linux_distribution = entry.split("=")[-1].strip('"\n').lower()
                     break
 
-        # Check if MPICH is installed on the Debian Linux OS
+        # Check if OpenMPI is installed on the Debian Linux OS
         if linux_distribution == "ubuntu":
             with subprocess.Popen(
-                ["dpkg", "-l", "mpich"],
+                ["dpkg", "-l", "openmpi-bin"],
                 stdin=subprocess.DEVNULL,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.DEVNULL,
                 universal_newlines=True,
-            ) as mpich_query:
-                stdout, _ = mpich_query.communicate()
-                if mpich_query.returncode != 0:
-                    mpich_found = False
+            ) as openmpi_query:
+                stdout, _ = openmpi_query.communicate()
+                if openmpi_query.returncode != 0:
+                    openmpi_found = False
                 for line in stdout.splitlines():
-                    if all(field in line for field in ("ii", "mpich")):
-                        # MPICH installed
-                        mpich_found = True
+                    # openmpi installed
+                    if all(field in line for field in ("ii", "openmpi")):
+                        openmpi_found = True
                         break
                 else:
-                    mpich_found = False
+                    openmpi_found = False
 
-        # Check if MPICH is installed on the CentOS / Rocky Linux OS
+        # Check if openmpi is installed on the CentOS / Rocky Linux OS
         elif linux_distribution == "rocky linux":
             with subprocess.Popen(
-                ["yum", "list", "installed", "mpich"],
+                ["yum", "list", "installed", "openmpi"],
                 stdin=subprocess.DEVNULL,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.DEVNULL,
                 universal_newlines=True,
-            ) as mpich_query:
-                stdout, _ = mpich_query.communicate()
-                if mpich_query.returncode != 0:
-                    mpich_found = False
+            ) as openmpi_query:
+                stdout, _ = openmpi_query.communicate()
+                if openmpi_query.returncode != 0:
+                    openmpi_found = False
                 for line in stdout.splitlines():
-                    if line.startswith("mpich"):
-                        # MPICH installed
-                        mpich_found = True
+                    # openmpi installed
+                    if line.startswith("openmpi"):
+                        openmpi_found = True
                         break
                 else:
-                    mpich_found = False
+                    openmpi_found = False
         else:
             self.skipTest("Skipping test: platform not Ubuntu or Rocky Linux")
 
-        if mpich_found:
+        if openmpi_found:
             runner = kh.get_runner()
             if not runner.mpi_command_args:
                 self.fail("MPI support found, but MPI command args not set")

From 7dad777efbb4f54891a00249931fc8ed37550f46 Mon Sep 17 00:00:00 2001
From: Popescu V <136721202+popescu-v@users.noreply.github.com>
Date: Tue, 18 Jun 2024 15:41:55 +0200
Subject: [PATCH 26/37] Oversubscribe in the CI to use 4 cores

Thus, tests become more realistic. 
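
Context for the diffs below: as of PATCH 25 the runner skips mpiexec entirely when max_cores is 1 or 2, so the workflows pin KHIOPS_PROC_NUMBER to 4 and enable OpenMPI oversubscription in case the CI runner has fewer physical cores. A rough sketch of that decision rule, with an illustrative helper that is not the actual KhiopsLocalRunner code and shows only the '-n' part of the Linux branch:

    def build_mpi_command_args(max_cores, mpiexec_path="mpiexec"):
        # Khiops runs sequentially below 3 processes, so mpiexec is skipped
        if max_cores in (1, 2):
            return []
        # On Linux the runner now passes max_cores directly to -n
        return [mpiexec_path, "-n", str(max_cores)]

    assert build_mpi_command_args(2) == []
    assert build_mpi_command_args(4) == ["mpiexec", "-n", "4"]
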
--- .github/workflows/conda.yml | 8 ++++++++ .github/workflows/pip.yml | 7 +++++++ .github/workflows/unit-tests.yml | 10 ++++++++++ 3 files changed, 25 insertions(+) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 67f78266..5a43c5e0 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -111,6 +111,14 @@ jobs: kh-download-datasets \ --version ${{ inputs.khiops-samples-version || env.DEFAULT_SAMPLES_VERSION }} - name: Test Conda Package Installation on Samples + env: + # Force > 2 CPU cores to launch mpiexec + KHIOPS_PROC_NUMBER: 4 + # Oversubscribe for MPI 4.x + rmaps_base_oversubscribe: true + # Oversubscribe for MPI > 4.x + OMPI_MCA_rmaps_base_oversubscribe: true + PRTE_MCA_rmaps_default_mapping_policy: :oversubscribe run: | kh-samples core -i train_predictor -e kh-samples core -i train_predictor_error_handling -e diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index 900a9fb4..c73ee281 100644 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -71,6 +71,13 @@ jobs: - name: Run tests env: KHIOPS_SAMPLES_DIR: ${{ github.workspace }}/khiops-samples + # Force > 2 CPU cores to launch mpiexec + KHIOPS_PROC_NUMBER: 4 + # Oversubscribe for MPI 4.x + rmaps_base_oversubscribe: true + # Oversubscribe for MPI > 4.x + OMPI_MCA_rmaps_base_oversubscribe: true + PRTE_MCA_rmaps_default_mapping_policy: :oversubscribe run: |- # Make sure MPI support is not loaded through env modules # Note: As the Docker container's shell is non-interactive, environment diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 29bad24f..bf7b9f45 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -94,6 +94,13 @@ jobs: # This is needed so that OpenMPI's mpiexec can be run as root OMPI_ALLOW_RUN_AS_ROOT: 1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 + # Force > 2 CPU cores to launch mpiexec + KHIOPS_PROC_NUMBER: 4 + # Oversubscribe for MPI 4.x + rmaps_base_oversubscribe: true + # Oversubscribe for MPI > 4.x + OMPI_MCA_rmaps_base_oversubscribe: true + PRTE_MCA_rmaps_default_mapping_policy: :oversubscribe run: | # This is needed so that the Git tag is parsed and the khiops-python # version is retrieved @@ -169,6 +176,9 @@ jobs: run: | pip install -r test-requirements.txt - name: Launch proper MPI awareness test + env: + # Force > 2 CPU cores to launch mpiexec + KHIOPS_PROC_NUMBER: 4 run: |- # Make sure MPI support is not loaded through env modules # Note: As Docker container's shell is non-interactive, environment From bd92a2a4624ee40e2f6ae693162b9910bd15c5e7 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Fri, 14 Jun 2024 16:50:24 +0200 Subject: [PATCH 27/37] Allow OpenMPI to run as root for all workflows --- .github/workflows/conda.yml | 3 +++ .github/workflows/pip.yml | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 5a43c5e0..0c185f29 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -114,6 +114,9 @@ jobs: env: # Force > 2 CPU cores to launch mpiexec KHIOPS_PROC_NUMBER: 4 + # This is needed so that OpenMPI's mpiexec can be run as root + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 # Oversubscribe for MPI 4.x rmaps_base_oversubscribe: true # Oversubscribe for MPI > 4.x diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index c73ee281..ed05e318 100644 --- a/.github/workflows/pip.yml +++ 
b/.github/workflows/pip.yml @@ -71,6 +71,9 @@ jobs: - name: Run tests env: KHIOPS_SAMPLES_DIR: ${{ github.workspace }}/khiops-samples + # This is needed so that OpenMPI's mpiexec can be run as root + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 # Force > 2 CPU cores to launch mpiexec KHIOPS_PROC_NUMBER: 4 # Oversubscribe for MPI 4.x @@ -82,7 +85,7 @@ jobs: # Make sure MPI support is not loaded through env modules # Note: As the Docker container's shell is non-interactive, environment # modules are currently not initializing the shell anyway - if [[ -n "$MODULESHOME" ]]; then module unload mpi; fi + if [ -n "$MODULESHOME" ]; then module unload mpi; fi # Print khiops installation status kh-status From 5d8d4ca41d377aa08e442db6c7d6fa3fce2839d3 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Fri, 14 Jun 2024 17:48:23 +0200 Subject: [PATCH 28/37] Force `khiops` Conda package installation from the artefact channel in the CI --- .github/workflows/conda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 0c185f29..50f1dbc7 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -97,7 +97,7 @@ jobs: if: runner.os == 'Windows' run: | conda install --channel khiops-dev khiops-core=$KHIOPS_CORE_VERSION - conda install --channel ./khiops-conda/ khiops + conda install --override-channels --channel conda-forge --channel ./khiops-conda/ khiops # In Linux/macOS we need the conda-forge channel to install their pinned versions - name: Install the Khiops Conda package (Linux/macOS) if: runner.os != 'Windows' From 00a25403744c0b5d364889ee70ab81bf319ae704 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Thu, 20 Jun 2024 09:59:08 +0200 Subject: [PATCH 29/37] Drop MacOS 11 support, as GitHub runners are dropping it as well See https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners. --- .github/workflows/conda.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 50f1dbc7..a107619c 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -72,7 +72,6 @@ jobs: - {os: ubuntu-22.04, json-image: '{"image": "rockylinux:9"}'} - {os: windows-2019, json-image: '{"image": null}'} - {os: windows-2022, json-image: '{"image": null}'} - - {os: macos-11, json-image: '{"image": null}'} - {os: macos-12, json-image: '{"image": null}'} - {os: macos-13, json-image: '{"image": null}'} - {os: macos-14, json-image: '{"image": null}'} From 540b857536c0e73bd49e587f81aab84238338977 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:32:10 +0200 Subject: [PATCH 30/37] Remove OpenMPI-specific CI environment variables for Conda tests This is because OpenMPI is not used in Conda environments for Khiops Conda packages. MPICH is used instead. 
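
The variables dropped below only affect OpenMPI; the khiops-core Conda package bundles MPICH, which ignores them. In runner terms this is the same split drawn by _infer_khiops_installation_method (shown in PATCH 24): "conda-based" installs get MPICH, native Linux "binary+pip" installs get OpenMPI. A hedged sketch of guarding such settings in a test script; the helper is private and the guard itself is illustrative, while the variable names mirror the workflow files above:

    import os
    from khiops.core.internals.runner import _infer_khiops_installation_method

    if _infer_khiops_installation_method() == "binary+pip":
        # OpenMPI-only settings; MPICH (conda-based installs) ignores them
        os.environ.setdefault("OMPI_ALLOW_RUN_AS_ROOT", "1")
        os.environ.setdefault("OMPI_ALLOW_RUN_AS_ROOT_CONFIRM", "1")
        os.environ.setdefault("OMPI_MCA_rmaps_base_oversubscribe", "true")
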
---
 .github/workflows/conda.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml
index a107619c..74297025 100644
--- a/.github/workflows/conda.yml
+++ b/.github/workflows/conda.yml
@@ -113,14 +113,6 @@ jobs:
         env:
           # Force > 2 CPU cores to launch mpiexec
           KHIOPS_PROC_NUMBER: 4
-          # This is needed so that OpenMPI's mpiexec can be run as root
-          OMPI_ALLOW_RUN_AS_ROOT: 1
-          OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
-          # Oversubscribe for MPI 4.x
-          rmaps_base_oversubscribe: true
-          # Oversubscribe for MPI > 4.x
-          OMPI_MCA_rmaps_base_oversubscribe: true
-          PRTE_MCA_rmaps_default_mapping_policy: :oversubscribe
         run: |
           kh-samples core -i train_predictor -e
           kh-samples core -i train_predictor_error_handling -e

From 2ed28643511617c33b5cfe529f72052b136bdca0 Mon Sep 17 00:00:00 2001
From: Popescu V <136721202+popescu-v@users.noreply.github.com>
Date: Thu, 20 Jun 2024 19:03:31 +0200
Subject: [PATCH 31/37] Look up the OpenMPI environment module for Rocky local
 installations

---
 khiops/core/internals/runner.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py
index e4417c38..c5f7a794 100644
--- a/khiops/core/internals/runner.py
+++ b/khiops/core/internals/runner.py
@@ -1100,9 +1100,9 @@ def _initialize_mpi_command_args(self):
         mpiexec_path = os.environ.get("KHIOPS_MPIEXEC_PATH") or shutil.which(
             "mpiexec"
         )
-        # If mpiexec is not in the path, then try to load MPI environment module
-        # so that mpiexec is in the path
-        if mpiexec_path is None:
+        # If mpiexec is not in the path, and the installation method is local,
+        # then try to load MPI environment module so that mpiexec is in the path
+        if mpiexec_path is None and installation_method == "binary+pip":
             # If environment modules are installed, then load the MPI module
             module_init_script_path = os.path.join(
                 os.path.sep, "etc", "profile.d", "modules.sh"
@@ -1130,17 +1130,13 @@ def _initialize_mpi_command_args(self):
                 reverse=True,
             ):
                 # If MPI environment module is found, attempt to load it
-                if (
-                    re.search("mpich-[0-9]", line) is not None
-                    and platform.machine() in line
-                    or f"mpich-{platform.machine()}" in line
-                ):
-                    mpich_module = line
+                if f"openmpi-{platform.machine()}" in line:
+                    mpi_module = line
                     # Use 'type -P' to get the path to executable,
                     # as 'which' is non-portable shell
                     shell_command = shlex.split(
                         f"sh -c 'source {module_init_script_path} && "
-                        f"module unload mpi && module load {mpich_module} && "
+                        f"module unload mpi && module load {mpi_module} && "
                         "type -P mpiexec'"
                     )
                     with subprocess.Popen(

From 3094527b394b29049ad066c399e53be85178fc82 Mon Sep 17 00:00:00 2001
From: Popescu V <136721202+popescu-v@users.noreply.github.com>
Date: Fri, 28 Jun 2024 14:33:19 +0200
Subject: [PATCH 32/37] Honor recent MPI command simplifications in upstream
 Khiops binary packages

Thus, functional parity is kept with the `khiops-env` script which is
part of the native Khiops binary packages. 
closes #192 --- khiops/core/internals/runner.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index c5f7a794..f3d341df 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -1155,13 +1155,13 @@ def _initialize_mpi_command_args(self): break if mpiexec_path is not None: self._set_mpi_command_args_with_mpiexec( - mpiexec_path + mpiexec_path, installation_method ) break # If MPI is found, then set the path to mpiexec accordingly if mpiexec_path is not None: - self._set_mpi_command_args_with_mpiexec(mpiexec_path) + self._set_mpi_command_args_with_mpiexec(mpiexec_path, installation_method) # If MPI is still not found, then do not use MPI and warn the user else: self.mpi_command_args = [] @@ -1171,7 +1171,7 @@ def _initialize_mpi_command_args(self): "Go to https://khiops.org for more information." ) - def _set_mpi_command_args_with_mpiexec(self, mpiexec_path): + def _set_mpi_command_args_with_mpiexec(self, mpiexec_path, installation_method): assert mpiexec_path is not None # User-specified MPI command args take precendence over automatic setting if "KHIOPS_MPI_COMMAND_ARGS" in os.environ: @@ -1199,11 +1199,10 @@ def _set_mpi_command_args_with_mpiexec(self, mpiexec_path): "1", ] elif platform.system() == "Linux": + # For Linux native installations we use OpenMPI + if installation_method == "binary+pip": + self.mpi_command_args.append("--quiet") self.mpi_command_args += [ - "-bind-to", - "hwthread", - "-map-by", - "core", "-n", str(self.max_cores), ] From c698cd214920c9d144a78c1fa9554da2b179102b Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Mon, 1 Jul 2024 13:13:28 +0200 Subject: [PATCH 33/37] Backport OpenMPI variables from `khiops-env` to mitigate errors Thus, PR https://github.com/KhiopsML/khiops/pull/313 that addresses issue https://github.com/KhiopsML/khiops/issues/307 is backported to the KhiopsLocalRunner Python code. closes #196 --- khiops/core/internals/runner.py | 84 ++++++++++++++++++++++++++++--- tests/test_khiops_integrations.py | 4 +- 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index f3d341df..5fe03059 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -294,6 +294,62 @@ def _check_executable(bin_path): ) +def get_linux_distribution_name(): + """Detect Linux distribution name + + Parses the `NAME` variable defined in the `/etc/os-release` or + `/usr/lib/os-release` files and converts it to lowercase. 
+
+    Returns
+    -------
+    str
+        Name of the Linux distribution, converted to lowercase
+
+    Raises
+    ------
+    OSError
+        If neither `/etc/os-release` nor `/usr/lib/os-release` is found
+    """
+
+    def get_linux_distribution_from_os_release_file(os_release_file_path):
+        # The `NAME` variable is always defined according to the freedesktop.org
+        # standard:
+        # https://www.freedesktop.org/software/systemd/man/latest/os-release.html
+        with open(os_release_file_path, encoding="ascii") as os_release_info_file:
+            for entry in os_release_info_file:
+                if entry.startswith("NAME"):
+                    linux_distribution = entry.split("=")[-1].strip('"\n')
+                    break
+        return linux_distribution
+
+    assert platform.system() == "Linux"
+
+    # If Python version >= 3.10, use standard library support; see
+    # https://docs.python.org/3/library/platform.html#platform.freedesktop_os_release
+    python_ver_major, python_ver_minor, _ = platform.python_version_tuple()
+    if int(python_ver_major) >= 3 and int(python_ver_minor) >= 10:
+        linux_distribution = platform.freedesktop_os_release()["NAME"]
+
+    # If Python version < 3.10, determine the Linux distribution manually,
+    # but mimic the behavior of Python >= 3.10 standard library support
+    else:
+        # First try to parse /etc/os-release
+        try:
+            linux_distribution = get_linux_distribution_from_os_release_file(
+                os.path.join(os.sep, "etc", "os-release")
+            )
+        except FileNotFoundError:
+            # Fallback on parsing /usr/lib/os-release
+            try:
+                linux_distribution = get_linux_distribution_from_os_release_file(
+                    os.path.join(os.sep, "usr", "lib", "os-release")
+                )
+            # Mimic `platform.freedesktop_os_release` function behavior
+            except FileNotFoundError as error:
+                raise OSError from error
+    return linux_distribution.lower()
+
+
 class KhiopsRunner(ABC):
     """Abstract Khiops Python runner to be re-implemented"""
 
@@ -1058,15 +1114,27 @@ def _start_khiops_environment_initialization(self):
         else:
             self.khiops_temp_dir = ""
 
-        # Set the OpenMPI variable OMPI_MCA_plm_rsh_agent to the empty string if not set
-        # This avoids errors on systems without ssh (eg. simple Docker containers)
         installation_method = _infer_khiops_installation_method()
-        if (
-            platform.system() == "Linux"
-            and installation_method == "binary+pip"
-            and "OMPI_MCA_plm_rsh_agent" not in os.environ
-        ):
-            os.environ["OMPI_MCA_plm_rsh_agent"] = ""
+        if platform.system() == "Linux" and installation_method == "binary+pip":
+            # Set the OpenMPI variable OMPI_MCA_plm_rsh_agent to the empty string
+            # if not set
+            # This avoids errors on systems without ssh (e.g. 
simple Docker containers) + if "OMPI_MCA_plm_rsh_agent" not in os.environ: + os.environ["OMPI_MCA_plm_rsh_agent"] = "" + + # Set the OpenMPI variable OMPI_MCA_btl_vader_single_copy_mechanism + # to the "none" string value to remove the mpi message + # "Read -1, expected 65536, errno = 1" that appears on Docker + if "OMPI_MCA_btl_vader_single_copy_mechanism" not in os.environ: + os.environ["OMPI_MCA_btl_vader_single_copy_mechanism"] = "none" + + # Set the OpenMPI variable PSM3_DEVICES to the "self" string value to + # fix issue https://github.com/KhiopsML/khiops/issues/307 on Rocky + if ( + get_linux_distribution_name() == "rocky linux" + and "PSM3_DEVICES" not in os.environ + ): + os.environ["PSM3_DEVICES"] = "self" # Initialize the default samples dir self._initialize_default_samples_dir() diff --git a/tests/test_khiops_integrations.py b/tests/test_khiops_integrations.py index 7b7c30c8..02a18392 100644 --- a/tests/test_khiops_integrations.py +++ b/tests/test_khiops_integrations.py @@ -15,7 +15,7 @@ import khiops.core as kh from khiops.core.exceptions import KhiopsEnvironmentError -from khiops.core.internals.runner import KhiopsLocalRunner +from khiops.core.internals.runner import KhiopsLocalRunner, get_linux_distribution_name from khiops.extras.docker import KhiopsDockerRunner from khiops.sklearn.estimators import KhiopsClassifier from tests.test_helper import KhiopsTestHelper @@ -34,7 +34,7 @@ def test_runner_has_mpiexec_on_linux(self): """Test that local runner has executable mpiexec on Linux if MPI is installed""" # Check package is installed on supported platform: # Check /etc/os-release for Linux version - linux_distribution = None + linux_distribution = get_linux_distribution_name() openmpi_found = None with open( os.path.join(os.sep, "etc", "os-release"), encoding="ascii" From e5bc7e560b927b710b098487761387d8c56c4104 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 2 Jul 2024 16:32:39 +0200 Subject: [PATCH 34/37] Fix Conda package version extraction from Git tag The '-' pre-release version character is removed from the Conda version. --- packaging/conda/meta.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packaging/conda/meta.yaml b/packaging/conda/meta.yaml index 40050dbb..06de61a3 100644 --- a/packaging/conda/meta.yaml +++ b/packaging/conda/meta.yaml @@ -1,7 +1,8 @@ {% set metadata = load_setup_py_data(setup_file='../../setup.py', from_recipe_dir=True) %} package: name: {{ metadata.get('name') }} - version: {{ metadata.get('version') }} + # The Conda version cannot contain the '-' character, so we eliminate it + version: {{ metadata.get('version') | replace('-', '') }} source: path: ../../ From feea8d7253a24cc01458c5d0c25876835175cc2f Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 3 Jul 2024 14:03:31 +0200 Subject: [PATCH 35/37] Update CHANGELOG for release 10.2.2.0 --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c25ea3e..ef1f42bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,19 @@ - Example: 10.2.1.4 is the 5th version that supports khiops 10.2.1. - Internals: Changes in *Internals* sections are unlikely to be of interest for data scientists. +## 10.2.2.0 - 2024-07-03 + +### Added +- (`sklearn`) Support for sparse arrays in sklearn estimators. + +### Changed +- *Internals*: + - MPI backend from MPICH to OpenMPI for native + Pip-based Linux installations. 
+
+### Fixed
+- `core`
+  - Metric name search in estimator analysis report.
+
 ## 10.2.1.0 - 2024-03-26
 
 ### Added

From 10d9855ce40d457ee11020347d289660da6f2dac Mon Sep 17 00:00:00 2001
From: Popescu V <136721202+popescu-v@users.noreply.github.com>
Date: Wed, 3 Jul 2024 14:11:22 +0200
Subject: [PATCH 36/37] Update upstream Khiops dependency default versions to
 10.2.2 in the CI

---
 .github/workflows/conda.yml      | 4 ++--
 .github/workflows/dev-docker.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml
index 74297025..9f26b6be 100644
--- a/.github/workflows/conda.yml
+++ b/.github/workflows/conda.yml
@@ -4,12 +4,12 @@ env:
   DEFAULT_SAMPLES_VERSION: 10.2.0
   # Note: The default Khiops version must never be an alpha release as they are
   # ephemeral. To test alpha versions run the workflow manually.
-  DEFAULT_KHIOPS_CORE_VERSION: 10.2.2b.3
+  DEFAULT_KHIOPS_CORE_VERSION: 10.2.2
 on:
   workflow_dispatch:
     inputs:
       khiops-core-version:
-        default: 10.2.2b.3
+        default: 10.2.2
         description: khiops-core version for testing
       khiops-samples-version:
         default: 10.2.0
diff --git a/.github/workflows/dev-docker.yml b/.github/workflows/dev-docker.yml
index 8c17d4a3..d6995cb5 100644
--- a/.github/workflows/dev-docker.yml
+++ b/.github/workflows/dev-docker.yml
@@ -1,7 +1,7 @@
 ---
 name: Dev Docker
 env:
-  DEFAULT_KHIOPS_REVISION: 10.2.2-b.3
+  DEFAULT_KHIOPS_REVISION: 10.2.2
   DEFAULT_SERVER_REVISION: main
   DEFAULT_PYTHON_VERSIONS: 3.8 3.9 3.10 3.11 3.12
 on:

From 9e3b4e5397d32a2250636a89a650ae56d13516a3 Mon Sep 17 00:00:00 2001
From: Popescu V <136721202+popescu-v@users.noreply.github.com>
Date: Wed, 3 Jul 2024 14:47:33 +0200
Subject: [PATCH 37/37] Only deploy GitHub pages when explicitly instructed to
 do so

---
 .github/workflows/api-docs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/api-docs.yml b/.github/workflows/api-docs.yml
index 89d17b7a..419791a3 100644
--- a/.github/workflows/api-docs.yml
+++ b/.github/workflows/api-docs.yml
@@ -68,7 +68,7 @@ jobs:
           path: doc/_build/html/
   # Deploy only when the user explicitly (and manually) orders it
   deploy:
-    if: ${{ github.event_name == 'workflow_dispatch' || inputs.deploy-gh-pages == true }}
+    if: github.event_name == 'workflow_dispatch' && inputs.deploy-gh-pages == true
    runs-on: ubuntu-latest
    needs: build
    environment: