diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000000..2750a53f965 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,31 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +html: + @$(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)"/html $(SPHINXOPTS) $(O) + + cp source/_static/redirects/guide-homepage-redirect.html "$(BUILDDIR)"/html/index.html + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +# Custom clean target that also removes autosummary generated files. Can +# be removed when https://github.com/sphinx-doc/sphinx/issues/1999 is fixed. +clean: + rm -rf "$(SOURCEDIR)/guide/reference/_autosummary" + $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000000..ff358cc7182 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,21 @@ +# OTX 2.0 Documentation + +## Introduction + +This is the source code for the OTX documentation. It is built using sphinx-design and myst parser. + +## Installation + +To install the dependencies, run the following command: + +```bash +otx install --option docs +``` + +## Build + +To build the documentation, run the following command: + +```bash +sphinx-build -b html source build +``` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000000..ded01792820 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,44 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help +if "%1" == "html" goto html + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:html +%SPHINXBUILD% -b %1 %SOURCEDIR% %BUILDDIR%\html %SPHINXOPTS% %O% + +copy _static\redirects\guide-homepage-redirect.html %BUILDDIR%\html\index.html + + +:end +popd \ No newline at end of file diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 00000000000..5b2fe2052fa --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,77 @@ +:root { + /***************************************************************************** + * Theme config + **/ + --pst-header-height: 45px; + /* smaller then heading font sizes*/ + --pst-font-size-milli: 12px; + --pst-sidebar-font-size: 0.9em; + --pst-sidebar-caption-font-size: 0.9em; + /***************************************************************************** + * Color + * + * Colors are defined in rgb string way, "red, green, blue" + **/ + --pst-color-success: 40, 167, 69; + --pst-color-info: 0, 123, 255; + /*23, 162, 184;*/ + --pst-color-danger: 220, 53, 69; + --pst-color-headerlink: 198, 15, 15; + --pst-color-headerlink-hover: 255, 255, 255; + --pst-color-preformatted-text: 34, 34, 34; + --pst-color-preformatted-background: 250, 250, 250; + --pst-color-inline-code: 232, 62, 140; + --pst-color-active-navigation: 19, 6, 84; + --pst-color-navbar-link: 77, 77, 77; + --pst-color-navbar-link-hover: var(--pst-color-active-navigation); + --pst-color-navbar-link-active: var(--pst-color-active-navigation); + --pst-color-sidebar-link: 77, 77, 77; + --pst-color-sidebar-link-hover: var(--pst-color-active-navigation); + --pst-color-sidebar-link-active: var(--pst-color-active-navigation); + --pst-color-sidebar-expander-background-hover: 244, 244, 244; + --pst-color-sidebar-caption: 77, 77, 77; + --pst-color-toc-link: 119, 117, 122; + --pst-color-toc-link-hover: var(--pst-color-active-navigation); + --pst-color-toc-link-active: var(--pst-color-active-navigation); + /***************************************************************************** + * Icon + **/ + /* font awesome icons*/ + --pst-icon-check-circle: "\f058"; + --pst-icon-info-circle: "\f05a"; + --pst-icon-exclamation-triangle: "\f071"; + --pst-icon-exclamation-circle: "\f06a"; + --pst-icon-times-circle: "\f057"; + --pst-icon-lightbulb: "\f0eb"; + /***************************************************************************** + * Admonitions + **/ + --pst-color-admonition-default: var(--pst-color-info); + --pst-color-admonition-note: var(--pst-color-info); + --pst-color-admonition-attention: var(--pst-color-warning); + --pst-color-admonition-caution: var(--pst-color-warning); + --pst-color-admonition-warning: var(--pst-color-warning); + --pst-color-admonition-danger: var(--pst-color-danger); + --pst-color-admonition-error: var(--pst-color-danger); + --pst-color-admonition-hint: var(--pst-color-success); + --pst-color-admonition-tip: var(--pst-color-success); + --pst-color-admonition-important: var(--pst-color-success); + --pst-icon-admonition-default: var(--pst-icon-info-circle); + --pst-icon-admonition-note: var(--pst-icon-info-circle); + --pst-icon-admonition-attention: var(--pst-icon-exclamation-circle); + --pst-icon-admonition-caution: var(--pst-icon-exclamation-triangle); + --pst-icon-admonition-warning: var(--pst-icon-exclamation-triangle); + 
--pst-icon-admonition-danger: var(--pst-icon-exclamation-triangle); + --pst-icon-admonition-error: var(--pst-icon-times-circle); + --pst-icon-admonition-hint: var(--pst-icon-lightbulb); + --pst-icon-admonition-tip: var(--pst-icon-lightbulb); + --pst-icon-admonition-important: var(--pst-icon-exclamation-circle); +} + +.navbar { + background: #0095ca !important; +} + +.video { + text-align: center; +} diff --git a/docs/source/_static/logos/github_icon.png b/docs/source/_static/logos/github_icon.png new file mode 100644 index 00000000000..30183508885 Binary files /dev/null and b/docs/source/_static/logos/github_icon.png differ diff --git a/docs/source/_static/logos/otx-logo-black.png b/docs/source/_static/logos/otx-logo-black.png new file mode 100644 index 00000000000..5a78f297a7f Binary files /dev/null and b/docs/source/_static/logos/otx-logo-black.png differ diff --git a/docs/source/_static/logos/otx-logo.png b/docs/source/_static/logos/otx-logo.png new file mode 100644 index 00000000000..fb6f24f7312 Binary files /dev/null and b/docs/source/_static/logos/otx-logo.png differ diff --git a/docs/source/_static/redirects/guide-homepage-redirect.html b/docs/source/_static/redirects/guide-homepage-redirect.html new file mode 100644 index 00000000000..ae5857ac193 --- /dev/null +++ b/docs/source/_static/redirects/guide-homepage-redirect.html @@ -0,0 +1,4 @@ + diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000000..c2d7fe05f22 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,114 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- # + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# + +import os +import sys + +sys.path.insert(0, os.path.abspath("../../src")) + +from otx import __version__ + +# ruff: noqa + +# -- Project information ----------------------------------------------------- # + +project = "OpenVINO™ Training Extensions" +copyright = "2024, OpenVINO™ Training Extensions Contributors" +author = "OpenVINO™ Training Extensions Contributors" +release = __version__ + +# -- General configuration --------------------------------------------------- # + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.napoleon", # Support for NumPy and Google style docstrings + "sphinx.ext.autodoc", + "sphinx_copybutton", + "sphinx.ext.autosummary", # Create neat summary tables + "sphinx.ext.viewcode", # Find the source files + "sphinx.ext.autosectionlabel", # Refer sections its title + "sphinx.ext.intersphinx", # Generate links to the documentation + "sphinx_tabs.tabs", + "sphinx_design", +] + +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +suppress_warnings = [ + "ref.python", + "autosectionlabel.*", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# -- Options for HTML output ------------------------------------------------- # +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_theme = "pydata_sphinx_theme" +html_static_path = ["_static"] + +html_theme_options = { + "navbar_center": [], + "logo": { + "image_light": "logos/otx-logo.png", + "image_dark": "logos/otx-logo.png", + }, + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/openvinotoolkit/training_extensions", + "icon": "_static/logos/github_icon.png", + "type": "local", + }, + ], +} +html_css_files = [ + "css/custom.css", +] + +# -- Extension configuration ------------------------------------------------- +autodoc_docstring_signature = True +autodoc_member_order = "bysource" +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "numpy": ("https://numpy.org/doc/stable/", None), +} +autodoc_member_order = "groupwise" +autodoc_default_options = { + "members": True, + "methods": True, + "special-members": "__call__", + "exclude-members": "_abc_impl", + "show-inheritance": True, +} + +autoclass_content = "both" + +autosummary_generate = True # Turn on sphinx.ext.autosummary +autosummary_ignore_module_all = False # Summary list in __all__ no others +# autosummary_imported_members = True # document classes and functions imported in modules diff --git a/docs/source/guide/explanation/additional_features/adaptive_training.rst b/docs/source/guide/explanation/additional_features/adaptive_training.rst new file mode 100644 index 00000000000..0bc03fe6b56 --- /dev/null +++ b/docs/source/guide/explanation/additional_features/adaptive_training.rst @@ -0,0 +1,26 @@ +Adaptive Training +================== + +Adaptive-training focuses to adjust the number of iterations or interval for the validation to achieve the fast training. +In the small data regime, we don't need to validate the model at every epoch since there are a few iterations at a single epoch. +To handle this, we have implemented module named ``AdaptiveTrainScheduling``. This callback controls the interval of the validation to do faster training. + +.. note:: + ``AdaptiveTrainScheduling`` changes the interval of the validation, evaluation and updating learning rate by checking the number of dataset. + + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.algo.callbacks.adaptive_train_scheduling import AdaptiveTrainScheduling + + engine.train(callbacks=[AdaptiveTrainScheduling()]) + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... --callbacks otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling diff --git a/docs/source/guide/explanation/additional_features/auto_configuration.rst b/docs/source/guide/explanation/additional_features/auto_configuration.rst new file mode 100644 index 00000000000..8f4a125d187 --- /dev/null +++ b/docs/source/guide/explanation/additional_features/auto_configuration.rst @@ -0,0 +1,151 @@ +Auto-configuration +================== + +Auto-configuration for a deep learning framework means the automatic finding of the most appropriate settings for the training parameters, based on the dataset and the specific task at hand. 
+Auto-configuration can help to save time, it eases the process of interaction with OpenVINO™ Training Extensions and gives a better baseline for the given dataset. + +At this end, we developed a simple auto-configuration functionality to ease the process of training and validation utilizing our framework. +Basically, to start the training and obtain a good baseline with the best trade-off between accuracy and speed we need to pass only a dataset in the right format without specifying anything else: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.engine import Engine + + engine = Engine(data_root="") + engine.train() + + .. tab-item:: CLI + + .. code-block:: bash + + (otx) ...$ otx train ... --data_root + + +After dataset preparation, the training will be started with the middle-sized template to achieve competitive accuracy preserving fast inference. + + +Supported dataset formats for each task: + +- classification: `Imagenet `_, `COCO `_ (multi-label), :ref:`custom hierarchical ` +- object detection: `COCO `_, `Pascal-VOC `_, `YOLO `_ +- semantic segmentation: `Common Semantic Segmentation `_, `Pascal-VOC `_, `Cityscapes `_, `ADE20k `_ +- action classification: `CVAT `_ +- action detection: `CVAT `_ +- anomaly classification: `MVTec `_ +- anomaly detection: `MVTec `_ +- anomaly segmentation: `MVTec `_ +- instance segmentation: `COCO `_, `Pascal-VOC `_ + +If we have a dataset format occluded with other tasks, for example ``COCO`` format, we should directly emphasize the task type. If not, OpenVINO™ Training Extensions automatically chooses the task type that you might not intend: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.engine import Engine + + engine = Engine(data_root="", task="") + engine.train() + + .. tab-item:: CLI + + .. code-block:: bash + + (otx) ...$ otx train --data_root + --task {MULTI_CLASS_CLS, MULTI_LABEL_CLS, H_LABEL_CLS, DETECTION, INSTANCE_SEGMENTATION, SEMANTIC_SEGMENTATION, ACTION_CLASSIFICATION, ACTION_DETECTION, ACTION_SEGMENTATION, ANOMALY_CLASSIFICATION, ANOMALY_DETECTION, ANOMALY_SEGMENTATION, VISUAL_PROMPTING} + ... + +Auto-adapt batch size +--------------------- + +This feature adapts a batch size based on the current hardware environment. +There are two methods available for adapting the batch size. + +1. Prevent GPU Out of Memory (`Safe` mode) + +The first method checks if the current batch size is compatible with the available GPU devices. +Larger batch sizes consume more GPU memory for training. Therefore, the system verifies if training is possible with the current batch size. +If it's not feasible, the batch size is decreased to reduce GPU memory usage. +However, setting the batch size too low can slow down training. +To address this, the batch size is reduced to the maximum amount that could be run safely on the current GPU resource. +The learning rate is also adjusted based on the updated batch size accordingly. + +To use this feature, add the following parameter: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + Need to update! + + .. tab-item:: CLI + + .. code-block:: bash + + Need to update! + +2. Find the maximum executable batch size (`Full` mode) + +The second method aims to find a possible large batch size that reduces the overall training time. +Increasing the batch size reduces the effective number of iterations required to sweep the whole dataset, thus speeds up the end-to-end training. 
+However, it does not search for the maximum batch size as it is not efficient and may require significantly more time without providing substantial acceleration compared to a large batch size. +Similar to the previous method, the learning rate is adjusted according to the updated batch size accordingly. + +To use this feature, add the following parameter: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + Need to update! + + .. tab-item:: CLI + + .. code-block:: bash + + Need to update! + + +.. Warning:: + When using a fixed epoch, training with larger batch sizes is generally faster than with smaller batch sizes. + However, if early stop is enabled, training with a lower batch size can finish early. + + +Auto-adapt num_workers +---------------------- + +This feature adapts the ``num_workers`` parameter based on the current hardware environment. +The ``num_workers`` parameter controls the number of subprocesses used for data loading during training. +While increasing ``num_workers`` can reduce data loading time, setting it too high can consume a significant amount of CPU memory. + +To simplify the process of setting ``num_workers`` manually, this feature automatically determines the optimal value based on the current hardware status. + +To use this feature, add the following parameter: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.core.config.data import DataModuleConfig + from otx.core.data.module import OTXDataModule + + data_config = DataModuleConfig(..., auto_num_workers=True) + datamodule = OTXDataModule(..., config=data_config) + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... --data.config.auto_num_workers True diff --git a/docs/source/guide/explanation/additional_features/fast_data_loading.rst b/docs/source/guide/explanation/additional_features/fast_data_loading.rst new file mode 100644 index 00000000000..111412ac969 --- /dev/null +++ b/docs/source/guide/explanation/additional_features/fast_data_loading.rst @@ -0,0 +1,40 @@ +Fast Data Loading +================= + +OpenVINO™ Training Extensions provides several ways to boost model training speed, +one of which is fast data loading. + + +======= +Caching +======= + + +***************** +In-Memory Caching +***************** +OpenVINO™ Training Extensions provides in-memory caching for decoded images in main memory. +If the batch size is large, such as for classification tasks, or if dataset contains +high-resolution images, image decoding can account for a non-negligible overhead +in data pre-processing. +One can enable in-memory caching for maximizing GPU utilization and reducing model +training time in those cases. + + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.core.config.data import DataModuleConfig + from otx.core.data.module import OTXDataModule + + data_config = DataModuleConfig(..., mem_cache_size="8GB") + datamodule = OTXDataModule(..., config=data_config) + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... --data.config.mem_cache_size 8GB diff --git a/docs/source/guide/explanation/additional_features/hpo.rst b/docs/source/guide/explanation/additional_features/hpo.rst new file mode 100644 index 00000000000..effb70ac0c0 --- /dev/null +++ b/docs/source/guide/explanation/additional_features/hpo.rst @@ -0,0 +1,115 @@ +Hyperparameters Optimization +============================ + +Hyper-parameter optimization (HPO) can be a time-consuming process, even with state-of-the-art off-the-shelf libraries. 
OpenVINO™ Training Extensions makes HPO faster and easier by providing an easy-to-use interface and automatic configuration. + +With OpenVINO™ Training Extensions, you can run hyper-parameter optimization by simply adding a time constraint parameter. The auto-config feature automatically sets internal control parameters, guaranteeing that HPO will finish within the given time constraint. + +OpenVINO™ Training Extensions provides both sequential and parallel methods, making it scalable for different training environments. If you have multiple GPUs, you can accelerate HPO by utilizing all available GPU resources. + +Key features of OpenVINO™ Training Extensions include: + +- **Easy usability** : By using time as the control parameter, OpenVINO™ Training Extensions offers a straightforward and intuitive interface for users. + +- **Auto-config** : The automatic configuration feature sets internal control parameters automatically, ensuring that HPO finishes within the given time constraint. + +- **Scalability** : OpenVINO™ Training Extensions offers both sequential and parallel methods, making it scalable for different training environments. If you have multiple GPUs, you can take advantage of all available GPU resources to accelerate HPO. + +You can run HPO by just adding **--enable-hpo** argument as below: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.engine import Engine + + engine = Engine(data_root="") + engine.train(run_hpo=True) + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... --run_hpo True + + +========= +Algorithm +========= + +If you have abundant GPU resources, it's better to run HPO in parallel. +In that case, `ASHA `_ is a good choice. +Currently, OpenVINO™ Training Extensions uses the ASHA algorithm. + +The **Asynchronous Successive Halving Algorithm (ASHA)** is a hyperparameter optimization algorithm that is based on Successive Halving Algorithm (SHA) but is designed to be more efficient in a parallel computing environment. It is used to efficiently search for the best hyperparameters for machine learning models. + +ASHA involves running multiple trials in parallel and evaluating them based on their validation metrics. It starts by running many trials for a short time, with only the best-performing trials advancing to the next round. In each subsequent round, the number of trials is reduced, and the amount of time spent on each trial is increased. This process is repeated until only one trial remains. + +ASHA is designed to be more efficient than SHA in parallel computing environments because it allows for asynchronous training of the trials. This means that each trial can be trained independently of the others, and they do not have to wait for all the other trials to be complete before advancing to the next round. This reduces the amount of time it takes to complete the optimization process. + +ASHA also includes a technique called Hyperband, which is used to determine how much time to allocate to each trial in each round. Hyperband allocates more time to the best-performing trials, with the amount of time allocated decreasing as the performance of the trials decreases. This technique helps to reduce the overall amount of training time required to find the best hyperparameters. + +********************************************* +How to configure hyper-parameter optimization +********************************************* + +You can configure HPO by modifying the ``hpo_config.yaml`` file. 
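+
+For orientation, the sketch below shows the kind of structure such a configuration can hold, written here as a Python literal and assembled only from the attribute descriptions later in this section. The hyperparameter names and values are illustrative placeholders, not the shipped defaults.
+
+.. code-block:: python
+
+    # Illustrative sketch of the schema described below; not the actual default file.
+    # Each hp_space entry maps a hyperparameter name to its search-space definition.
+    hpo_config = {
+        "hp_space": [
+            {"optimizer.lr": {"param_type": "loguniform", "range": [1e-4, 1e-1]}},
+            {"datamodule.batch_size": {"param_type": "qloguniform", "range": [16, 128, 16]}},
+        ],
+        "metric": "accuracy",    # metric to maximize or minimize, depending on "mode"
+        "mode": "max",
+        "maximum_resource": 10,  # upper bound on training epochs per trial
+    }
+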
This file contains everything related to HPO, including the hyperparameters to optimize, the HPO algorithm, and more. The ``hpo_config.yaml`` file already exists with default values in the same directory where ``template.yaml`` resides. Here is the default ``hpo_config.yaml`` file for classification: + +.. code-block:: + + Need to Update! + +As you can see, there are a few attributes required to run HPO. +Fortunately, there are not many attributes, so it's not difficult to write your own ``hpo_config.yaml`` file. The more detailed description is as follows: + +- **hp_space** (*List[Dict[str, Any]]*, `required`) - Hyper parameter search space to find. It should be list of dictionary. Each dictionary has a hyperparameter name as the key and param_type and range as the values. You can optimize any learning parameters of each task. + + - **Keys of each hyper parameter** + + - **param_type** (*str*, `required`) : Hyper parameter search space type. It must be one of the following: + + - uniform : Samples a float value uniformly between the lower and upper bounds. + - quniform : Samples a quantized float value uniformly between the lower and upper bounds. + - loguniform : Samples a float value after scaling search space by logarithm scale. + - qloguniform : Samples a quantized float value after scaling the search space by logarithm scale. + - choice : Samples a categorical value. + + - **range** (*List[Any]*, `required`) + + - uniform : List[Union[float, int]] + + - min (*Union[float, int]*, `required`) : The lower bound of search space. + - max (*Union[float, int]*, `required`) : The upper bound of search space. + + - quniform : List[Union[float, int]] + + - min (*Union[float, int]*, `required`) : The lower bound of search space. + - max (*Union[float, int]*, `required`) : The upper bound of search space. + - step (*Union[float, int]*, `required`) : The unit value of search space. + + - loguniform : List[Union[float, int]) + + - min (*Union[float, int]*, `required`) : The lower bound of search space. + - max (*Union[float, int]*, `required`) : The upper bound of search space. + - log_base (*Union[float, int]*, *default=10*) : The logarithm base. + + - qloguniform : List[Union[float, int]] + + - min (*Union[float, int]*, `required`) : The lower bound of search space + - max (*Union[float, int]*, `required`) : The upper bound of search space + - step (*Union[float, int]*, `required`) : The unit value of search space + - log_base (*Union[float, int]*, *default=10*) : The logarithm base. + + - choice : List[Any] + + - vaule : values to be chosen from candidates. + +- **metric** (*str*, *default='mAP*') - Name of the metric that will be used to evaluate the performance of each trial. The hyperparameter optimization algorithm will aim to maximize or minimize this metric depending on the value of the mode hyperparameter. The default value is 'mAP'. + +- **mode** (*str*, *default='max*') - Optimization mode for the metric. It determines whether the metric should be maximized or minimized. The possible values are 'max' and 'min', respectively. The default value is 'max'. + +- **maximum_resource** (*int*, *default=None*) - Maximum number of training epochs for each trial. When the number of training epochs reaches this value, the training of the trial will stop. The default value is None. + +- **minimum_resource** (*int*, *default=None*) - Minimum number of training epochs for each trial. Each trial will run for at least this many epochs, even if the performance of the model is not improving. 
The default value is None. diff --git a/docs/source/guide/explanation/additional_features/index.rst b/docs/source/guide/explanation/additional_features/index.rst new file mode 100644 index 00000000000..d7fa4855d47 --- /dev/null +++ b/docs/source/guide/explanation/additional_features/index.rst @@ -0,0 +1,15 @@ +Additional Features +=================== + +.. _features_section_ref: + +.. toctree:: + :maxdepth: 1 + + models_optimization + hpo + auto_configuration + adaptive_training + xai + fast_data_loading + tiling diff --git a/docs/source/guide/explanation/additional_features/models_optimization.rst b/docs/source/guide/explanation/additional_features/models_optimization.rst new file mode 100644 index 00000000000..89438badf21 --- /dev/null +++ b/docs/source/guide/explanation/additional_features/models_optimization.rst @@ -0,0 +1,31 @@ +Models Optimization +=================== + +OpenVINO™ Training Extensions provides optimization algorithm: `Post-Training Quantization tool (PTQ) `_. + +******************************* +Post-Training Quantization Tool +******************************* + +PTQ is designed to optimize the inference of models by applying post-training methods that do not require model retraining or fine-tuning. If you want to know more details about how PTQ works and to be more familiar with model optimization methods, please refer to `documentation `_. + +To run Post-training quantization it is required to convert the model to OpenVINO™ intermediate representation (IR) first. To perform fast and accurate quantization we use ``DefaultQuantization Algorithm`` for each task. Please, refer to the `Tune quantization Parameters `_ for further information about configuring the optimization. + +Please, refer to our :doc:`dedicated tutorials <../../tutorials/base/how_to_train/index>` on how to optimize your model using PTQ. + + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.engine import Engine + ... + engine.optimize(checkpoint="") + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx optimize ... --checkpoint diff --git a/docs/source/guide/explanation/additional_features/tiling.rst b/docs/source/guide/explanation/additional_features/tiling.rst new file mode 100644 index 00000000000..fe233057903 --- /dev/null +++ b/docs/source/guide/explanation/additional_features/tiling.rst @@ -0,0 +1,227 @@ +Improve Small Object Detection with Image Tiling +************************************************* + +The OpenVINO Training Extensions introduces the concept of image tiling to enhance the accuracy of detection algorithms and instance segmentation algorithms, particularly for small and densely packed objects in high-resolution images. + +Image tiling involves dividing the original full-resolution image into multiple smaller tiles or patches. This division allows objects within the tiles to appear larger in relation to the tile size, effectively addressing the challenge of objects becoming nearly invisible in deeper layers of feature maps due to downsampling operations. Image tiling proves especially beneficial for datasets where objects can be as small as 20 by 20 pixels in a 4K image. + +However, it's important to consider the trade-off associated with image tiling. Dividing a single image sample into several tiles increases the number of samples for training, evaluation, and testing. This trade-off impacts the execution speed, as processing more images requires additional computational resources. 
To strike a balance between patch size and computational efficiency, the OpenVINO Training incorporates tile dataset samples and adaptive tiling parameter optimization. These features enable the proper tuning of tile size and other tiling-related parameters to ensure efficient execution without compromising accuracy. + +By leveraging image tiling, the OpenVINO Training Extensions empowers detection and instance segmentation algorithms to effectively detect and localize small and crowded objects in large-resolution images, ultimately leading to improved overall performance and accuracy. + +Tiling Strategies +================= +Below we provided an example of tiling used on one of the image from `DOTA `_. + +.. image:: ../../../../utils/images/dota_tiling_example.jpg + :width: 800 + :alt: this image uploaded from this `source `_ + + +In this example, the full image is cropped into 9 tiles. During training, only the tiles with annotations (bounding boxes or masks) are used for training. + +During evaluation in training, only the tiles with annotations are used for evaluation, and evaluation is performed at the tile level. + +During testing, each tile is processed and predicted separately. The tiles are then stitched back together to form the full image, and the tile predictions are merged to form the full image prediction. + +The tiling strategy is implemented in the OpenVINO Training Extensions through the following steps: + +.. note:: + + * Training: Create an ImageTilingDataset with annotated tiles -> Train with annotated tile images -> Evaluate on annotated tiles + * Testing: Create an ImageTilingDataset including all tiles -> Test with all tile images -> Stitching -> Merge tile-level predictions -> Full Image Prediction + +.. note:: + + While running `ote test` on models trained with tiling enabled, the evaluation will be performed on all tiles, this process includes merging all the tile-level prediction. + The below context will be provided during evaluation: + + .. code-block:: shell + + [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 650/650, 17.2 task/s, elapsed: 38s, ETA: 0s + ==== merge: 7.326097726821899 sec ==== + + +Enable Tiling via OTX Training +================================== + +Currently, tiling is supported for both detection and instance segmentation models. Please refer to :doc:`../algorithms/object_detection/object_detection` and :doc:`../algorithms/segmentation/instance_segmentation` for more details. + +To enable tiling in OTX training, set ``data.config.tile_config.enable_tiler`` parameter to 1. Here's an example of enabling tiling: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.core.config.data import DataModuleConfig, TileConfig + from otx.core.data.module import OTXDataModule + + data_config = DataModuleConfig(..., tile_config=TileConfig(enable_tiler=True)) + datamodule = OTXDataModule(..., config=data_config) + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... --data.config.tile_config.enable_tiler True + +.. note:: + + To learn how to deploy the trained model and run the exported demo, refer to :doc:`../../tutorials/base/deploy`. + + To learn how to run the demo in CLI and visualize results, refer to :doc:`../../tutorials/base/demo`. + + +Tile Size and Tile Overlap Optimization +----------------------------------------- +By default, the OpenVINO Training Extensions automatically optimize tile size and tile overlap to ensure efficient execution without compromising accuracy. 
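+
+The adaptive rule spelled out in the next paragraphs amounts to a few lines of arithmetic. The sketch below is an illustrative helper, not the actual OTX implementation, and reproduces the 100x100-pixel example used later in this section; the largest-object value is a made-up number.
+
+.. code-block:: python
+
+    import math
+
+    def adaptive_tile_parameters(avg_object_area, largest_object_edge, object_tile_ratio=0.03):
+        """Sketch of the adaptive rule described below; object "size" is treated as pixel area."""
+        tile_edge = int(math.sqrt(avg_object_area / object_tile_ratio))
+        # Overlap is chosen so the largest object still fits inside a single tile.
+        overlap = largest_object_edge / tile_edge
+        return tile_edge, overlap
+
+    # Average object of 100x100 pixels at the default 3% ratio -> roughly 577x577 tiles.
+    print(adaptive_tile_parameters(avg_object_area=100 * 100, largest_object_edge=200))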
+ +To strike a balance between patch size and computational efficiency, the OpenVINO Training Extensions incorporate adaptive tiling parameter optimization. These features enable the proper tuning of tile size and other tiling-related parameters to ensure efficient execution without compromising accuracy. + +Adaptive tiling parameter optimization works by finding the average object size in the training dataset and using that to determine the tile size. Currently, the average object size to tile size ratio is set to 3%. For example, if the average object size is 100x100 pixels, the tile size will be around 577x577 pixels. + +This computation is performed by dividing the average object size by the desired object size ratio (default: 3%) and then taking the square root. This ensures that the objects are large enough to be detected by the model. The object size to tile size ratio can also be configured with ``tiling_parameters.object_tile_ratio`` parameter. + +Here's an example of setting the object size ratio to 5%: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.core.config.data import DataModuleConfig, TileConfig + from otx.core.data.module import OTXDataModule + + tile_config = TileConfig(enable_tiler=True, enable_adaptive_tiling=True, object_tile_ratio=0.05) + data_config = DataModuleConfig(..., tile_config=tile_config) + datamodule = OTXDataModule(..., config=data_config) + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... --data.config.tile_config.enable_tiler True \ # enable tiling + --data.config.tile_config.enable_adaptive_tiling True \ # enable automatic tiling parameter optimization + --data.config.tile_config.object_tile_ratio 0.05 # set the object size ratio to 5% + + +After determining the tile size, the tile overlap is computed by dividing the largest object size in the training dataset by the adaptive tile size. +This calculation ensures that the largest object on the border of a tile is not split into two tiles and is covered by adjacent tiles. + +You can also manually configure the tile overlap using ``tiling_parameters.tile_overlap parameter`` parameter. For more details, please refer to the section on `Manual Tiling Parameter Configuration`_ . + + +Tiling Sampling Strategy +------------------------ +To accelerate the training process, the OpenVINO Training Extensions introduces a tile sampling strategy. This strategy involves randomly sampling a percentage of tile images from the dataset to be used for training. + +Since training and validation on all tiles from a high-resolution image dataset can be time-consuming, sampling the tile dataset can significantly reduce the training and validation time. + +It's important to note that sampling is applied to the training and validation datasets, not the test dataset. + +This can be configured with ``data.config.tile_config.enable_adaptive_tiling`` parameter. Here's an example: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.core.config.data import DataModuleConfig, TileConfig + from otx.core.data.module import OTXDataModule + + tile_config = TileConfig(enable_tiler=True, enable_adaptive_tiling=True, sampling_ratio=0.5) + data_config = DataModuleConfig(..., tile_config=tile_config) + datamodule = OTXDataModule(..., config=data_config) + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... 
--data.config.tile_config.enable_tiler True + --data.config.tile_config.enable_adaptive_tiling True + --data.config.tile_config.sampling_ratio 0.5 + + +Manual Tiling Parameter Configuration +------------------------------------- + +Users can disable adaptive tiling and customize the tiling process by setting the following parameters: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.core.config.data import DataModuleConfig, TileConfig + from otx.core.data.module import OTXDataModule + + tile_config = TileConfig(enable_tiler=True, enable_adaptive_tiling=False, tile_size=(512,512), tile_overlap=0.2) + data_config = DataModuleConfig(..., tile_config=tile_config) + datamodule = OTXDataModule(..., config=data_config) + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... --data.config.tile_config.enable_tiler True + --data.config.tile_config.enable_adaptive_tiling False + --data.config.tile_config.tile_size '[512,512]' + --data.config.tile_config.tile_overlap 0.2 + +By specifying these parameters, automatic tiling parameter optimization is disabled, and the tile size is configured to 512x512 pixels with a 10% overlap between tiles. + +The following parameters can be configured to customize the tiling process: + +- ``tile_config.enable_tiling``: Enable or disable tiling (0 or 1) +- ``tile_config.enable_adaptive_params``: Enable or disable adaptive tiling parameter optimization (0 or 1) +- ``tile_config.object_tile_ratio``: Ratio of average object size to tile size (float between 0.0 and 1.0) +- ``tile_config.tile_size``: Tile edge length in pixels (integer between 100 and 4096) +- ``tile_config.overlap``: The overlap between adjacent tiles as a percentage (float between 0.0 and 1.0) +- ``tile_config.sampling_ratio``: The percentage of tiles to sample from the dataset (float between 0.0 and 1.0) + + +Run Tiling on OpenVINO Exported Model +====================================== + +After training a model with tiling enabled, you can export the model to OpenVINO IR format using the following command: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + engine.export(checkpoint="") + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx export ... --checkpoint + +After exporting the model, you can run inference on the exported model using the following command: + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + engine.test(checkpoint="") + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx test ... --checkpoint + +.. warning:: + When tiling is enabled, there is a trade-off between speed and accuracy as it increases the number of images to be processed. + As a result, longer training and inference times are expected. If you encounter GPU out of memory errors, + you can mitigate the issue by reducing the number of batches through the command-line interface (CLI) or + by adjusting the batch size value. diff --git a/docs/source/guide/explanation/additional_features/xai.rst b/docs/source/guide/explanation/additional_features/xai.rst new file mode 100644 index 00000000000..ad3f7a00cfe --- /dev/null +++ b/docs/source/guide/explanation/additional_features/xai.rst @@ -0,0 +1,110 @@ +Explainable AI (XAI) +==================== + +**Explainable AI (XAI)** is a field of research that aims to make machine learning models more transparent and interpretable to humans. +The goal is to help users understand how and why AI systems make decisions and provide insight into their inner workings. 
It allows us to detect, analyze, and prevent common mistakes, for example, when the model uses irrelevant features to make a prediction. +XAI can help to build trust in AI, make sure that the model is safe for development and increase its adoption in various domains. + +Most XAI methods generate **saliency maps** as a result. Saliency map is a visual representation, suitable for human comprehension, that highlights the most important parts of the image from the model point of view. +It looks like a heatmap, where warm-colored areas represent the areas with main focus. + + +.. figure:: ../../../../utils/images/xai_example.jpg + :width: 600 + :alt: this image shows the result of XAI algorithm + + These images are taken from `D-RISE paper `_. + + +We can generate saliency maps for a certain model that was trained in OpenVINO™ Training Extensions, using ``otx explain`` command line. Learn more about its usage in :doc:`../../tutorials/base/explain` tutorial. + +********************************* +XAI algorithms for classification +********************************* + +.. image:: ../../../../utils/images/xai_cls.jpg + :width: 600 + :align: center + :alt: this image shows the comparison of XAI classification algorithms + + +For classification networks these algorithms are used to generate saliency maps: + +- **Activation Map​** - this is the most basic and naive approach. It takes the outputs of the model's feature extractor (backbone) and averages it in channel dimension. The results highly rely on the backbone and ignore neck and head computations. Basically, it gives a relatively good and fast result. + +- `Eigen-Cam `_ uses Principal Component Analysis (PCA). It returns the first principal component of the feature extractor output, which most of the time corresponds to the dominant object. The results highly rely on the backbone as well and ignore neck and head computations. + +- `Recipro-CAM​ `_ uses Class Activation Mapping (CAM) to weigh the activation map for each class, so it can generate different saliency per class. Recipro-CAM is a fast gradient-free Reciprocal CAM method. The method involves spatially masking the extracted feature maps to exploit the correlation between activation maps and network predictions for target classes. + + +Below we show the comparison of described algorithms. ``Access to the model internal state`` means the necessity to modify the model's outputs and dump inner features. +``Per-class explanation support`` means generation different saliency maps for different classes. 
+ ++-------------------------------------------+----------------+----------------+-------------------------------------------------------------------------+ +| Classification algorithm | Activation Map | Eigen-Cam | Recipro-CAM | ++===========================================+================+================+=========================================================================+ +| Need access to model internal state | Yes | Yes | Yes | ++-------------------------------------------+----------------+----------------+-------------------------------------------------------------------------+ +| Gradient-free | Yes | Yes | Yes | ++-------------------------------------------+----------------+----------------+-------------------------------------------------------------------------+ +| Single-shot | Yes | Yes | No (re-infer neck + head H*W times, where HxW – feature map size) | ++-------------------------------------------+----------------+----------------+-------------------------------------------------------------------------+ +| Per-class explanation support | No | No | Yes | ++-------------------------------------------+----------------+----------------+-------------------------------------------------------------------------+ +| Execution speed | Fast | Fast | Medium | ++-------------------------------------------+----------------+----------------+-------------------------------------------------------------------------+ + + +**************************** +XAI algorithms for detection +**************************** + +For detection networks these algorithms are used to generate saliency maps: + +- **Activation Map​** - the same approach as for classification networks, which uses the outputs from feature extractor. This is an algorithm is used to generate saliency maps for two-stage detectors. + +- **DetClassProbabilityMap** - this approach takes the raw classification head output and uses class probability maps to calculate regions of interest for each class. So, it creates different salience maps for each class. This algorithm is implemented for single-stage detectors only. + +.. image:: ../../../../utils/images/xai_det.jpg + :width: 600 + :align: center + :alt: this image shows the detailed description of XAI detection algorithm + + +The main limitation of this method is that, due to training loss design of most single-stage detectors, activation values drift towards the center of the object while propagating through the network. +This prevents from getting clear explanation in the input image space using intermediate activations. + +Below we show the comparison of described algorithms. ``Access to the model internal state`` means the necessity to modify the model's outputs and dump inner features. +``Per-class explanation support`` means generation different saliency maps for different classes. ``Per-box explanation support`` means generation standalone saliency maps for each detected prediction. 
+ + ++-------------------------------------------+----------------------------+--------------------------------------------+ +| Detection algorithm | Activation Map | DetClassProbabilityMap | ++===========================================+============================+============================================+ +| Need access to model internal state | Yes | Yes | ++-------------------------------------------+----------------------------+--------------------------------------------+ +| Gradient-free | Yes | Yes | ++-------------------------------------------+----------------------------+--------------------------------------------+ +| Single-shot | Yes | Yes | ++-------------------------------------------+----------------------------+--------------------------------------------+ +| Per-class explanation support | No | Yes | ++-------------------------------------------+----------------------------+--------------------------------------------+ +| Per-box explanation support | No | No | ++-------------------------------------------+----------------------------+--------------------------------------------+ +| Execution speed | Fast | Fast | ++-------------------------------------------+----------------------------+--------------------------------------------+ + + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + engine.explain(checkpoint="") + + .. tab-item:: CLI + + .. code-block:: bash + + (otx) ...$ otx explain ... --checkpoint diff --git a/docs/source/guide/explanation/algorithms/action/action_classification.rst b/docs/source/guide/explanation/algorithms/action/action_classification.rst new file mode 100644 index 00000000000..f6626de07a0 --- /dev/null +++ b/docs/source/guide/explanation/algorithms/action/action_classification.rst @@ -0,0 +1,54 @@ +Action Classification +===================== + +Action classification is a problem of identifying the action that is being performed in a video. The input to the algorithm is a sequence of video frames, and the output is a label indicating the action that is being performed. + +For supervised learning we use the following algorithms components: + +- ``Augmentations``: We use standard data augmentations for videos, including random resizing and random cropping, horizontal flipping. We randomly sample a segment of frames from each video during training. + +- ``Optimizer``: We use the Adam with weight decay fix (AdamW) optimizer. + +- ``Learning rate schedule``: We use a step learning rate schedule, where the learning rate is reduced by a factor of 10 after a fixed number of epochs. We also use the Linear Warmup technique to gradually increase the learning rate at the beginning of training. + +- ``Loss function``: We use the Cross-Entropy Loss as the loss function. + +************** +Dataset Format +************** + +We support the popular action classification formats, such as `Jester `_, `HMDB51 `_, `UCF101 `_. Specifically, these formats will be converted into our `internal representation `_ using the `Datumaro `_ dataset handler. + +The names of the annotations files and the overall dataset structure should be the same as the original dataset. + +Refer to our tutorial for more information on how to train, validate, and optimize action classification models. + +****** +Models +****** + +Currently OpenVINO™ Training Extensions supports `X3D `_ and `MoViNet `_ for action classification. 
+ ++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+---------------------+-------------------------+ +| Template ID | Name | Complexity (GFLOPs) | Model size (MB) | ++========================================================================================================================================================================================+=========+=====================+=========================+ +| `Custom_Action_Classification_X3D `_ | X3D | 2.49 | 3.79 | ++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+---------------------+-------------------------+ +| `Custom_Action_Classificaiton_MoViNet `_ | MoViNet | 2.71 | 3.10 | ++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+---------------------+-------------------------+ + +To see which models are available for the task, the following command can be executed: + +.. code-block:: shell + + (otx) ...$ otx find --task ACTION_CLASSIFICATION + +In the table below the **top-1 accuracy** on some academic datasets are presented. Each model is trained with single NVIDIA GeForce RTX 3090. + ++-----------------------+------------+-----------------+ +| Model name | HMDB51 | UCF101 | ++=======================+============+=================+ +| X3D | 67.19 | 87.89 | ++-----------------------+------------+-----------------+ +| MoViNet | 62.74 | 81.32 | ++-----------------------+------------+-----------------+ diff --git a/docs/source/guide/explanation/algorithms/action/action_detection.rst b/docs/source/guide/explanation/algorithms/action/action_detection.rst new file mode 100644 index 00000000000..726acbd5e5a --- /dev/null +++ b/docs/source/guide/explanation/algorithms/action/action_detection.rst @@ -0,0 +1,48 @@ +Action Detection +================ + +Sptio-Temporal action detection is the problem of localizing the actor(spatial detection) and action(temporal detection). We solve this problem by combining 3D action classification backbone and 2D object detection model. We can combine these two models in several ways. Currently, we support the simplest way. The other ways will be supported in near future. + +X3D + Fast-RCNN architecture comes from `X3D paper `_. This model requires pre-computed actor proposals. Actor pre-proposals can be obtained from `COCO `_ pre-trained 2D object detector (e.g. `Faster-RCNN `_, `ATSS `_). If the custom dataset requires finetuning of 2d object detector, please refer :doc:`otx.algorithms.detection <../object_detection/object_detection>`. Region-of-interest (RoI) features are extracted at the last feature map of X3D by extending a 2D proposal at a keyframe into a 3D RoI by replicating it along the temporal axis. The RoI features fed into the roi head of Fast-RCNN. + +For better transfer learning we use the following algorithm components: + +- ``Augmentations``: We use only random crop and random flip for the training pipeline + +- ``Optimizer``: We use `SGD `_ optimizer with the weight decay set to **1e-4** and momentum set to **0.9**. + +- ``Loss functions``: For the multi-label case binary cross entropy loss is used. 
In the other case, `Cross Entropy Loss `_ is used for the categories classification. + +************** +Dataset Format +************** + +For the dataset handling inside OpenVINO™ Training Extensions, we use `Dataset Management Framework (Datumaro) `_. Since current Datumaro does not support `AVA dataset `_ format, therefore conversion to `CVAT dataset format `_ is needed. Currently, we offer conversion code from the AVA dataset format to the CVAT dataset format. Please refer +`this script `_ + + +****** +Models +****** + +We support the following ready-to-use model templates for transfer learning: + ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------+-------------------------+ +| Template ID | Name | Complexity (GFLOPs) | Model size (MB) | ++=========================================================================================================================================================================================+===============+=====================+=========================+ +| `Custom_Action_Detection_X3D_FAST_RCNN `_ | x3d_fast_rcnn | 13.04 | 8.32 | ++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------+-------------------------+ + +To see which models are available for the task, the following command can be executed: + +.. code-block:: shell + + (otx) ...$ otx find --task ACTION_DETECTION + +In the table below the **mAP** on some academic datasets are presented. Each model is trained using `Kinetics-400 `_ pre-trained weight with single Nvidia GeForce RTX3090. + ++----------------+-------+-----------+ +| Model name | JHMDB | UCF101-24 | ++================+=======+===========+ +| x3d_fast_rcnn | 92.14 | 80.7 | ++----------------+-------+-----------+ diff --git a/docs/source/guide/explanation/algorithms/action/index.rst b/docs/source/guide/explanation/algorithms/action/index.rst new file mode 100644 index 00000000000..c2965183809 --- /dev/null +++ b/docs/source/guide/explanation/algorithms/action/index.rst @@ -0,0 +1,9 @@ +Action Recognition +================== + +.. toctree:: + :maxdepth: 1 + + + action_classification + action_detection diff --git a/docs/source/guide/explanation/algorithms/anomaly/index.rst b/docs/source/guide/explanation/algorithms/anomaly/index.rst new file mode 100644 index 00000000000..74aa80b6909 --- /dev/null +++ b/docs/source/guide/explanation/algorithms/anomaly/index.rst @@ -0,0 +1,166 @@ +Anomaly Detection +================= + +The datasets from real-world applications such as industrial, medical, and security are extremely unbalanced, with an abundance of normal images and a severe lack of abnormal samples. A second issue is that the definition and type of abnormality are constantly evolving, making it difficult to train a supervised model once and use it permanently. An alternative approach is to train a model using only normal samples to learn normality. During the validation or testing phases, a deviation from this would indicate an anomaly. The process of identifying such deviations or anomalies in data is known as anomaly detection. + +This section examines the solutions for anomaly detection offered by the OpenVINO Training Extensions library. 
+ + +Task Types +********** +OpenVINO Training Extensions supports the following anomaly task types: + +* Anomaly Classification - (Image-level detection) +* Anomaly Detection - (Box-level localization) +* Anomaly Segmentation - (Pixel-level localization) + +.. note:: + All anomaly task types are only trained on normal images. The different task types use the same models for generating the raw predictions, but differ in the post-processing that is applied to the raw predictions to localize the anomalous regions. Anomaly detection generates bounding-box predictions, while anomaly segmentation generates pixel-mask predictions. Anomaly classification does not perform localization of the anomalous regions within the image, but just outputs a normal vs. anomalous label for the entire image. + + +Anomaly Classification +---------------------- +Anomaly classification is the task of predicting normal and abnormal images at the image level. As noted above, a model is trained on only normal images. During the testing phase, the model predicts an anomaly score indicating the likelihood of an image being abnormal. The threshold for anomaly classification is either set by the user or adaptively tuned by the algo backend. An image is classified as abnormal if the anomaly score is above the threshold. + +Anomaly Detection +----------------- +Anomaly detection is the task of predicting normal and abnormal images at the box level. Similar to anomaly classification, a model is trained on normal images only. During the testing phase, the model outputs an anomaly heatmap showing the likelihood of each pixel being abnormal. After post-processing the heatmap, the model predicts a bounding box around the anomaly. + +Anomaly Segmentation +-------------------- +The anomaly segmentation task locates anomalies at the pixel level. Similar to anomaly classification and detection, a model is trained on only normal images. During the validation and testing phase, the model outputs an anomaly heatmap showing the likelihood of each pixel being abnormal. After post-processing the heatmap, the model predicts a mask around the anomaly. + + +.. _fig-anomaly-tasks: + +.. figure:: ../../../../../utils/images/anomaly_tasks.png + :width: 600 + :align: center + :alt: Anomaly Task Types + + Anomaly task types (a) A normal image used during training. (b) An image-level prediction of an anomaly by anomaly classification task (c) A box-level prediction of an anomaly by an anomaly detection task (d) A pixel-level prediction of an anomaly by anomaly segmentation task + +Dataset Format +************** +At the moment, the anomaly tasks support the MVTec AD dataset format, which is one of the most popular formats for detecting anomalies. + +.. code-block:: + + MVTec/ + ├── ground_truth + │ ├── + │ │ ├── 000_mask.png + │ │ └── ... + │ ├── ... + │ └── + │ ├── 000_mask.png + │ └── ... + ├── test + │ ├── + │ │ ├── 000.png + │ │ └── ... + │ ├── ... + │ ├── contamination + │ │ ├── 000.png + │ │ └── ... + │ └── good + │ ├── 000.png + │ └── ... + └── train + └── good + ├── 000.png + └── ... + +Future releases of OpenVINO Training Extensions will support other benchmark datasets, such as Amazon's `Visual Anomaly (VisA) `_ dataset. Meanwhile, you can use the `MVTec AD dataset `_ to train and test anomaly detection models, or use MVTec dataset structure to train and test anomaly detection models on your own dataset. 
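+
+As a quick illustration of the layout above, a small helper like the following (the dataset path and function name are hypothetical; only the folder names come from the tree shown) can verify that a custom dataset mirrors the MVTec structure before training:
+
+.. code-block:: python
+
+    from pathlib import Path
+
+    def check_mvtec_layout(root: str) -> None:
+        """Sanity-check the MVTec-style folder structure described above."""
+        base = Path(root)
+        # Training data must contain only normal ("good") images.
+        assert (base / "train" / "good").is_dir(), "missing train/good folder"
+        # Test data mixes "good" images with one folder per defect type.
+        assert (base / "test" / "good").is_dir(), "missing test/good folder"
+        for defect_dir in (base / "test").iterdir():
+            if not defect_dir.is_dir() or defect_dir.name == "good":
+                continue
+            # Each defect type needs matching pixel masks under ground_truth/.
+            masks = base / "ground_truth" / defect_dir.name
+            assert masks.is_dir(), f"missing masks for defect type '{defect_dir.name}'"
+
+    # check_mvtec_layout("path/to/MVTec/<category>")  # hypothetical path
+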
+ +Models +****** +As mentioned above, the goal of visual anomaly detection is to learn a representation of normal behaviour in the data and then identify instances that deviate from this normal behaviour. OpenVINO Training Extensions supports several deep learning approaches to this task, including the following: + ++-------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+---------------------+-----------------+ +| Name | Classification | Detection | Segmentation | Complexity (GFLOPs) | Model size (MB) | ++=======+==============================================================================================================================================+==================================================================================================================================================+============================================================================================================================================+=====================+=================+ +| PADIM | `padim `_ | `padim `_ | `padim `_ | 3.9 | 168.4 | ++-------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+---------------------+-----------------+ +| STFPM | `stfpm `_ | `stfpm `_ | `stfpm `_ | 5.6 | 21.1 | ++-------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+---------------------+-----------------+ + + +Clustering-based Models +----------------------- +These models initially extract features from a CNN or transformer and subsequently use clustering algorithms to learn normality. The anomaly score is then calculated as the distance between the input image and the cluster center. OpenVINO Training Extensions currently supports `PADIM `_. + +PADIM +^^^^^ + +.. figure:: ../../../../../utils/images/padim.png + :width: 600 + :align: center + :alt: Anomaly Task Types + +Padim is a clustering-based anomaly detection approach. The model uses a patch-based mechanism that extracts patches from the input image and then uses a CNN to extract features from the patches. + +To eliminate the redundant information from the extracted features, the model randomly selects a subset of the features to reduce the dimensionality of the features. A multi-variate Gaussian distribution is fitted for each patch embedding. This means each patch of the set of training images has a corresponding multivariate Gaussian distribution. 
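+
+The actual implementation comes from Anomalib, but the following numpy sketch (array shapes and the regularization term are assumptions made for illustration) shows the idea of fitting one Gaussian per patch position and the Mahalanobis scoring described in the next paragraph:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def fit_patch_gaussians(embeddings: np.ndarray, eps: float = 0.01):
+        """embeddings: (n_images, n_patches, dim) features from the pre-trained CNN."""
+        n_images, n_patches, dim = embeddings.shape
+        means = embeddings.mean(axis=0)                       # (n_patches, dim)
+        inv_covs = np.empty((n_patches, dim, dim))
+        for p in range(n_patches):
+            cov = np.cov(embeddings[:, p, :], rowvar=False) + eps * np.eye(dim)
+            inv_covs[p] = np.linalg.inv(cov)                  # one Gaussian per patch
+        return means, inv_covs
+
+    def anomaly_scores(test_embedding: np.ndarray, means, inv_covs):
+        """Mahalanobis distance per patch position for a single test image."""
+        diffs = test_embedding - means                        # (n_patches, dim)
+        sq = np.einsum("pd,pde,pe->p", diffs, inv_covs, diffs)
+        return np.sqrt(sq)                                    # higher = more anomalous
+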
+
+To predict the anomaly score, Mahalanobis distance is calculated to score each patch position of the test image. The matrices of Mahalanobis distances constitute the anomaly map, with higher scores indicating anomalous regions.
+
+.. note::
+
+   Since the PADIM model uses a pre-trained backbone to extract features, the training does not involve fine-tuning of neural network weights. This means that PADIM does not make use of an optimizer and a loss function.
+
+Knowledge Distillation-based Models
+-----------------------------------
+Knowledge distillation is a deep learning technique in which a smaller model (student) is trained to imitate the behavior of a larger and more complex model (teacher). This technique is predicated on the notion that the knowledge contained in a large and complex model can be transferred to a smaller and simpler model, resulting in a model with comparable performance that is both more efficient and faster. OpenVINO Training Extensions currently supports `STFPM: Student-Teacher Feature Pyramid Matching for Unsupervised Anomaly Detection `_.
+
+STFPM
+^^^^^
+
+.. figure:: ../../../../../utils/images/stfpm.png
+   :width: 600
+   :align: center
+   :alt: Anomaly Task Types
+
+The STFPM algorithm is composed of a pre-trained teacher network and a student network with the same architecture. The student network learns the distribution of anomaly-free images by matching its features to the corresponding features in the teacher network.
+
+Multiple-scale feature matching is used so that the student network receives a mixture of multi-level knowledge from the feature pyramid during training, which enables the detection of anomalies of various sizes. To compute the anomaly scores during inference, the student network's feature pyramid is compared to the teacher network's feature pyramid.
+
+The anomaly score is computed as the sum of the L2 distances between the student and teacher feature pyramids. This distance is then used to compute the anomaly map and the image-level anomaly score.
+
+Training Parameters
+~~~~~~~~~~~~~~~~~~~~
+
+Since STFPM trains the student network, we use the following parameters for its training:
+
+- ``Backbone``: The default backbone is ``ResNet18``. You can also use ``Wide ResNet50``.
+- ``Loss``: Loss is computed as the mean squared error between the student and teacher feature pyramids. The default loss is ``MSE`` and cannot be changed.
+- ``Optimizer``: The default optimizer is ``SGD`` and cannot be changed. It uses the following parameters that can be changed:
+   - ``Learning Rate``: The default learning rate is ``0.4``.
+   - ``Momentum``: The default momentum is ``0.9``.
+   - ``Weight Decay``: The default weight decay is ``0.0001``.
+
+- ``Additional Techniques``:
+   - ``Early Stopping``: Early stopping is used to stop the training process when the validation loss stops improving. The default value of the early stopping patience is ``10``.
+
+For more information on STFPM's training, we invite you to read Anomalib's `STFPM documentation `_.
+
+Reconstruction-based Models
+---------------------------
+These models initially extract features from a CNN or transformer and subsequently reconstruct the input image. The anomaly score is then calculated as the distance between the input image and the reconstructed image. OpenVINO Training Extensions currently supports `DRÆM – A discriminatively trained reconstruction embedding for surface anomaly detection `_.
+
+DRÆM
+^^^^
+
+..
figure:: ../../../../../utils/images/draem.png + :width: 600 + :align: center + :alt: Anomaly Task Types + +A reconstruction-based algorithm, DRAEM consists of a reconstructive subnetwork and a discriminative subnetwork. DRAEM is trained on simulated anomaly images, which are produced by combining normal input images from the training set with a random Perlin noise mask extracted from an unrelated source of image data. The reconstructive subnetwork is an autoencoder trained to reconstruct the original input images from the augmented images. Combining L2 loss and structural similarity loss, the reconstructive submodel is trained. The input of the discriminative subnetwork is the channel-by-channel concatenation of the (augmented) input image and the output of the reconstructive subnetwork. The output of the discriminative subnetwork is an anomaly map containing the predicted anomaly scores for each pixel. + +Training Parameters +~~~~~~~~~~~~~~~~~~~~ + +- ``Optimizer``: Both the reconstructive subnetwork and the discriminative subnetwork are trained using the Adam optimizer. +- ``Loss``: The reconstructive subnetwork is trained using reconstruction loss which consists of a combination of L2 loss and Structural Similarity (SSIM) loss between the reconstructions and the original images. The discriminative subnetwork is trained using focal loss, computed between the pixel-level predictions and the ground truth masks of the augmented images. +- ``Additional Training Techniques``: + - ``Early Stopping``: Early stopping is used to prevent overfitting. The early stopping patience can be configured by the user. By default, early stopping is enabled with a patience of 20 epochs. diff --git a/docs/source/guide/explanation/algorithms/classification/hierarhical_classification.rst b/docs/source/guide/explanation/algorithms/classification/hierarhical_classification.rst new file mode 100644 index 00000000000..f6c50a2297a --- /dev/null +++ b/docs/source/guide/explanation/algorithms/classification/hierarhical_classification.rst @@ -0,0 +1,64 @@ +Hierarchical Classification +=========================== + +Hierarchical classification is an extension of the classification task where, besides the set of categories to predict, we have a hierarchical relationship between them. +The goal of this task is to predict the hierarchical tree of the categories for the given image. + +We solve this task by assigning a separate head for each label group on each hierarchical level. +Specifically, we have a classifier that solves the multi-class classification problem and assigns one label from the given exclusive label group. + +To create a non-exclusive label group, we can construct single-label exclusive groups for every label and each of them will be handled by an individual binary classifier. + +In this fashion, we train different classifiers: one for each label group. We use the same training strategy as for :doc:`multi_class_classification` task. + + +Assume, we have a label tree as below: + +.. _hierarchical_image_example: + + +.. image:: ../../../../../utils/images/label_tree.png + :width: 600 + :alt: image uploaded from this `source `_ + + +The goal of our algorithm is to return the right branch of this tree. For example: ``Persian -> Cats -> Pets`` + +At the inference stage, we traverse the tree from head to leaves and obtain labels predicted by the corresponding classifier. + +Let's say, we forward an image with the label tree pictured above. On the first level, our corresponding classifier returns 3 predictions. 
+ +We perform an **argmax** operation and obtain, for example, class ``Cats``. Then, we choose a classifier related to ``{Siamse, Persian, Sphynx}`` label group, +obtain its predictions, and after performing the **argmax** operation we choose our last leaf label. + +After that, we can easily reconstruct the final predicted tree branch: ``Persian -> Cats -> Pets``. + +************** +Dataset Format +************** +.. _hierarchical_dataset: + +For hierarchical image classification, we created our custom dataset format that is supported by `Datumaro `_. +An example of the annotations format and dataset structure can be found in our `sample `_. + +.. note:: + + Please, refer to our :doc:`dedicated tutorial <../../../tutorials/base/how_to_train/classification>` for more information how to train, validate and optimize classification models. + +****** +Models +****** + +We use the same model templates as for Multi-class Classification. Please, refer: :ref:`Classification Models `. + +To see which models are available for the task, the following command can be executed: + +.. code-block:: shell + + (otx) ...$ otx find --task H_LABEL_CLS + +.. ******************** +.. Incremental Learning +.. ******************** + +.. To be added soon diff --git a/docs/source/guide/explanation/algorithms/classification/index.rst b/docs/source/guide/explanation/algorithms/classification/index.rst new file mode 100644 index 00000000000..70247656e66 --- /dev/null +++ b/docs/source/guide/explanation/algorithms/classification/index.rst @@ -0,0 +1,10 @@ +Classification +============== + +.. toctree:: + :maxdepth: 1 + + + multi_class_classification + multi_label_classification + hierarhical_classification diff --git a/docs/source/guide/explanation/algorithms/classification/multi_class_classification.rst b/docs/source/guide/explanation/algorithms/classification/multi_class_classification.rst new file mode 100644 index 00000000000..93ec1fbce23 --- /dev/null +++ b/docs/source/guide/explanation/algorithms/classification/multi_class_classification.rst @@ -0,0 +1,94 @@ +Multi-class Classification +========================== + +Multi-class classification is the problem of classifying instances into one of two or more classes. We solve this problem in a common fashion, based on the feature extractor backbone and classifier head that predicts the distribution probability of the categories from the given corpus. +For the supervised training we use the following algorithms components: + +.. _mcl_cls_supervised_pipeline: + +- ``Augmentations``: Besides basic augmentations like random flip and random rotate, we use `Augmix `_. This advanced type of augmentations helps to significantly expand the training distribution. + +- ``Optimizer``: `Sharpness Aware Minimization (SAM) `_. Wrapper upon the `SGD `_ optimizer that helps to achieve better generalization minimizing simultaneously loss value and loss sharpness. + +- ``Learning rate schedule``: `Cosine Annealing `_. It is a common learning rate scheduler that tends to work well on average for this task on a variety of different datasets. + +- ``Loss function``: We use standard `Cross Entropy Loss `_ to train a model. However, for the class-incremental scenario we use `Influence-Balanced Loss `_. IB loss is a solution for the class imbalance, which avoids overfitting to the majority classes re-weighting the influential samples. + +- ``Additional training techniques`` + - `No Bias Decay (NBD) `_: To add adaptability to the training pipeline and prevent overfitting. 
+ - ``Early stopping``: To add adaptability to the training pipeline and prevent overfitting. + - `Balanced Sampler `_: To create an efficient batch that consists of balanced samples over classes, reducing the iteration size as well. + +************** +Dataset Format +************** + +We support a commonly used format for multi-class image classification task: `ImageNet `_ class folder format. +This format has the following structure: + +:: + + data + ├── train + ├── class 0 + ├── 0.png + ├── 1.png + ... + └── N.png + ├── class 1 + ├── 0.png + ├── 1.png + ... + └── N.png + ... + └── class N + ├── 0.png + ├── 1.png + ... + └── N.png + └── val + ... + +.. note:: + + Please, refer to our :doc:`dedicated tutorial <../../../tutorials/base/how_to_train/classification>` for more information how to train, validate and optimize classification models. + +****** +Models +****** +.. _classification_models: + +We support the following ready-to-use model templates: + ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+---------------------+-----------------+ +| Template ID | Name | Complexity (GFLOPs) | Model size (MB) | ++==================================================================================================================================================================================================================+=======================+=====================+=================+ +| `Custom_Image_Classification_MobileNet-V3-large-1x `_ | MobileNet-V3-large-1x | 0.44 | 4.29 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+---------------------+-----------------+ +| `Custom_Image_Classification_EfficinetNet-B0 `_ | EfficientNet-B0 | 0.81 | 4.09 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+---------------------+-----------------+ +| `Custom_Image_Classification_EfficientNet-V2-S `_ | EfficientNet-V2-S | 5.76 | 20.23 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+---------------------+-----------------+ + +`EfficientNet-V2-S `_ has more parameters and Flops and needs more time to train, meanwhile providing superior classification performance. `MobileNet-V3-large-1x `_ is the best choice when training time and computational cost are in priority, nevertheless, this template provides competitive accuracy as well. +`EfficientNet-B0 `_ consumes more Flops compared to MobileNet, providing better performance on large datasets, but may be not so stable in case of a small amount of training data. + +To see which models are available for the task, the following command can be executed: + +.. code-block:: shell + + (otx) ...$ otx find --task MULTI_CLASS_CLS + +In the table below the top-1 accuracy on some academic datasets using our :ref:`supervised pipeline ` is presented. The results were obtained on our templates without any changes. 
We use 224x224 image resolution, for other hyperparameters, please, refer to the related template. We trained each model with single Nvidia GeForce RTX3090. + ++-----------------------+-----------------+-----------+-----------+-----------+ +| Model name | CIFAR10 |CIFAR100 |flowers* | cars* | ++=======================+=================+===========+===========+===========+ +| MobileNet-V3-large-1x | 93.36 | 83.01 | 96.45 | 83.24 | ++-----------------------+-----------------+-----------+-----------+-----------+ +| EfficientNet-B0 | 94.86 | 84.73 | 96.86 | 85.70 | ++-----------------------+-----------------+-----------+-----------+-----------+ +| EfficientNet-V2-S | 96.13 | 90.36 | 97.68 | 86.74 | ++-----------------------+-----------------+-----------+-----------+-----------+ + +\* These datasets were splitted with auto-split (80% train, 20% test). diff --git a/docs/source/guide/explanation/algorithms/classification/multi_label_classification.rst b/docs/source/guide/explanation/algorithms/classification/multi_label_classification.rst new file mode 100644 index 00000000000..47f651492ad --- /dev/null +++ b/docs/source/guide/explanation/algorithms/classification/multi_label_classification.rst @@ -0,0 +1,58 @@ +Multi-label Classification +========================== + +Multi-label classification is a generalization of multiclass classification. The main goal of the task is to predict a set of labels per image. Each image may belong to more than one class and may belong to none of them at all. + +We solve this problem by optimizing small binary classification sub-tasks aimed to predict whether or not the specific category from the corpus is presented on the given image. + +.. _ml_cls_supervised_pipeline: + +For supervised learning we use the following algorithms components: + +- ``Augmentations``: Besides basic augmentations like random flip and random rotate, we use `Augmix `_. This advanced type of augmentation helps to significantly expand the training distribution. + +- ``Optimizer``: `Sharpness Aware Minimization (SAM) `_. Wrapper upon the `SGD `_ optimizer that helps to achieve better generalization minimizing simultaneously loss value and loss sharpness. + +- ``Learning rate schedule``: `One Cycle Learning Rate policy `_. It is the combination of gradually increasing the learning rate and gradually decreasing the momentum during the first half of the cycle, then gradually decreasing the learning rate and increasing the momentum during the latter half of the cycle. + +- ``Loss function``: We use **Asymmetric Angular Margin Loss**. We can formulate this loss as follows: :math:`L_j (cos\Theta_j,y) = \frac{k}{s}y p_-^{\gamma^-}\log{p_+} + \frac{1-k}{s}(1-y)p_+^{\gamma^+}\log{p_-}`, where :math:`s` is a scale parameter, :math:`m` is an angular margin, :math:`k` is negative-positive weighting coefficient, :math:`\gamma^+` and :math:`\gamma^-` are weighting parameters. For further information about loss function, ablation studies, and experiments, please refer to our dedicated `paper `_. + +- Additionally, we use the `No Bias Decay (NBD) `_ technique, **Exponential Moving Average (EMA)** for the model's weights and adaptive **early stopping** to add adaptability and prevent overfitting. + +************** +Dataset Format +************** + +As it is a common practice to use object detection datasets in the academic area, we support the most popular object detection format: `COCO `_. +Specifically, this format should be converted in our `internal representation `_. + +.. 
note:: + Names of the annotations files and overall dataset structure should be the same as the original `COCO `_. You need to convert train and validation sets separately. + + Please, refer to our :doc:`dedicated tutorial <../../../tutorials/base/how_to_train/classification>` for more information how to train, validate and optimize classification models. + +.. note:: + For now, "___" is a symbol to distinguish the multi-label format. So, it must be included at the front of the label name. + +****** +Models +****** +We use the same models as for Multi-class classification. Please, refer: :ref:`Classification Models `. + +To see which models are available for the task, the following command can be executed: + +.. code-block:: shell + + (otx) ...$ otx find --task MULTI_LABEL_CLS + +In the table below the `mAP `_ metrics on some academic datasets using our :ref:`supervised pipeline ` are presented. The results were obtained on our templates without any changes (including input resolution, which is 224x224 for all templates). We trained each model with single Nvidia GeForce RTX3090. + ++-----------------------+-----------------+-----------+------------------+-----------+ +| Model name | Pascal-VOC 2007 | COCO 2014 | Aerial Maritime | Mean mAP | ++=======================+=================+===========+==================+===========+ +| MobileNet-V3-large-1x | 86.14 | 67.94 | 69.61 | 74.56 | ++-----------------------+-----------------+-----------+------------------+-----------+ +| EfficientNet-B0 | 86.07 | 67.87 | 73.83 | 75.92 | ++-----------------------+-----------------+-----------+------------------+-----------+ +| EfficientNet-V2-S | 91.91 | 77.28 | 71.52 | 80.24 | ++-----------------------+-----------------+-----------+------------------+-----------+ diff --git a/docs/source/guide/explanation/algorithms/index.rst b/docs/source/guide/explanation/algorithms/index.rst new file mode 100644 index 00000000000..6092f6a355e --- /dev/null +++ b/docs/source/guide/explanation/algorithms/index.rst @@ -0,0 +1,30 @@ +Algorithms +========== + +.. _algo_section_ref: + +OpenVINO™ Training Extensions supports different training types to solve a variety of computer vision problems. This section provides what exactly we utilize inside our algorithms providing an end-to-end solution to solve real-life computer vision problems. + + +To this end, we support: + +- **Supervised training**. This is the most common approach for computer vision tasks such as object detection and image classification. Supervised learning involves training a model on a labeled dataset of images. The model learns to associate specific features in the images with the corresponding labels. + +- **Incremental learning**. This learning approach lets the model train on new data as it becomes available, rather than retraining the entire model on the whole dataset every time new data is added. OpenVINO™ Training Extensions supports also the class incremental approach for all tasks. In this approach, the model is first trained on a set of classes, and then incrementally updated with new classes of data, while keeping the previously learned classes' knowledge. The class incremental approach is particularly useful in situations where the number of classes is not fixed and new classes may be added over time. + + +******** +Contents +******** + + +.. 
toctree::
+   :maxdepth: 2
+   :titlesonly:
+
+
+   classification/index
+   object_detection/index
+   segmentation/index
+   anomaly/index
+   action/index
+   visual_prompting/index
diff --git a/docs/source/guide/explanation/algorithms/object_detection/index.rst b/docs/source/guide/explanation/algorithms/object_detection/index.rst
new file mode 100644
index 00000000000..0f5cbd2840c
--- /dev/null
+++ b/docs/source/guide/explanation/algorithms/object_detection/index.rst
@@ -0,0 +1,8 @@
+Object Detection
+================
+
+.. toctree::
+   :maxdepth: 1
+
+
+   object_detection
diff --git a/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst b/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst
new file mode 100644
index 00000000000..64e926b8928
--- /dev/null
+++ b/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst
@@ -0,0 +1,125 @@
+Object Detection
+================
+
+Object detection is a computer vision task that locates objects in an image by finding their bounding-box coordinates and assigning a class to each of them.
+The input is an image, and the output is a pair of bounding-box corner coordinates and a class label for each detected object.
+
+The common approach to building an object detection architecture is to take a feature extractor (backbone), which can be inherited from the classification task.
+A head then predicts coordinates and class probabilities based on the aggregated image features.
+Additionally, some architectures use a `Feature Pyramid Network (FPN) `_, called the neck, to transfer and process feature maps between the backbone and the head.
+
+For supervised training we use the following algorithm components:
+
+.. _od_supervised_pipeline:
+
+- ``Augmentations``: We use random crop, random rotate, simple brightness and color distortions, and multiscale training in the training pipeline.
+
+- ``Optimizer``: We use `SGD `_ optimizer with the weight decay set to **1e-4** and momentum set to **0.9**.
+
+- ``Learning rate schedule``: `ReduceLROnPlateau `_. This learning rate scheduler has proven its efficiency in dataset-agnostic training; it drops the LR after a period without improvement of the target accuracy metric. We also extend it with an ``iteration_patience`` parameter, which ensures that a certain number of training iterations (steps through the dataset) have passed before the LR is dropped.
+
+- ``Loss function``: We use `Generalized IoU Loss `_ as the localization loss, which trains the model's ability to find object coordinates. For the classification head, we use a standard `FocalLoss `_.
+
+- ``Additional training techniques``
+   - ``Early stopping``: To add adaptability to the training pipeline and prevent overfitting.
+   - `Anchor clustering for SSD `_: This model relies heavily on the predefined anchor-box hyperparameters, which determine the sizes of objects that can be detected. Therefore, before training, we collect object statistics from the dataset, cluster them, and adjust the anchor-box sizes to best fit the objects the model is going to detect.
+   - ``Backbone pretraining``: we pretrained the MobileNetV2 backbone on the large `ImageNet21k `_ dataset to improve the feature extractor so that models learn better and faster.
+
+
+**************
+Dataset Format
+**************
+
+Currently, we support the `COCO `_ and
+`Pascal-VOC `_ dataset formats.
+Learn more about the formats by following the links above. Here is an example of the expected layout for a COCO dataset:
+
+..
code:: + + ├── annotations/ + ├── instances_train.json + ├── instances_val.json + └── instances_test.json + ├──images/ + (Split is optional) + ├── train + ├── val + └── test + +.. note:: + + Please, refer to our :doc:`dedicated tutorial <../../../tutorials/base/how_to_train/detection>` for more information how to train, validate and optimize detection models. + +****** +Models +****** + +We support the following ready-to-use model templates: + ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ +| Template ID | Name | Complexity (GFLOPs) | Model size (MB) | ++============================================================================================================================================================+=====================+=====================+=================+ +| `Custom_Object_Detection_YOLOX `_ | YOLOX-TINY | 6.5 | 20.4 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ +| `Object_Detection_YOLOX_S `_ | YOLOX_S | 33.51 | 46.0 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ +| `Object_Detection_YOLOX_L `_ | YOLOX_L | 194.57 | 207.0 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ +| `Object_Detection_YOLOX_X `_ | YOLOX_X | 352.42 | 378.0 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ +| `Custom_Object_Detection_Gen3_SSD `_ | SSD | 9.4 | 7.6 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ +| `Custom_Object_Detection_Gen3_ATSS `_ | MobileNetV2-ATSS | 20.6 | 9.1 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ +| `Object_Detection_ResNeXt101_ATSS `_ | ResNeXt101-ATSS | 434.75 | 344.0 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ + +Above table can be found using the following command + +.. code-block:: shell + + (otx) ...$ otx find --task DETECTION + +`MobileNetV2-ATSS `_ is a good medium-range model that works well and fast in most cases. +`SSD `_ and `YOLOX `_ are light models, that a perfect for the fastest inference on low-power hardware. +YOLOX achieved the same accuracy as SSD, and even outperforms its inference on CPU 1.5 times, but requires 3 times more time for training due to `Mosaic augmentation `_, which is even more than for ATSS. 
+So if you have resources for a long training, you can pick the YOLOX model. +ATSS still shows good performance among `RetinaNet `_ based models. Therfore, We added ATSS with large scale backbone, ResNeXt101-ATSS. We integrated large ResNeXt101 backbone to our Custom ATSS head, and it shows good transfer learning performance. +In addition, we added a YOLOX variants to support users' diverse situations. + +In the table below the test mAP on some academic datasets using our :ref:`supervised pipeline ` is presented. + +For `COCO `__ dataset the accuracy of pretrained weights is shown, and we report official COCO mAP with AP50. +Except for COCO, we report AP50 as performance metric. + +5 datasets were selected as transfer learning datasets. +`BDD100K `_ is the largest dataset among we used. 70000 images are used as train images and 10000 images are used for validation. +`Brackish `_ and `Plantdoc `_ are datasets of medium size. They have around 10000 images for train and 1500 images for validation. +`BCCD `_ and `Chess pieces `_ are datasets of small size. They have around 300 images for train and 100 images for validation. +We used our own templates without any modification. +For hyperparameters, please, refer to the related template. +We trained each model with a single Nvidia GeForce RTX3090. + ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| Model name | COCO(AP50) | BDD100K | Brackish | Plantdoc | BCCD | Chess pieces | ++============================+==================+===========+===========+===========+===========+==============+ +| YOLOX-TINY | 31.0 (48.2) | 24.8 | 96.3 | 51.5 | 88.5 | 99.2 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| SSD | 13.5 | 28.2 | 96.5 | 52.9 | 91.1 | 99.1 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| MobileNetV2-ATSS | 32.5 (49.5) | 40.2 | 99.1 | 63.4 | 93.4 | 99.1 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| ResNeXt101-ATSS | 45.1 (63.8) | 45.5 | 99.3 | 69.3 | 93.1 | 99.1 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| ResNet50-Deformable-DETR | 44.3 (63.2) | 44.8 | 97.7 | 60.7 | 93.4 | 99.2 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| ResNet50-DINO | 49.0 (66.4) | 47.2 | 99.5 | 62.9 | 93.5 | 99.1 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| ResNet50-Lite-DINO | 48.1 (64.4) | 47.0 | 99.0 | 62.5 | 93.6 | 99.4 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| YOLOX-S | 40.3 (59.1) | 37.1 | 93.6 | 54.8 | 92.7 | 98.8 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| YOLOX-L | 49.4 (67.1) | 44.5 | 94.6 | 55.8 | 91.8 | 99.0 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ +| YOLOX-X | 50.9 (68.4) | 44.2 | 96.3 | 56.2 | 91.5 | 98.9 | ++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+ diff --git a/docs/source/guide/explanation/algorithms/segmentation/index.rst 
b/docs/source/guide/explanation/algorithms/segmentation/index.rst
new file mode 100644
index 00000000000..e9bb1068962
--- /dev/null
+++ b/docs/source/guide/explanation/algorithms/segmentation/index.rst
@@ -0,0 +1,9 @@
+Segmentation
+============
+
+.. toctree::
+   :maxdepth: 1
+
+
+   semantic_segmentation
+   instance_segmentation
diff --git a/docs/source/guide/explanation/algorithms/segmentation/instance_segmentation.rst b/docs/source/guide/explanation/algorithms/segmentation/instance_segmentation.rst
new file mode 100644
index 00000000000..69f80931290
--- /dev/null
+++ b/docs/source/guide/explanation/algorithms/segmentation/instance_segmentation.rst
@@ -0,0 +1,83 @@
+Instance Segmentation
+=====================
+
+Instance segmentation is a computer vision task that involves identifying and segmenting individual objects within an image.
+
+It is a more advanced version of object detection, as it doesn't only detect the presence of an object in an image but also segments the object by creating a mask that separates it from the background. This allows more detailed information about the object, such as its shape and location, to be extracted.
+
+Instance segmentation is commonly used in applications such as self-driving cars, robotics, and image-editing software.
+
+.. _instance_segmentation_image_example:
+
+
+.. image:: ../../../../../utils/images/instance_seg_example.png
+  :width: 600
+
+|
+
+We solve this problem in the `MaskRCNN `_ manner. The main idea of Mask R-CNN is to add a branch for predicting an object mask in parallel with the existing branch for bounding box regression and object classification.
+
+This is done by using a fully convolutional network (FCN) on top of the feature map generated by the last convolutional layer of the backbone network. The model first generates region proposals and then uses a RoIAlign layer to align the region proposals with the feature map; the FCN then predicts the class and box offset for each proposal, as well as a mask for each class.
+
+The mask branch is trained to predict a binary mask for each object instance, where the mask is aligned with the object's bounding box and has the same size as the region of interest (RoI). The predicted mask is then used to segment the object from the background.
+
+
+For supervised training we use the following algorithm components:
+
+.. _instance_segmentation_supervised_pipeline:
+
+- ``Augmentations``: We use only random flip for both the training and validation augmentation pipelines.
+
+- ``Optimizer``: We use `SGD `_ optimizer with the weight decay set to **1e-4** and momentum set to **0.9**.
+
+- ``Learning rate schedule``: For scheduling the training process we use **ReduceLROnPlateau** with linear learning rate warmup for **200** iterations. This method monitors a target metric (in our case, a metric on the validation set) and if no improvement is seen for a ``patience`` number of epochs, the learning rate is reduced.
+
+- ``Loss functions``: For the bounding box regression we use **L1 Loss** (the sum of the absolute differences between the ground truth value and the predicted value), and `Cross Entropy Loss `_ for category classification and segmentation mask prediction.
+
+- Additionally, we use the **Exponential Moving Average (EMA)** for the model's weights and **early stopping** to add adaptability to the training pipeline and prevent overfitting (see the sketch after this list).
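+
+The actual training loop is handled by OTX, but a rough PyTorch-level sketch of the optimizer, learning-rate schedule, and EMA bookkeeping listed above might look like this (the learning rate, patience, EMA decay, and the toy model are illustrative placeholders, and the 200-iteration linear warmup is omitted for brevity):
+
+.. code-block:: python
+
+    import torch
+    from torch import nn
+    from torch.optim.lr_scheduler import ReduceLROnPlateau
+
+    model = nn.Conv2d(3, 8, kernel_size=3)  # stand-in for the Mask R-CNN network
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.01,
+                                momentum=0.9, weight_decay=1e-4)
+    # Reduce the LR when the monitored validation metric stops improving.
+    scheduler = ReduceLROnPlateau(optimizer, mode="max", factor=0.1, patience=5)
+
+    # Keep an exponential moving average of the weights, updated after every step.
+    ema_decay = 0.999
+    ema_state = {k: v.detach().clone().float() for k, v in model.state_dict().items()}
+
+    def update_ema() -> None:
+        for name, param in model.state_dict().items():
+            ema_state[name].mul_(ema_decay).add_(param.detach().float(),
+                                                 alpha=1.0 - ema_decay)
+
+    # After each validation epoch, feed the monitored metric (e.g. mAP) to the scheduler.
+    scheduler.step(0.42)  # placeholder metric value
+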
+ +************** +Dataset Format +************** + +For the dataset handling inside OpenVINO™ Training Extensions, we use `Dataset Management Framework (Datumaro) `_. For instance segmentation we support `COCO `_ dataset format. + +.. note:: + + Please, refer to our :doc:`dedicated tutorial <../../../tutorials/base/how_to_train/instance_segmentation>` how to train, validate and optimize instance segmentation model for more details. + +****** +Models +****** + +We support the following ready-to-use model templates: + ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------+---------------------+-----------------+ +| Template ID | Name | Complexity (GFLOPs) | Model size (MB) | ++===============================================================================================================================================================================================================+============================+=====================+=================+ +| `Custom_Counting_Instance_Segmentation_MaskRCNN_EfficientNetB2B `_ | MaskRCNN-EfficientNetB2B | 68.48 | 13.27 | ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------+---------------------+-----------------+ +| `Custom_Counting_Instance_Segmentation_MaskRCNN_ResNet50 `_ | MaskRCNN-ResNet50 | 533.80 | 177.90 | ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------+---------------------+-----------------+ + +Above table can be found using the following command + +.. code-block:: shell + + (otx) ...$ otx find --task INSTANCE_SEGMENTATION + +MaskRCNN-ResNet50 utilizes the `ResNet-50 `_ architecture as the backbone network for extracting image features. This choice of backbone network results in a higher number of parameters and FLOPs, which consequently requires more training time. However, the model offers superior performance in terms of accuracy. + +On the other hand, MaskRCNN-EfficientNetB2B employs the `EfficientNet-B2 `_ architecture as the backbone network. This selection strikes a balance between accuracy and speed, making it a preferable option when prioritizing training time and computational cost. + +Recently, we have made updates to MaskRCNN-ConvNeXt, incorporating the `ConvNeXt backbone `_. Through our experiments, we have observed that this variant achieves better accuracy compared to MaskRCNN-ResNet50 while utilizing less GPU memory. However, it is important to note that the training time and inference duration may slightly increase. If minimizing training time is a significant concern, we recommend considering a switch to MaskRCNN-EfficientNetB2B. + +In the table below the `mAP `_ metric on some academic datasets using our :ref:`supervised pipeline ` is presented. The results were obtained on our templates without any changes. We use 1024x1024 image resolution, for other hyperparameters, please, refer to the related template. We trained each model with single Nvidia GeForce RTX3090. 
+
++---------------------------+--------------+------------+-----------------+
+| Model name                | ADE20k       | Cityscapes | Pascal-VOC 2007 |
++===========================+==============+============+=================+
+| MaskRCNN-EfficientNetB2B  | N/A          | N/A        | N/A             |
++---------------------------+--------------+------------+-----------------+
+| MaskRCNN-ResNet50         | N/A          | N/A        | N/A             |
++---------------------------+--------------+------------+-----------------+
diff --git a/docs/source/guide/explanation/algorithms/segmentation/semantic_segmentation.rst b/docs/source/guide/explanation/algorithms/segmentation/semantic_segmentation.rst
new file mode 100644
index 00000000000..020c09488bb
--- /dev/null
+++ b/docs/source/guide/explanation/algorithms/segmentation/semantic_segmentation.rst
@@ -0,0 +1,99 @@
+Semantic Segmentation
+=====================
+
+Semantic segmentation is a computer vision task in which an algorithm assigns a label or class to each pixel in an image.
+For example, semantic segmentation can be used to identify the boundaries of different objects in an image, such as cars, buildings, and trees.
+The output of semantic segmentation is typically an image where each pixel is colored with a different color or label depending on its class.
+
+.. _semantic_segmentation_image_example:
+
+
+.. image:: ../../../../../utils/images/semantic_seg_example.png
+  :width: 600
+  :alt: image uploaded from this `source `_
+
+|
+
+We solve this task by utilizing `FCN Head `_ with implementation from `MMSegmentation `_ on the multi-level image features obtained by the feature extractor backbone (`Lite-HRNet `_).
+For supervised training we use the following algorithm components:
+
+.. _semantic_segmentation_supervised_pipeline:
+
+- ``Augmentations``: Besides basic augmentations like random flip, random rotate, and random crop, we use an image-mixing technique together with different `photometric distortions `_.
+
+- ``Optimizer``: We use `Adam `_ optimizer with weight decay set to zero and gradient clipping with the maximum quadratic (L2) norm set to 40.
+
+- ``Learning rate schedule``: For scheduling the training process we use **ReduceLROnPlateau** with linear learning rate warmup for 100 iterations. This method monitors a target metric (in our case, a metric on the validation set) and if no improvement is seen for a ``patience`` number of epochs, the learning rate is reduced.
+
+- ``Loss function``: We use standard `Cross Entropy Loss `_ to train a model.
+
+- ``Additional training techniques``
+   - ``Early stopping``: To add adaptability to the training pipeline and prevent overfitting.
+
+**************
+Dataset Format
+**************
+
+For the dataset handling inside OpenVINO™ Training Extensions, we use `Dataset Management Framework (Datumaro) `_.
+
+Currently, we support the `Common Semantic Segmentation `_ data format.
+If your dataset is organized in the supported format, starting training is very simple: we just need to pass a path to the root folder and the desired model template, as shown in the example after the notes below.
+
+.. note::
+
+   Due to some internal limitations, the dataset should always include a "background" label. If your dataset doesn't have a background label, rename the first label to "background" in the ``meta.json`` file.
+
+
+.. note::
+
+   Currently, metrics of models trained with our OTX dataset adapter can differ from popular benchmarks. To avoid this and train the model on exactly the same segmentation masks as intended by the authors, please set the parameter ``use_otx_adapter`` to ``False``.
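+
+For example, training could be started with a command along the following lines (the config path and data root are placeholders; the exact recipe file depends on the template you pick):
+
+.. code-block:: shell
+
+    (otx) ...$ otx train --config <path/to/semantic_segmentation_template.yaml> \
+                         --data_root <path/to/dataset/root>
+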
+ +****** +Models +****** +.. _semantic_segmentation_models: + +We support the following ready-to-use model templates: + ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+---------------------+-----------------+ +| Template ID | Name | Complexity (GFLOPs) | Model size (MB) | ++======================================================================================================================================================================================+========================+=====================+=================+ +| `Custom_Semantic_Segmentation_Lite-HRNet-s-mod2_OCR `_ | Lite-HRNet-s-mod2 | 1.44 | 3.2 | ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+---------------------+-----------------+ +| `Custom_Semantic_Segmentation_Lite-HRNet-18-mod2_OCR `_ | Lite-HRNet-18-mod2 | 2.82 | 4.3 | ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+---------------------+-----------------+ +| `Custom_Semantic_Segmentation_Lite-HRNet-x-mod3_OCR `_ | Lite-HRNet-x-mod3 | 9.20 | 5.7 | ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+---------------------+-----------------+ +| `Custom_Semantic_Segmentation_SegNext_T `_ | SegNext-t | 6.07 | 4.23 | ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+---------------------+-----------------+ +| `Custom_Semantic_Segmentation_SegNext_S `_ | SegNext-s | 15.35 | 13.9 | ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+---------------------+-----------------+ +| `Custom_Semantic_Segmentation_SegNext_B `_ | SegNext-b | 32.08 | 27.56 | ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+---------------------+-----------------+ + +All of these models are members of the same `Lite-HRNet `_ backbones family. They differ in the trade-off between accuracy and inference/training speed. ``Lite-HRNet-x-mod3`` is the template with heavy-size architecture for accurate predictions but it requires long training. +Whereas the ``Lite-HRNet-s-mod2`` is the lightweight architecture for fast inference and training. It is the best choice for the scenario of a limited amount of data. The ``Lite-HRNet-18-mod2`` model is the middle-sized architecture for the balance between fast inference and training time. + +Use `SegNext `_ model which can achieve superior perfomance while preserving fast inference and fast training. + +In the table below the `Dice score `_ on some academic datasets using our :ref:`supervised pipeline ` is presented. 
We use 512x512 image crop resolution, for other hyperparameters, please, refer to the related template. We trained each model with single Nvidia GeForce RTX3090. + ++-----------------------+--------------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------------------+-----------------------------------------------------------------+--------+ +| Model name | `DIS5K `_ | `Cityscapes `_ | `Pascal-VOC 2012 `_ | `KITTI full `_ | Mean | ++=======================+==============================================================+=====================================================+======================================================================+=================================================================+========+ +| Lite-HRNet-s-mod2 | 79.95 | 62.38 | 58.26 | 36.06 | 59.16 | ++-----------------------+--------------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------------------+-----------------------------------------------------------------+--------+ +| Lite-HRNet-18-mod2 | 81.12 | 65.04 | 63.48 | 39.14 | 62.20 | ++-----------------------+--------------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------------------+-----------------------------------------------------------------+--------+ +| Lite-HRNet-x-mod3 | 79.98 | 59.97 | 61.9 | 41.55 | 60.85 | ++-----------------------+--------------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------------------+-----------------------------------------------------------------+--------+ +| SegNext-t | 85.05 | 70.67 | 80.73 | 51.25 | 68.99 | ++-----------------------+--------------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------------------+-----------------------------------------------------------------+--------+ +| SegNext-s | 85.62 | 70.91 | 82.31 | 52.94 | 69.82 | ++-----------------------+--------------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------------------+-----------------------------------------------------------------+--------+ +| SegNext-b | 87.92 | 76.94 | 85.01 | 55.49 | 73.45 | ++-----------------------+--------------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------------------+-----------------------------------------------------------------+--------+ + +.. note:: + + Please, refer to our :doc:`dedicated tutorial <../../../tutorials/base/how_to_train/semantic_segmentation>` for more information on how to train, validate and optimize the semantic segmentation model. 
diff --git a/docs/source/guide/explanation/algorithms/visual_prompting/fine_tuning.rst b/docs/source/guide/explanation/algorithms/visual_prompting/fine_tuning.rst
new file mode 100644
index 00000000000..f2838a6ece5
--- /dev/null
+++ b/docs/source/guide/explanation/algorithms/visual_prompting/fine_tuning.rst
@@ -0,0 +1,102 @@
+Visual Prompting (Fine-tuning)
+==================================
+
+Visual prompting is a computer vision task that uses a combination of an image and prompts, such as texts, bounding boxes, or points, to solve various problems.
+Using these prompts, the main purpose of this task is to obtain labels for unlabeled datasets and to use the generated label information in particular domains or to develop a new model with it.
+
+This section examines the solutions for visual prompting offered by the OpenVINO Training Extensions library.
+`Segment Anything (SAM) `_ is one of the most famous visual prompting methods, and this model is used to adapt to a new dataset domain.
+Because `SAM `_ was trained on a web-scale dataset and has a huge backbone network, fine-tuning the whole network is difficult and requires a lot of resources.
+Therefore, in this section, we fine-tune only the mask decoder, for just several epochs, to increase performance on the new dataset domain.
+For fine-tuning `SAM `_, we use the following algorithm components:
+
+.. _visual_prompting_finetuning_pipeline:
+
+- ``Pre-processing``: Resize an image according to the longest axis and pad the rest with zero.
+
+- ``Optimizer``: We use the `Adam `_ optimizer.
+
+- ``Loss function``: We use the standard loss combination used in `SAM `_ as-is: 20 * focal loss + dice loss + IoU loss.
+
+- ``Additional training techniques``
+   - ``Early stopping``: To add adaptability to the training pipeline and prevent overfitting. Early stopping will be automatically applied.
+
+
+.. note::
+
+   Currently, only fine-tuning `SAM `_ with bounding boxes is supported in OpenVINO Training Extensions.
+   We will support fine-tuning with other prompts (points and texts) and continuous fine-tuning with predicted mask information in the near future.
+
+.. note::
+
+   Currently, only Post-Training Quantization (PTQ) is supported for `SAM `_, not Quantization Aware Training (QAT).
+
+
+**************
+Dataset Format
+**************
+.. _visual_prompting_dataset:
+
+For the dataset handling inside OpenVINO™ Training Extensions, we use `Dataset Management Framework (Datumaro) `_.
+
+We support three dataset formats for visual prompting:
+
+- `Common Semantic Segmentation `_ for semantic segmentation
+
+- `COCO `_ for instance segmentation
+
+- `Pascal VOC `_ for instance segmentation and semantic segmentation
+
+.. note::
+
+   During training, mDice for binary masks without label information is used as the train/validation metric.
+   After training, if ``otx test`` is used to evaluate performance, mDice for binary or multi-class masks with label information will be used.
+   As you can expect, performance can differ between ``otx train`` and ``otx test``, but if the unlabeled mask performance is high, the labeled mask performance tends to be high as well.
+
+
+******
+Models
+******
+..
+
+.. note::
+
+    Currently, only fine-tuning `SAM `_ with bounding boxes is supported in OpenVINO Training Extensions.
+    We will support fine-tuning with other prompts (points and text) and continuous fine-tuning with predicted mask information in the near future.
+
+.. note::
+
+    Currently, only Post-Training Quantization (PTQ) is supported for `SAM `_, not Quantization Aware Training (QAT).
+
+
+**************
+Dataset Format
+**************
+.. _visual_prompting_dataset:
+
+For the dataset handling inside OpenVINO™ Training Extensions, we use the `Dataset Management Framework (Datumaro) `_.
+
+We support three dataset formats for visual prompting:
+
+- `Common Semantic Segmentation `_ for semantic segmentation
+
+- `COCO `_ for instance segmentation
+
+- `Pascal VOC `_ for instance segmentation and semantic segmentation
+
+.. note::
+
+    During training, mDice for binary masks without label information is used as the train/validation metric.
+    After training, if ``otx test`` is used to evaluate performance, mDice for binary or multi-class masks with label information will be used.
+    As you can expect, performance will differ between ``otx train`` and ``otx test``, but if unlabeled mask performance is high, labeled mask performance will be high as well.
+
+
+******
+Models
+******
+.. _visual_prompting_model:
+
+We support the following model templates in the experimental phase:
+
++-------------------------------------+--------------+---------------------+-----------------+
+| Template ID                         | Name         | Complexity (GFLOPs) | Model size (MB) |
++=====================================+==============+=====================+=================+
+| `Visual_Prompting_SAM_Tiny_ViT `_   | SAM_Tiny_ViT | 38.95               | 47              |
++-------------------------------------+--------------+---------------------+-----------------+
+| `Visual_Prompting_SAM_ViT_B `_      | SAM_ViT_B    | 483.71              | 362             |
++-------------------------------------+--------------+---------------------+-----------------+
+
+To check the feasibility of `SAM `_, we did experiments using three public datasets from different domains: `WGISD `_, `Trashcan `_, and `FLARE22 `_, and checked the `Dice score `_.
+We used sampled training data from `Trashcan `_ and `FLARE22 `_, and the full training data (=110 images) from `WGISD `_.
+
++----------------+---------------------+
+| Dataset        | #samples            |
++================+=====================+
+| `WGISD `_      | 110                 |
++----------------+---------------------+
+| `Trashcan `_   | 500                 |
++----------------+---------------------+
+| `FLARE22 `_    | 1 CT (=100 slices)  |
++----------------+---------------------+
+
+The table below shows the performance improvement after fine-tuning.
+
++------------+------------------------+------------------------+------------------------+
+| Model name | `WGISD `_              | `Trashcan `_           | `FLARE22 `_            |
++============+========================+========================+========================+
+| Tiny_ViT   | 90.32 → 92.29 (+1.97)  | 82.38 → 85.01 (+2.63)  | 89.69 → 93.05 (+3.36)  |
++------------+------------------------+------------------------+------------------------+
+| ViT_B      | 92.32 → 92.46 (+0.14)  | 79.61 → 81.50 (+1.89)  | 91.48 → 91.68 (+0.20)  |
++------------+------------------------+------------------------+------------------------+
+
+Depending on the dataset, the ``learning rate`` and ``batch size`` can be adjusted as below:
+
+..
code-block:: shell + + (otx) ...$ otx train --config \ + --data_root \ + --data.config.train_subset.batch_size \ + --optimizer.lr diff --git a/docs/source/guide/explanation/algorithms/visual_prompting/index.rst b/docs/source/guide/explanation/algorithms/visual_prompting/index.rst new file mode 100644 index 00000000000..8910b1101b5 --- /dev/null +++ b/docs/source/guide/explanation/algorithms/visual_prompting/index.rst @@ -0,0 +1,9 @@ +Visual Prompting +================ + +.. toctree:: + :maxdepth: 1 + + + fine_tuning + zero_shot diff --git a/docs/source/guide/explanation/algorithms/visual_prompting/zero_shot.rst b/docs/source/guide/explanation/algorithms/visual_prompting/zero_shot.rst new file mode 100644 index 00000000000..daacbef51b4 --- /dev/null +++ b/docs/source/guide/explanation/algorithms/visual_prompting/zero_shot.rst @@ -0,0 +1,93 @@ +Visual Prompting (Zero-shot learning) +===================================== + +Visual prompting is a computer vision task that uses a combination of an image and prompts, such as texts, bounding boxes, points, and so on to troubleshoot problems. +Using these useful prompts, the main purpose of this task is to obtain labels from unlabeled datasets, and to use generated label information on particular domains or to develop a new model with the generated information. + +This section examines the solutions for visual prompting offered by the OpenVINO Training Extensions library. +`Segment Anything (SAM) `_, is one of the most famous visual prompting methods and this model will be used to adapt a new dataset domain. +Especially, in this section, we try to automatically predict given images without any training, called as ``zero-shot learning``. +Unlike fine-tuning, zero-shot learning needs only pre-processing component. + + +.. _visual_prompting_zeroshot_pipeline: + +- ``Pre-processing``: Resize an image according to the longest axis and pad the rest with zero. + + +.. note:: + + Currently, zero-shot learning with `SAM `_ with bounding boxes in the OpenVINO Training Extensions is only supported. + We will support zero-shot learning with other prompts (points and texts) in the near future. + +.. note:: + + Currently, Post-Training Quantization (PTQ) for `SAM `_ is only supported, not Quantization Aware Training (QAT). + + +************** +Dataset Format +************** +.. _visual_prompting_dataset: + +For the dataset handling inside OpenVINO™ Training Extensions, we use `Dataset Management Framework (Datumaro) `_. + +We support three dataset formats for visual prompting: + +- `Common Semantic Segmentation `_ for semantic segmentation + +- `COCO `_ for instance segmentation + +- `Pascal VOC `_ for instance segmentation and semantic segmentation + + +****** +Models +****** +.. 
_visual_prompting_zero_shot_model:
+
+We support the following model templates in the experimental phase:
+
++------------------------------+------------------------+---------------------+-----------------+
+| Template ID                  | Name                   | Complexity (GFLOPs) | Model size (MB) |
++==============================+========================+=====================+=================+
+| `Zero_Shot_SAM_Tiny_ViT `_   | Zero_Shot_SAM_Tiny_ViT | 38.18               | 25              |
++------------------------------+------------------------+---------------------+-----------------+
+
+***************
+Simple tutorial
+***************
+.. _visual_prompting_zero_shot_tutorial:
+
+There are two steps for zero-shot inference: ``learn`` and ``infer``.
+``Learn`` extracts reference features from the given reference images and prompts. These extracted reference features are used to get point candidates on the given target images.
+The extracted reference features are saved in the model checkpoint (such as ``weight.pth``) together with the model.
+You can run the ``learn`` stage with the following command:
+
+.. code-block:: shell
+
+    (otx) ...$ otx train --config \
+        --data_root
+
+``Infer`` predicts masks on the given target images. Unlike ``learn``, this stage doesn't need any prompt information.
+
+.. code-block:: shell
+
+    (otx) ...$ otx test --config \
+        --data_root \
+        --checkpoint
+
+
+For example, when the positive (green) and negative (red) points are given with the reference image at the ``learn`` stage, you get the basic `SAM `_ prediction result (left).
+If you give the same reference image as the target image at the ``infer`` stage, you get the target prediction results (right).
+
+.. list-table::
+
+   * - .. figure:: ../../../../../utils/images/vpm_ref_result.png
+
+     - .. figure:: ../../../../../utils/images/vpm_ref_prediction.png
+
+
+You can get target prediction results for other images, as shown below.
+
+.. image:: ../../../../../utils/images/vpm_tgt_prediction.png
diff --git a/docs/source/guide/get_started/api_tutorial.rst b/docs/source/guide/get_started/api_tutorial.rst
new file mode 100644
index 00000000000..83fa6f52ca5
--- /dev/null
+++ b/docs/source/guide/get_started/api_tutorial.rst
@@ -0,0 +1,501 @@
+OpenVINO™ Training Extensions API Quick-Start
+==============================================
+
+Besides the CLI functionality, OpenVINO™ Training Extensions provides APIs that help developers integrate OpenVINO™ Training Extensions models into their projects.
+This tutorial shows how to create a dataset and a model, and how to use all of the CLI functionality through the APIs.
+
+For demonstration purposes we will use the SSD object detection model with the `WGISD `_ public dataset, as we did for the :doc:`CLI tutorial <../tutorials/base/how_to_train/detection>`.
+
+.. note::
+
+    To start with, we need to `install OpenVINO™ Training Extensions `_.
+
+*******************
+Dataset preparation
+*******************
+
+1. Clone the repository with the `WGISD dataset `_:
+
+.. code-block:: shell
+
+    cd data
+    git clone https://github.com/thsant/wgisd.git
+    cd wgisd
+    git checkout 6910edc5ae3aae8c20062941b1641821f0c30127
+
+2. 
We need to rename annotations to +be distinguished by OpenVINO™ Training Extensions Datumaro manager: + +.. code-block:: shell + + mv data images && mv coco_annotations annotations && mv annotations/train_bbox_instances.json instances_train.json && mv annotations/test_bbox_instances.json instances_val.json + +Now it is all set to use this dataset inside OpenVINO™ Training Extensions + +************************************ +Quick Start with auto-configuration +************************************ + +Once the dataset is ready, we can immediately start training with the model and data pipeline recommended by OTX through auto-configuration. +The following code snippet demonstrates how to use the auto-configuration feature: + +.. code-block:: python + + from otx.engine import Engine + + engine = Engine(data_root="data/wgisd") + engine.train() + + +.. note:: + + If dataset supports multiple Task types, this will default to the Task type detected by OTX. + If you want to specify a specific Task type, you need to specify it like below: + + .. code-block:: python + + from otx.engine import Engine + + engine = Engine(data_root="data/wgisd", task="INSTANCE_SEGMENTATION") + engine.train() + + +********************************** +Check Available Model Recipes +********************************** + +If you want to use other models offered by OTX besides the ones provided by Auto-Configuration, you can get a list of available models in OTX as shown below. + +.. code-block:: python + + from otx.engine.utils.api import list_models + + model_lists = list_models(task="DETECTION") + print(model_lists) + + ''' + [ + 'yolox_tiny_tile', + 'yolox_x', + 'yolox_l_tile', + 'yolox_x_tile', 'yolox_l', + 'atss_r50_fpn', + 'ssd_mobilenetv2', + 'yolox_s', + 'yolox_tiny', + 'openvino_model', + 'atss_mobilenetv2', + 'yolox_s_tile', + 'rtmdet_tiny', + 'atss_mobilenetv2_tile', + 'atss_resnext101', + 'ssd_mobilenetv2_tile', + ] + ''' + + +.. note:: + + If you're looking for a specific name, use the pattern argument. + + .. code-block:: python + + from otx.engine.utils.api import list_models + + model_lists = list_models(task="DETECTION", pattern="tile") + print(model_lists) + ''' + [ + 'yolox_tiny_tile', + 'ssd_mobilenetv2_tile', + 'yolox_l_tile', + 'yolox_s_tile', + 'yolox_x_tile', + 'atss_mobilenetv2_tile', + ] + ''' + + +You can also find the available model recipes in YAML form in the folder ``otx/recipe``. + +********* +Engine +********* + +The ``otx.engine.Engine`` class is the main entry point for using OpenVINO™ Training Extensions APIs. + +1. Setting ``task`` + +Specify ``task``. This is the task type for that ``Engine`` usage. +You can set the task by referencing the ``OTXTaskType`` in ``otx.core.types.task``. +If no task is specified, the task is detected and used via ``datamodule`` or ``data_root``. + +.. code-block:: python + + from otx.core.types.task import OTXTaskType + from otx.engine import Engine + + engine = Engine(task=OTXTaskType.DETECTION) + # or + engine = Engine(task="DETECTION") + +2. Setting ``work_dir`` + +Specify ``work_dir``. This is the workspace for that ``Engine``, and where output is stored. +The default value is currently ``./otx-workspace``. + +.. code-block:: python + + from otx.engine import Engine + + engine = Engine(work_dir="work_dir") + + +3. Setting device + +You can set the device by referencing the ``DeviceType`` in ``otx.core.types.device``. +The current default setting is ``auto``. + +.. 
code-block:: python + + from otx.core.types.device import DeviceType + from otx.engine import Engine + + engine = Engine(device=DeviceType.gpu) + # or + engine = Engine(device="gpu") + + +In addition, the ``Engine`` constructor can be associated with the Trainer's constructor arguments to control the Trainer's functionality. +Refer `lightning.Trainer `_. + +4. Using the OTX configuration we can configure the Engine. + +.. code-block:: python + + from otx.engine import Engine + + recipe = "src/otx/recipe/detection/atss_mobilenetv2.yaml" + engine = Engine.from_config( + config_path=recipe, + data_root="data/wgisd", + work_dir="./otx-workspace", + ) + + +********* +Training +********* + +Create an output model and start actual training: + +1. Below is an example using the ``atss_mobilenetv2`` model provided by OTX. + +.. code-block:: python + + from otx.engine import Engine + + engine = Engine(data_root="data/wgisd", model="atss_mobilenetv2") + engine.train() + +2. Alternatively, we can use the configuration file. + +.. code-block:: python + + from otx.engine import Engine + + config = "src/otx/recipe/detection/atss_mobilenetv2.yaml" + + engine = Engine.from_config(config_path=config, data_root="data/wgisd") + engine.train() + +.. note:: + + This can use callbacks provided by OTX and several training techniques. + However, in this case, no arguments are specified for train. + +3. If you want to specify the model, you can do so as shown below: + +The model used by the Engine is of type ``otx.core.model.entity.base.OTXModel``. + +.. tab-set:: + + .. tab-item:: Custom Model + + .. code-block:: python + + from otx.algo.detection.atss import ATSS + from otx.engine import Engine + + model = ATSS(num_classes=5, variant="mobilenetv2") + + engine = Engine(data_root="data/wgisd", model=model) + engine.train() + + .. tab-item:: Custom Model with checkpoint + + .. code-block:: python + + from otx.algo.detection.atss import ATSS + from otx.engine import Engine + + model = ATSS(num_classes=5, variant="mobilenetv2") + + engine = Engine(data_root="data/wgisd", model=model, checkpoint="") + engine.train() + + .. tab-item:: Custom Optimizer & Scheduler + + .. code-block:: python + + from torch.optim import SGD + from torch.optim.lr_scheduler import CosineAnnealingLR + from otx.algo.detection.atss import ATSS + from otx.engine import Engine + + model = ATSS(num_classes=5, variant="mobilenetv2") + optimizer = SGD(model.parameters(), lr=0.01, weight_decay=1e-4, momentum=0.9) + scheduler = CosineAnnealingLR(optimizer, T_max=10000, eta_min=0) + + engine = Engine( + ..., + model=model, + optimizer=optimizer, + scheduler=scheduler, + ) + engine.train() + +4. If you want to specify the datamodule, you can do so as shown below: + +The datamodule used by the Engine is of type ``otx.core.data.module.OTXDataModule``. + +.. code-block:: python + + from otx.core.data.module import OTXDataModule + from otx.engine import Engine + + datamodule = OTXDataModule(data_root="data/wgisd") + + engine = Engine(datamodule=datamodule) + engine.train() + +.. note:: + + If both ``data_root`` and ``datamodule`` enter ``Engine`` as input, ``Engine`` uses datamodule as the base. + + +5. You can use train-specific arguments with ``train()`` function. + +.. tab-set:: + + .. tab-item:: Change Max Epochs + + .. code-block:: python + + engine.train(max_epochs=10) + + .. tab-item:: Fix Training Seed & Set Deterministic + + .. code-block:: python + + engine.train(seed=1234, deterministic=True) + + .. tab-item:: Use Mixed Precision + + .. 
code-block:: python
+
+            engine.train(precision="16")
+
+        .. note::
+
+            This uses Lightning's precision values. You can use the values below:
+
+            - "64", "32", "16", "bf16"
+            - 64, 32, 16
+
+    .. tab-item:: Change Validation Metric
+
+        .. code-block:: python
+
+            from otx.core.metrics.fmeasure import FMeasure
+
+            metric = FMeasure(num_classes=5)
+            engine.train(metric=metric)
+
+    .. tab-item:: Set Callbacks & Logger
+
+        .. code-block:: python
+
+            from lightning.pytorch.callbacks import EarlyStopping
+            from lightning.pytorch.loggers.tensorboard import TensorBoardLogger
+
+            engine.train(callbacks=[EarlyStopping()], loggers=[TensorBoardLogger()])
+
+In addition, the ``train()`` function accepts the Trainer's constructor arguments to control the Trainer's functionality.
+Refer to `lightning.Trainer `_.
+
+For example, if you want to use the ``limit_val_batches`` feature provided by the Trainer, you can use it like this:
+
+.. code-block:: python
+
+    # disable validation
+    engine.train(limit_val_batches=0)
+
+6. It is also easy to use HPO.
+
+.. code-block:: python
+
+    engine.train(run_hpo=True)
+
+
+***********
+Evaluation
+***********
+
+If the training is already in place, we just need to use the code below:
+
+.. tab-set::
+
+    .. tab-item:: Evaluate Model
+
+        .. code-block:: python
+
+            engine.test()
+
+    .. tab-item:: Evaluate Model with different checkpoint
+
+        .. code-block:: python
+
+            engine.test(checkpoint="")
+
+        .. note::
+
+            The checkpoint can be either a torch checkpoint (``.ckpt``) or an exported model (``.onnx``, ``.xml``).
+
+    .. tab-item:: Evaluate Model with different datamodule or dataloader
+
+        .. code-block:: python
+
+            from otx.core.data.module import OTXDataModule
+
+            datamodule = OTXDataModule(data_root="data/wgisd")
+            engine.test(datamodule=datamodule)
+
+    .. tab-item:: Evaluate Model with different metrics
+
+        .. code-block:: python
+
+            from otx.core.metrics.fmeasure import FMeasure
+
+            metric = FMeasure(num_classes=5)
+            engine.test(metric=metric)
+
+
+***********
+Exporting
+***********
+
+To export our model to the OpenVINO™ IR format, we need to create the output model and run the export task.
+If the engine has been trained, you can proceed with the export using the current engine's checkpoint.
+
+The default value for ``export_format`` is ``OPENVINO``.
+The default value for ``export_precision`` is ``FP32``.
+
+.. tab-set::
+
+    .. tab-item:: Export OpenVINO™ IR
+
+        .. code-block:: python
+
+            engine.export()
+
+    .. tab-item:: Export ONNX
+
+        .. code-block:: python
+
+            engine.export(export_format="ONNX")
+
+    .. tab-item:: Export with explain features
+
+        .. code-block:: python
+
+            engine.export(explain=True)
+
+        .. note::
+
+            This ensures that the model is exported with the ``saliency_map`` and ``feature_vector`` outputs that will be used by XAI.
+
+    .. tab-item:: Export with different checkpoint
+
+        .. code-block:: python
+
+            engine.export(checkpoint="")
+
+    .. tab-item:: Export with FP16
+
+        .. code-block:: python
+
+            engine.export(precision="FP16")
+
+
+****
+XAI
+****
+
+To run XAI with the OpenVINO™ IR model, we need to create an output model and run the XAI procedure:
+
+.. tab-set::
+
+    .. tab-item:: Run XAI
+
+        .. code-block:: python
+
+            engine.explain(checkpoint="")
+
+    .. tab-item:: Explain Model with different datamodule or dataloader
+
+        .. code-block:: python
+
+            from otx.core.data.module import OTXDataModule
+
+            datamodule = OTXDataModule(data_root="data/wgisd")
+            engine.explain(..., datamodule=datamodule)
+
+    .. tab-item:: Dump saliency_map
+
+        ..
code-block:: python + + engine.explain(..., dump=True) + + +************ +Optimization +************ + +To run the optimization with PTQ on the OpenVINO™ IR model, we need to create an output model and run the optimization procedure: + +.. tab-set:: + + .. tab-item:: Run PTQ Optimization + + .. code-block:: python + + engine.optimize(checkpoint="") + + .. tab-item:: Evaluate Model with different datamodule or dataloader + + .. code-block:: python + + from otx.core.data.module import OTXDataModule + + datamodule = OTXDataModule(data_root="data/wgisd") + engine.optimize(..., datamodule=datamodule) + + +You can validate the optimized model as the usual model. For example for the NNCF model it will look like this: + +.. code-block:: python + + engine.test(checkpoint="") + +That's it. Now, we can use OpenVINO™ Training Extensions APIs to create, train, and deploy deep learning models using the OpenVINO™ Training Extension. diff --git a/docs/source/guide/get_started/cli_commands.rst b/docs/source/guide/get_started/cli_commands.rst new file mode 100644 index 00000000000..7779e49b6f3 --- /dev/null +++ b/docs/source/guide/get_started/cli_commands.rst @@ -0,0 +1,500 @@ +OpenVINO™ Training Extensions CLI Usage +========================================== + +All possible OpenVINO™ Training Extensions CLI commands are presented below along with some general examples of how to run specific functionality. There are :doc:`dedicated tutorials <../tutorials/base/how_to_train/index>` in our documentation with life-practical examples on specific datasets for each task. + +.. note:: + + To run CLI commands you need to prepare a dataset. Each task requires specific data formats. To know more about which formats are supported by each task, refer to :doc:`explanation section <../explanation/algorithms/index>` in the documentation. + Also, by default, the OTX CLI is written using jsonargparse, see jsonargparse or LightningCLI. + `Jsonargparse Documentation _` + +***** +Help +***** + +``otx --help`` show available sub-commands. + +.. code-block:: shell + + (otx) ...$ otx --help + ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────╮ + │ Usage: otx [-h] [-v] {install,find,train,test,predict,export,optimize,explain} ... │ + │ │ + │ │ + │ OpenVINO Training-Extension command line tool │ + │ │ + │ │ + │ Options: │ + │ -h, --help Show this help message and exit. │ + │ -v, --version Display OTX version number. │ + │ │ + │ Subcommands: │ + │ For more details of each subcommand, add it as an argument followed by --help. │ + │ │ + │ │ + │ Available subcommands: │ + │ install Install OTX requirements. │ + │ find This shows the model provided by OTX. │ + │ train Trains the model using the provided LightningModule and OTXDataModule. │ + │ test Run the testing phase of the engine. │ + │ predict Run predictions using the specified model and data. │ + │ export Export the trained model to OpenVINO Intermediate Representation (IR) or ONNX formats. │ + │ optimize Applies NNCF.PTQ to the underlying models (now works only for OV models). │ + │ explain Run XAI using the specified model and data (test subset). │ + │ │ + ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + +.. note:: + + After installing the package, if torch is not installed properly, this will only show the ``install`` subcommand. You can refer to this :doc:`installation section `. + + +The subcommand can get help output in the following way. 
+For basic subcommand help, the Verbosity Level is 0. In this case, the CLI provides a Quick-Guide in markdown. + +.. code-block:: shell + + (otx) ...$ otx train --help + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ OpenVINO™ Training Extensions CLI Guide ┃ + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ + + Github Repository: + https://github.com/openvinotoolkit/training_extensions. + + A better guide is provided by the documentation. + ╭─ Quick-Start ─────────────────────────────────────────────────────────╮ + │ │ + │ 1 you can train with data_root only. then OTX will provide default │ + │ model. │ + │ │ + │ │ + │ otx train --data_root │ + │ │ + │ │ + │ 2 you can pick a model or datamodule as Config file or Class. │ + │ │ + │ │ + │ otx train │ + │ --data_root │ + │ --model --data │ + │ │ + │ │ + │ 3 Of course, you can override the various values with commands. │ + │ │ + │ │ + │ otx train │ + │ --data_root │ + │ --max_epochs --checkpoint │ + │ │ + │ │ + │ 4 If you have a complete configuration file, run it like this. │ + │ │ + │ │ + │ otx train --data_root --config │ + │ │ + │ │ + │ To get more overridable argument information, run the command below. │ + │ │ + │ │ + │ # Verbosity Level 1 │ + │ otx train [optional_arguments] -h -v │ + │ # Verbosity Level 2 │ + │ otx train [optional_arguments] -h -vv │ + │ │ + ╰───────────────────────────────────────────────────────────────────────╯ + +For Verbosity Level 1, it shows Quick-Guide & the essential arguments. + +.. code-block:: shell + + (otx) ...$ otx train --help -v + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ OpenVINO™ Training Extensions CLI Guide ┃ + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ + + Github Repository: + https://github.com/openvinotoolkit/training_extensions. + + A better guide is provided by the documentation. + ╭─ Quick-Start ─────────────────────────────────────────────────────────╮ + │ ... │ + ╰───────────────────────────────────────────────────────────────────────╯ + ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────╮ + │ Usage: otx [options] train [-h] [-c CONFIG] [--print_config [=flags]] │ + │ [--data_root DATA_ROOT] [--task TASK] │ + │ [--engine CONFIG] │ + │ [--work_dir WORK_DIR] │ + │ [--engine.checkpoint CHECKPOINT] │ + │ [--engine.device {auto,gpu,cpu,tpu,ipu,hpu,mps}] │ + │ [--model.help CLASS_PATH_OR_NAME] │ + │ [--model CONFIG | CLASS_PATH_OR_NAME | .INIT_ARG_NAME VALUE] │ + │ [--data CONFIG] │ + │ [--optimizer CONFIG | CLASS_PATH_OR_NAME | .INIT_ARG_NAME VALUE] │ + │ [--scheduler CONFIG | CLASS_PATH_OR_NAME | .INIT_ARG_NAME VALUE] │ + │ │ + ... + +For Verbosity Level 2, it shows all available arguments. + +.. code-block:: shell + + (otx) ...$ otx train --help -vv + + +************ +print_config +************ + +Preview all configuration values that will be executed through that command line. + +.. code-block:: shell + + (otx) ...$ otx train --config --print_config + + +.. code-block:: yaml + + data_root: tests/assets/car_tree_bug + callback_monitor: val/map_50 + engine: + task: DETECTION + work_dir: ./otx-workspace + device: auto + model: + class_path: otx.algo.detection.atss.ATSS + init_args: + num_classes: 1000 + variant: mobilenetv2 + optimizer: ... + scheduler: ... + data: + task: DETECTION + config: + data_format: coco_instances + train_subset: ... + val_subset: ... + test_subset: ... 
+ mem_cache_size: 1GB + mem_cache_img_max_size: null + image_color_channel: RGB + include_polygons: false + max_epochs: 2 + deterministic: false + precision: 16 + callbacks: ... + logger: ... + +Users can also pre-generate a config file with an example like the one below. + +.. code-block:: shell + + (otx) ...$ otx train --config --print_config > config.yaml + + +***** +Find +***** + +``otx find`` lists model templates and backbones available for the given task. Specify the task name with ``--task`` option. Use ``--pattern`` to find the model name from OTX. + +.. code-block:: shell + + (otx) ...$ otx find --help + ╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ + │ Usage: otx [options] find [-h] │ + │ [--task {ACTION_CLASSIFICATION,ACTION_DETECTION,ANOMALY_CLASSIFICATION,ANOMALY_DETECTION,ANOMALY_SEGMENTATION,MULTI_CLASS_CLS,MULTI_LABEL_CLS,H_LABEL_CLS,DETEC │ + │ [--pattern PATTERN] │ + │ │ + │ │ + │ Options: │ + │ -h, --help Show this help message and exit. │ + │ --task {ACTION_CLASSIFICATION,ACTION_DETECTION,ANOMALY_CLASSIFICATION,ANOMALY_DETECTION,ANOMALY_SEGMENTATION,MULTI_CLASS_CLS,MULTI_LABEL_CLS,H_LABEL_CLS,DETECTION,ROTATED_DETECTION,DE │ + │ Value for filtering by task. Default is None, which shows all recipes. (type: None, default: None) │ + │ --pattern PATTERN This allows you to filter the model name of the recipe. For example, if you want to find all models that contain the word 'efficient', you can use '--pattern │ + │ efficient' (type: None, default: None) │ + │ │ + ╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + + +Example to find ready-to-use templates for the detection task: + +.. code-block:: shell + + (otx) ...$ otx find --task DETECTION + ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Task ┃ Model Name ┃ Recipe Path ┃ + ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ DETECTION │ yolox_tiny │ recipe/detection/yolox_tiny.yaml │ + │ DETECTION │ atss_mobilenetv2_tile │ recipe/detection/atss_mobilenetv2_tile.yaml │ + │ DETECTION │ openvino_model │ recipe/detection/openvino_model.yaml │ + │ DETECTION │ atss_mobilenetv2 │ recipe/detection/atss_mobilenetv2.yaml │ + │ DETECTION │ atss_resnext101 │ recipe/detection/atss_resnext101.yaml │ + │ DETECTION │ yolox_l_tile │ recipe/detection/yolox_l_tile.yaml │ + │ DETECTION │ ssd_mobilenetv2_tile │ recipe/detection/ssd_mobilenetv2_tile.yaml │ + │ DETECTION │ atss_r50_fpn │ recipe/detection/atss_r50_fpn.yaml │ + │ DETECTION │ yolox_tiny_tile │ recipe/detection/yolox_tiny_tile.yaml │ + │ DETECTION │ yolox_s │ recipe/detection/yolox_s.yaml │ + │ DETECTION │ yolox_s_tile │ recipe/detection/yolox_s_tile.yaml │ + │ DETECTION │ rtmdet_tiny │ recipe/detection/rtmdet_tiny.yaml │ + │ DETECTION │ yolox_x │ recipe/detection/yolox_x.yaml │ + │ DETECTION │ yolox_x_tile │ recipe/detection/yolox_x_tile.yaml │ + │ DETECTION │ ssd_mobilenetv2 │ recipe/detection/ssd_mobilenetv2.yaml │ + │ DETECTION │ yolox_l │ recipe/detection/yolox_l.yaml │ + └───────────┴───────────────────────┴─────────────────────────────────────────────┘ + +Example to find yolo named model for the detection task: + +.. 
code-block:: shell + + (otx) ...$ otx find --task DETECTION --pattern 'yolo*' + ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Task ┃ Model Name ┃ Recipe Path ┃ + ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ DETECTION │ yolox_tiny │ recipe/detection/yolox_tiny.yaml │ + │ DETECTION │ yolox_x │ recipe/detection/yolox_x.yaml │ + │ DETECTION │ yolox_l_tile │ recipe/detection/yolox_l_tile.yaml │ + │ DETECTION │ yolox_s │ recipe/detection/yolox_s.yaml │ + │ DETECTION │ yolox_l │ recipe/detection/yolox_l.yaml │ + │ DETECTION │ yolox_x_tile │ recipe/detection/yolox_x_tile.yaml │ + │ DETECTION │ yolox_s_tile │ recipe/detection/yolox_s_tile.yaml │ + │ DETECTION │ yolox_tiny_tile │ recipe/detection/yolox_tiny_tile.yaml │ + └───────────┴─────────────────┴───────────────────────────────────────┘ + + + +---------------- +Dataset handling +---------------- + +If the train dataset root and validation dataset root are the same - pass the same path to both options. For example, you have a standard COCO format for object detection: + +.. code-block:: + + coco_data_root + |---- annotations + |---- instances_train.json + |---- instances_val.json + |---- images + |---- train + |---- 000.jpg + .... + |---- val + |---- 000.jpg + .... + + +Then pass the path to ``coco_data_root`` to both root options: + +.. code-block:: + + --data_root coco_data_root + + +********* +Training +********* + +``otx train`` trains a model (a particular model template) on a dataset: + +The results will be saved in ``./otx-workspace/`` folder by default. The output folder can be modified by ``--work_dir`` option. These files are used by other commands: ``export``, ``test``, ``demo``, etc. + +``otx train`` receives ``--config`` as a argument. ``config`` can be a path to the specific ``*.yaml`` file. Also, the path to data root should be passed to the CLI to start training. + + + +Example of the command line to start training using Auto-Configuration: + +.. code-block:: shell + + (otx) ...$ otx train --data_root --task + +You can use the recipe configuration provided by OTX. The corresponding configuration file can be found via ``otx find``. + +.. code-block:: shell + + (otx) ...$ otx train --config --data_root + +.. note:: + You also can visualize the training using ``Tensorboard`` as these logs are located in ``/tensorboard``. + +.. note:: + ``--data.config.mem_cache_size`` provides in-memory caching for decoded images in main memory. + If the batch size is large, such as for classification tasks, or if your dataset contains high-resolution images, + image decoding can account for a non-negligible overhead in data pre-processing. + This option can be useful for maximizing GPU utilization and reducing model training time in those cases. + If your machine has enough main memory, we recommend increasing this value as much as possible. + For example, you can cache approximately 10,000 of ``500x375~500x439`` sized images with ``--data.config.mem_cache_size 8GB``. + +It is also possible to start training by omitting the template and just passing the paths to dataset roots, then the :doc:`auto-configuration <../explanation/additional_features/auto_configuration>` will be enabled. Based on the dataset, OpenVINO™ Training Extensions will choose the task type and template with the best accuracy/speed trade-off. + +You can override the configurable arguments. +For example, that is how you can change the max epochs and the batch size for the training: + +.. 
code-block:: shell
+
+    (otx) ...$ otx train ... --data.config.train_subset.batch_size --max_epochs
+
+.. note::
+
+    ``train`` and ``test`` work based on ``lightning.Trainer``. You can change the Trainer behavior through the arguments of ``train`` and ``test``. You can find more arguments in the Lightning ``Trainer`` documentation.
+
+**********
+Exporting
+**********
+
+``otx export`` exports a trained model to the OpenVINO™ IR format to efficiently run it on Intel hardware.
+
+The command below performs exporting to the ``{work_dir}/`` path.
+
+.. code-block:: shell
+
+    (otx) ...$ otx export ... --checkpoint
+
+The command produces ``exported_model.xml`` and ``exported_model.bin``.
+
+To use the exported model as an input for ``otx explain``, please dump additional outputs with internal information, using ``--explain``:
+
+.. code-block:: shell
+
+    (otx) ...$ otx export ... --checkpoint --explain True
+
+
+.. note::
+    If ``.latest`` exists in the work_dir, you can omit the checkpoint and config.
+    You can also omit ``--work_dir`` if you run from the root of the workspace that contains ``.latest``.
+
+    .. code-block:: shell
+
+        (otx) ...$ otx export --work_dir
+
+        # OR if you are in the workspace root
+        (otx) ...$ otx export
+
+
+************
+Optimization
+************
+
+``otx optimize`` optimizes a model using `PTQ `_ and transforms it to the ``INT8`` format.
+
+- PTQ optimization is used for models exported in the OpenVINO™ IR format
+
+Command example for optimizing an OpenVINO™ model (``.xml``) with OpenVINO™ PTQ:
+
+.. code-block:: shell
+
+    (otx) ...$ otx optimize ... --checkpoint \
+        --data_root \
+
+
+Thus, to use PTQ, pass the path to the exported IR (``.xml``) model.
+
+.. note::
+    If ``.latest`` exists in the work_dir, you can omit the checkpoint and config.
+    You can also omit ``--work_dir`` if you run from the root of the workspace that contains ``.latest``.
+
+    .. code-block:: shell
+
+        (otx) ...$ otx optimize --work_dir
+
+        # OR if you are in the workspace root
+        (otx) ...$ otx optimize
+
+***********
+Evaluation
+***********
+
+``otx test`` runs the evaluation of a model on a specific dataset.
+
+The command below will evaluate the trained model on the provided dataset:
+
+.. code-block:: shell
+
+    (otx) ...$ otx test ... --data_root \
+        --checkpoint
+
+.. note::
+
+    It is possible to pass either PyTorch weights (``.ckpt``) or an OpenVINO™ IR model (``.xml``) to the ``--checkpoint`` option.
+
+
+.. note::
+    If ``.latest`` exists in the work_dir, you can omit the checkpoint and config.
+    You can also omit ``--work_dir`` if you run from the root of the workspace that contains ``.latest``.
+
+    .. code-block:: shell
+
+        (otx) ...$ otx test --work_dir
+
+        # OR if you are in the workspace root
+        (otx) ...$ otx test
+
+***********
+Explanation
+***********
+
+``otx explain`` runs the explainable AI (XAI) algorithm on a specific model-dataset pair. It helps explain the model's decision-making process in a way that is easily understood by humans.
+
+
+The command below will generate saliency maps (heatmaps with red-colored areas of focus) of the trained model on the provided dataset and save the resulting images to the output path:
+
+.. code-block:: shell
+
+    (otx) ...$ otx explain --config \
+        --checkpoint
+
+.. note::
+
+    It is possible to pass either PyTorch weights (``.ckpt``) or an OpenVINO™ IR model (``.xml``) to the ``--checkpoint`` option.
+
+By default, the model is exported to the OpenVINO™ IR format without the extra feature information needed for the ``explain`` function.
To use OpenVINO™ IR model in ``otx explain``, please first export it with ``--explain`` parameter: + +.. code-block:: shell + + (otx) ...$ otx export ... --checkpoint \ + --explain True + (otx) ...$ otx explain ... --checkpoint outputs/openvino/with_features \ + + +*********** +Workspace +*********** + +If we run a typical Training example, will have a folder like the one below as output. + +.. code-block:: bash + + otx-workspace/ + .latest/ # Gather the most recent information. + train/ # Link to the output_dir where the most recent train was performed. + export/ # Link to the output_dir where the most recent export was performed. + .../ + 20240000_000000/ # Deliverables from OTX CLI + 20240000_000001/ # Deliverables from OTX CLI Second-Trial + + +OTX considers the folder with ``.latest`` to be the root of the entire Workspace. +``.latest`` soft-links to the most recently trained output folder. + +Case 1: If a user specifies an output ``work_dir`` (An already existing workspace) + +.. code-block:: shell + + (otx) ...$ otx train --work_dir otx-workspace + + +This will then use the ``.latest`` in the otx-workspace for training. + +Case 2: if a user executes a command from within the otx-workspace + +.. code-block:: shell + + cd otx-workspace + + (otx) ...$ otx train # Behave in the same way as the first training + (otx) ...$ otx test # Perform a test with the config and checkpoint from the last training baseline. + (otx) ...$ otx export # Perform a export with the config and checkpoint from the last training baseline. diff --git a/docs/source/guide/get_started/installation.rst b/docs/source/guide/get_started/installation.rst new file mode 100644 index 00000000000..fd4ce0d08f6 --- /dev/null +++ b/docs/source/guide/get_started/installation.rst @@ -0,0 +1,157 @@ +Installation +============ + +************** +Prerequisites +************** + +The current version of OpenVINO™ Training Extensions was tested in the following environment: + +- Ubuntu 20.04 +- Python >= 3.10 + + +*********************************************** +Install OpenVINO™ Training Extensions for users +*********************************************** + +1. Clone the training_extensions +repository with the following command: + +.. code-block:: shell + + git clone https://github.com/openvinotoolkit/training_extensions.git + cd training_extensions + git checkout develop + +2. Set up a +virtual environment. + +.. code-block:: shell + + # Create virtual env. + python -m venv .otx + + # Activate virtual env. + source .otx/bin/activate + +3. Install OpenVINO™ Training Extensions package from either: + +* A local source in development mode + +.. code-block:: shell + + pip install -e . + +* PyPI + +.. code-block:: shell + + pip install otx + +4. Install PyTorch & Requirements for training according to your system environment. + +.. code-block:: shell + + otx install -v + +[Optional] Refer to the `official installation guide `_ + +.. note:: + + Currently, only torch==2.1.1 was fully validated. (older versions are not supported due to security issues). + + +5. Once the package is installed in the virtual environment, you can use full +OpenVINO™ Training Extensions command line functionality. + +**************************************************** +Install OpenVINO™ Training Extensions for developers +**************************************************** + +Install ``tox`` and create a development environment: + +.. 
code-block:: shell + + pip install tox + # -- need to replace '310' below if another python version needed + tox devenv venv/otx -e unit-test-py310 + source venv/otx/bin/activate + +Then you may change code, and all fixes will be directly applied to the editable package. + +***************************************************** +Install OpenVINO™ Training Extensions by using Docker +***************************************************** + +1. By executing the following commands, it will build two Docker images: ``otx:${OTX_VERSION}-cuda`` and ``otx:${OTX_VERSION}-cuda-pretrained-ready``. + +.. code-block:: shell + + git clone https://github.com/openvinotoolkit/training_extensions.git + cd docker + ./build.sh + +2. After that, you can check whether the images are built correctly such as + +.. code-block:: shell + + docker image ls | grep otx + +Example: + +.. code-block:: shell + + otx 2.0.0-cuda-pretrained-ready 4f3b5f98f97c 3 minutes ago 14.5GB + otx 2.0.0-cuda 8d14caccb29a 8 minutes ago 10.4GB + + +``otx:${OTX_VERSION}-cuda`` is a minimal Docker image where OTX is installed with CUDA supports. On the other hand, ``otx:${OTX_VERSION}-cuda-pretrained-ready`` includes all the model pre-trained weights that OTX provides in addition to ``otx:${OTX_VERSION}-cuda``. + +********* +Run tests +********* + +To run some tests, need to have development environment on your host. The development requirements file (requirements/dev.txt) +would be used to setup them. + +.. code-block:: shell + + $ otx install --option dev + $ pytest tests/ + +Another option to run the tests is using the testing automation tool `tox `_. Following commands will install +the tool ``tox`` to your host and run all test codes inside of ``tests/`` folder. + +.. code-block:: + + $ pip install tox + $ tox -e tests-all-py310 -- tests/ + +.. note:: + + When running the ``tox`` command above first time, it will create virtual env by installing all dependencies of this project into + the newly created environment for your testing before running the actual testing. So, it is expected to wait more than 10 minutes + before to see the actual testing results. + +*************** +Troubleshooting +*************** + +1. If you have problems when you try to use ``pip install`` command, +please update pip version by following command: + +.. code-block:: + + python -m pip install --upgrade pip + +2. If you're facing a problem with ``torch`` or ``mmcv`` installation, please check that your CUDA version is compatible with torch version. +Consider updating CUDA and CUDA drivers if needed. +Check the `command example `_ to install CUDA 11.8 with drivers on Ubuntu 20.04. + +3. If you have access to the Internet through the proxy server only, +please use pip with proxy call as demonstrated by command below: + +.. code-block:: shell + + python -m pip install --proxy http://:@: diff --git a/docs/source/guide/get_started/introduction.rst b/docs/source/guide/get_started/introduction.rst new file mode 100644 index 00000000000..073e9f83f2d --- /dev/null +++ b/docs/source/guide/get_started/introduction.rst @@ -0,0 +1,139 @@ +.. raw:: html + +
+ Logo +
+ +Introduction +============ + +**OpenVINO™ Training Extensions** is a low-code transfer learning framework for Computer Vision. + +The CLI commands of the framework allows users to train, infer, optimize and deploy models easily and quickly even with low expertise in the deep learning field. OpenVINO™ Training Extensions offers diverse combinations of model architectures, learning methods, and task types based on `PyTorch `_ and `OpenVINO™ toolkit `_. + +OpenVINO™ Training Extensions provides a **“model template”** for every supported task type, which consolidates necessary information to build a model. Model templates are validated on various datasets and serve one-stop shop for obtaining the best models in general. If you are an experienced user, you can configure your own model based on `torchvision `_, `pytorchcv `_, `mmcv `_ and `OpenVINO Model Zoo (OMZ) `_ frameworks. + +Furthermore, OpenVINO™ Training Extensions provides :doc:`automatic configuration <../explanation/additional_features/auto_configuration>` of task types and hyperparameters. The framework will identify the most suitable model template based on your dataset, and choose the best hyperparameter configuration. The development team is continuously extending functionalities to make training as simple as possible so that single CLI command can obtain accurate, efficient and robust models ready to be integrated into your project. + +************ +Key Features +************ + +OpenVINO™ Training Extensions supports the following computer vision tasks: + +- **Classification**, including multi-class, multi-label and hierarchical image classification tasks. +- **Object detection** including rotated bounding box support +- **Semantic segmentation** +- **Instance segmentation** including tiling algorithm support +- **Action recognition** including action classification and detection +- **Anomaly recognition** tasks including anomaly classification, detection and segmentation + +OpenVINO™ Training Extensions supports the :doc:`following learning methods <../explanation/algorithms/index>`: + +- **Supervised**, incremental training, which includes class incremental scenario and contrastive learning for classification and semantic segmentation tasks + +OpenVINO™ Training Extensions will provide the :doc:`following features <../explanation/additional_features/index>` in coming releases: + +- **Distributed training** to accelerate the training process when you have multiple GPUs +- **Half-precision training** to save GPUs memory and use larger batch sizes +- Integrated, efficient :doc:`hyper-parameter optimization module <../explanation/additional_features/hpo>` (**HPO**). Through dataset proxy and built-in hyper-parameter optimizer, you can get much faster hyper-parameter optimization compared to other off-the-shelf tools. The hyperparameter optimization is dynamically scheduled based on your resource budget. +- OpenVINO™ Training Extensions uses `Datumaro `_ as the backend to handle datasets. On account of that, OpenVINO™ Training Extensions supports the most common academic field dataset formats for each task. In the future there will be more supported formats available to give more freedom of datasets format choice. +- Improved :doc:`auto-configuration functionality <../explanation/additional_features/auto_configuration>`. OpenVINO™ Training Extensions analyzes provided dataset and selects the proper task and model template to provide the best accuracy/speed trade-off. 
It will also make a random auto-split of your dataset if there is no validation set provided. + +********************* +Documentation content +********************* + +1. :octicon:`light-bulb` **Quick start guide**: + + .. grid:: + :gutter: 1 + + .. grid-item-card:: Installation Guide + :link: installation + :link-type: doc + :text-align: center + + .. grid-item-card:: API Quick-Start + :link: api_tutorial + :link-type: doc + :text-align: center + + .. grid-item-card:: CLI Commands + :link: cli_commands + :link-type: doc + :text-align: center + +2. :octicon:`book` **Tutorials**: + + .. grid:: 1 2 2 3 + :margin: 1 1 0 0 + :gutter: 1 + + .. grid-item-card:: Classification + :link: ../tutorials/base/how_to_train/classification + :link-type: doc + :text-align: center + + .. grid-item-card:: Detection + :link: ../tutorials/base/how_to_train/detection + :link-type: doc + :text-align: center + + .. grid-item-card:: Instance Segmentation + :link: ../tutorials/base/how_to_train/instance_segmentation + :link-type: doc + :text-align: center + + .. grid-item-card:: Semantic Segmentation + :link: ../tutorials/base/how_to_train/semantic_segmentation + :link-type: doc + :text-align: center + + .. grid-item-card:: Anomaly Task + :link: ../tutorials/base/how_to_train/anomaly_detection + :link-type: doc + :text-align: center + + .. grid-item-card:: Action Classification + :link: ../tutorials/base/how_to_train/action_classification + :link-type: doc + :text-align: center + + .. grid-item-card:: Action Detection + :link: ../tutorials/base/how_to_train/action_detection + :link-type: doc + :text-align: center + + .. grid-item-card:: Visual Prompting + :text-align: center + + .. grid-item-card:: Advanced + :link: ../tutorials/advanced/index + :link-type: doc + :text-align: center + +3. **Explanation section**: + + This section consists of an algorithms explanation and describes additional features that are supported by OpenVINO™ Training Extensions. + :ref:`Algorithms ` section includes a description of all supported algorithms: + + 1. Explanation of the task and main supervised training pipeline. + 2. Description of the supported datasets formats for each task. + 3. Available templates and models. + 4. Incremental learning approach. + 5. Semi-supervised and Self-supervised algorithms. + + :ref:`Additional Features ` section consists of: + + 1. Overview of model optimization algorithms. + 2. Hyperparameters optimization functionality (HPO). + 3. Auto-configuration algorithm to select the most appropriate training pipeline for a given dataset. + +4. **Reference**: + + This section gives an overview of the OpenVINO™ Training Extensions code base. There source code for Entities, classes and functions can be found. + +5. **Release Notes**: + + There can be found a description of new and previous releases. diff --git a/docs/source/guide/index.rst b/docs/source/guide/index.rst new file mode 100644 index 00000000000..73ca70741f2 --- /dev/null +++ b/docs/source/guide/index.rst @@ -0,0 +1,42 @@ +Guide +===== + + +.. toctree:: + :hidden: + :maxdepth: 3 + :caption: Get Started + + get_started/introduction + get_started/installation + get_started/cli_commands + get_started/api_tutorial + + +.. toctree:: + :hidden: + :caption: Tutorials + + tutorials/base/index.rst + tutorials/advanced/index.rst + + +.. toctree:: + :hidden: + :caption: Explanation + + explanation/algorithms/index + explanation/additional_features/index + + +.. toctree:: + :hidden: + :caption: Reference + + reference/index + + +.. 
toctree:: + :caption: Release Notes + + release_notes/index diff --git a/docs/source/guide/reference/index.rst b/docs/source/guide/reference/index.rst new file mode 100644 index 00000000000..6abaf03ba3d --- /dev/null +++ b/docs/source/guide/reference/index.rst @@ -0,0 +1,11 @@ +API reference +============= + +.. _api_reference: + +.. autosummary:: + :recursive: + :nosignatures: + :toctree: _autosummary + + otx diff --git a/docs/source/guide/release_notes/index.rst b/docs/source/guide/release_notes/index.rst new file mode 100644 index 00000000000..133b7350c9e --- /dev/null +++ b/docs/source/guide/release_notes/index.rst @@ -0,0 +1,190 @@ +Releases +######## + +.. toctree:: + :maxdepth: 1 + +v1.5.0 (4Q23) +------------- + +- Enable configurable confidence threshold for otx eval and export +- Add YOLOX variants as new object detector models +- Enable FeatureVectorHook to support action tasks +- Add ONNX metadata to detection, instance segmantation, and segmentation models +- Add a new feature to configure input size +- Introduce the OTXSampler and AdaptiveRepeatDataHook to achieve faster training at the small data regime +- Add a new object detector Lite-DINO +- Add Semi-SL Mean Teacher algorithm for Instance Segmentation task +- Official supports for YOLOX-X, YOLOX-L, YOLOX-S, ResNeXt101-ATSS +- Add new argument to track resource usage in train command +- Add Self-SL for semantic segmentation of SegNext families +- Adapt input size automatically based on dataset statistics +- Refine input data in-memory caching +- Adapt timeout value of initialization for distributed training +- Optimize data loading by merging load & resize operations w/ caching support for cls/det/iseg/sseg +- Support torch==2.0.1 +- Set "Auto" as default input size mode + + +v1.4.4 (4Q23) +------------- + +- Update ModelAPI configuration +- Add Anomaly modelAPI changes +- Update Image numpy access + +v1.4.3 (4Q23) +------------- + +- Re introduce adaptive scheduling for training + +v1.4.2 (4Q23) +------------- + +- Upgrade nncf version to 2.6.0 +- Bump datumaro version to 1.5.0 +- Set tox version constraint +- Add model category attributes to model template +- Minor bug fixes + +v1.4.1 (3Q23) +------------- + +- Update the README file in exportable code +- Minor bug fixes + +v1.4.0 (3Q23) +------------- + +- Support encrypted dataset training +- Add custom max iou assigner to prevent CPU OOM when large annotations are used +- Auto train type detection for Semi-SL, Self-SL and Incremental: "--train-type" now is optional +- Add per-class XAI saliency maps for Mask R-CNN model +- Add new object detector Deformable DETR +- Add new object detector DINO +- Add new visual prompting task +- Add new object detector ResNeXt101-ATSS +- Introduce channel_last parameter to improve the performance +- Decrease time for making a workspace +- Set persistent_workers and pin_memory as True in detection task +- New algorithm for Semi-SL semantic segmentation based on metric learning via class prototypes +- Self-SL for classification now can recieve just folder with any images to start contrastive pretraining +- Update OpenVINO version to 2023.0, and NNCF verion to 2.5 +- Improve XAI saliency map generation for tiling detection and tiling instance segmentation +- Remove CenterCrop from Classification test pipeline and editing missing docs link +- Switch to PTQ for sseg +- Minor bug fixes + +v1.3.1 (2Q23) +------------- +- Minor bug fixes + +v1.3.0 (2Q23) +------------- + +- Support direct annotation input for COCO format +- Action task 
supports multi GPU training +- Support storage cache in Apache Arrow using Datumaro for action tasks +- Add a simplified greedy labels postprocessing for hierarchical classification +- Support auto adapting batch size +- Support auto adapting num_workers +- Support noisy label detection for detection tasks +- Make semantic segmentation OpenVINO models compatible with ModelAPI +- Support label hierarchy through LabelTree in LabelSchema for classification task +- Enhance exportable code file structure, video inference and default value for demo +- Speedup OpenVINO inference in image classificaiton, semantic segmentation, object detection and instance segmentation tasks +- Refactoring of ONNX export functionality +- Minor bug fixes + +v1.2.4 (3Q23) +------------- +- Per-class saliency maps for M-RCNN +- Disable semantic segmentation soft prediction processing +- Update export and nncf hyperparameters +- Minor bug fixes + +v1.2.3 (2Q23) +------------- + +- Improve warning message for tiling configurable parameter +- Minor bug fixes + +v1.2.1 (2Q23) +------------- + +- Upgrade mmdeploy==0.14.0 from official PyPI +- Integrate new ignored loss in semantic segmentation +- Optimize YOLOX data pipeline +- Tiling Spatial Concatenation for OpenVINO IR +- Optimize counting train & inference speed and memory consumption +- Minor bug fixes + +v1.2.0 (2Q23) +------------- + +- Add generating feature cli_report.log in output for otx training +- Support multiple python versions up to 3.10 +- Support export of onnx models +- Add option to save images after inference in OTX CLI demo together with demo in exportable code +- Support storage cache in Apache Arrow using Datumaro for cls, det, seg tasks +- Add noisy label detection for multi-class classification task +- Clean up and refactor the output of the OTX CLI +- Enhance DetCon logic and SupCon for semantic segmentation +- Detection task refactoring +- Classification task refactoring +- Extend OTX explain CLI +- Segmentation task refactoring +- Action task refactoring +- Optimize data preprocessing time and enhance overall performance in semantic segmentation +- Support automatic batch size decrease when there is no enough GPU memory +- Minor bug fixes + +v1.1.2 (2Q23) +------------- + +- Minor bug fixes + + +v1.1.1 (1Q23) +------------- + +- Minor bug fixes + +v1.1.0 (1Q23) +------------- + +- Add FP16 IR export support +- Add in-memory caching in dataloader +- Add MoViNet template for action classification +- Add Semi-SL multilabel classification algorithm +- Integrate multi-gpu training for semi-supervised learning and self-supervised learning +- Add train-type parameter to otx train +- Add embedding of inference configuration to IR for classification +- Enable VOC dataset in OTX +- Add mmcls.VisionTransformer backbone support +- Parametrize saliency maps dumping in export +- Bring mmdeploy to action recognition model export & Test optimization of action tasks +- Update backbone lists +- Add explanation for XAI & minor doc fixes +- Refactor phase#1: MPA modules + + +v1.0.1 (1Q23) +------------- + +- Refine documents by proof review +- Separate installation for each tasks +- Improve POT efficiency by setting stat_requests_number parameter to 1 +- Minor bug fixes + + +v1.0.0 (1Q23) +------------- + +- Installation through PyPI + - Package will be renamed as OpenVINO™ Training Extensions +- CLI update + - Update ``otx find`` command to find configurations of tasks/algorithms + - Introduce ``otx build`` command to customize task or model configurations + - 
Automatic algorithm selection for the ``otx train`` command using the given input dataset
+- Adaptation of `Datumaro `_ component as a dataset interface
diff --git a/docs/source/guide/tutorials/advanced/configuration.rst b/docs/source/guide/tutorials/advanced/configuration.rst
new file mode 100644
index 00000000000..e4aad3a5fac
--- /dev/null
+++ b/docs/source/guide/tutorials/advanced/configuration.rst
@@ -0,0 +1,94 @@
+How to write OTX Configuration (recipe)
+==========================================
+
+***************
+Configuration
+***************
+
+Example of ``recipe/classification/multi_class_cls``
+
+.. code-block:: yaml
+
+    model:
+      class_path: otx.algo.classification.mobilenet_v3_large.MobileNetV3ForMulticlassCls
+      init_args:
+        num_classes: 1000
+        light: True
+
+    optimizer:
+      class_path: torch.optim.SGD
+      init_args:
+        lr: 0.0058
+        momentum: 0.9
+        weight_decay: 0.0001
+
+    scheduler:
+      class_path: otx.algo.schedulers.WarmupReduceLROnPlateau
+      init_args:
+        warmup_steps: 10
+        mode: max
+        factor: 0.5
+        patience: 1
+        monitor: val/accuracy
+
+    engine:
+      task: MULTI_CLASS_CLS
+      device: auto
+
+    callback_monitor: val/accuracy
+    data: ../../_base_/data/mmpretrain_base.yaml
+
+We can use a ``*.yaml`` recipe file with the above values configured:
+
+- ``engine``
+- ``model``, ``optimizer``, ``scheduler``
+- ``data``
+- ``callback_monitor``
+
+The basic configuration uses the same configuration format as jsonargparse.
+See the `Jsonargparse Documentation `_ for details.
+
+***********************
+Configuration overrides
+***********************
+
+Here we provide a feature called ``overrides``.
+
+.. code-block:: yaml
+
+    ...
+
+    overrides:
+      data:
+        config:
+          train_subset:
+            transforms:
+              - type: LoadImageFromFile
+              - backend: cv2
+                scale: 224
+                type: RandomResizedCrop
+              - direction: horizontal
+                prob: 0.5
+                type: RandomFlip
+              - type: PackInputs
+    ...
+
+This feature allows you to override the values you need from the default configuration.
+You can see the final configuration with the command below.
+
+.. code-block:: shell
+
+    $ otx train --config <config-path> --print_config
+
+****************************
+Callbacks & Logger overrides
+****************************
+
+``callbacks`` and ``logger`` can currently be provided as a list of different callbacks and loggers. The way to override this is as follows.
+
+For example, if you want to change the patience of EarlyStopping, you can configure the overrides like this:
+
+.. code-block:: yaml
+
+    overrides:
+      ...
+      callbacks:
+        - class_path: lightning.pytorch.callbacks.EarlyStopping
+          init_args:
+            patience: 3
diff --git a/docs/source/guide/tutorials/advanced/index.rst b/docs/source/guide/tutorials/advanced/index.rst
new file mode 100644
index 00000000000..dd781303b4a
--- /dev/null
+++ b/docs/source/guide/tutorials/advanced/index.rst
@@ -0,0 +1,10 @@
+Advanced Tutorials
+==================
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   configuration
+
+.. Once we have enough material, we might need to categorize these into `data`, `model learning` sections.
\ No newline at end of file
diff --git a/docs/source/guide/tutorials/base/explain.rst b/docs/source/guide/tutorials/base/explain.rst
new file mode 100644
index 00000000000..20bee0eb974
--- /dev/null
+++ b/docs/source/guide/tutorials/base/explain.rst
@@ -0,0 +1,66 @@
+How to explain the model behavior
+=================================
+
+This guide explains the behavior of a model trained through the :doc:`previous stage `.
+It allows displaying saliency maps, which highlight the image regions the model paid attention to when predicting a specific category.
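+
+If you are new to saliency maps, the snippet below is a minimal, generic sketch (plain OpenCV and NumPy, not the OTX API) of how a saliency map can be blended over the input image to produce an overlay. The file names are placeholders for illustration only.
+
+.. code-block:: python
+
+    import cv2
+    import numpy as np
+
+    image = cv2.imread("frame.jpg")          # original input image (BGR)
+    saliency = np.load("saliency.npy")       # assumed 2D float map in [0, 1]
+
+    # Colorize the map (red = high attention) and resize it to the image size
+    heatmap = cv2.applyColorMap((saliency * 255).astype(np.uint8), cv2.COLORMAP_JET)
+    heatmap = cv2.resize(heatmap, (image.shape[1], image.shape[0]))
+
+    # Weighted blend: 60% original image, 40% heatmap
+    overlay = cv2.addWeighted(image, 0.6, heatmap, 0.4, 0)
+    cv2.imwrite("overlay.jpg", overlay)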
+ +To be specific, this tutorial uses as an example of the ATSS model trained through ``otx train`` and saved as ``otx-workspace/.latest/train/checkpoints/epoch_*.pth``. + +.. note:: + + This tutorial uses an object detection model for example, however for other tasks the functionality remains the same - you just need to replace the input dataset with your own. + +For visualization we use images from WGISD dataset from the :doc:`object detection tutorial ` together with trained model. + +1. Activate the virtual environment +created in the previous step. + +.. code-block:: shell + + .otx/bin/activate + # or by this line, if you created an environment, using tox + . venv/otx/bin/activate + +2. ``otx explain`` returns saliency maps (heatmaps with red colored areas of focus) + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + Need to update! + + .. tab-item:: API + + .. code-block:: python + + Need to update! + + +3. To specify the algorithm of saliency map creation for classification, +we can define the ``--explain-algorithm`` parameter. + +- ``activationmap`` - for activation map classification algorithm +- ``eigencam`` - for Eigen-Cam classification algorithm +- ``classwisesaliencymap`` - for Recipro-CAM classification algorithm, this is a default method + +For detection task, we can choose between the following methods: + +- ``activationmap`` - for activation map detection algorithm +- ``classwisesaliencymap`` - for DetClassProbabilityMap algorithm (works for single-stage detectors only) + +.. note:: + + Learn more about Explainable AI and its algorithms in :doc:`XAI explanation section <../../explanation/additional_features/xai>` + + +4. As a result we will get a folder with a pair of generated +images for each image in ``--input``: + +- saliency map - where red color means more attention of the model +- overlay - where the saliency map is combined with the original image: + +.. image:: ../../../../utils/images/explain_wgisd.png + :width: 600 + diff --git a/docs/source/guide/tutorials/base/how_to_train/action_classification.rst b/docs/source/guide/tutorials/base/how_to_train/action_classification.rst new file mode 100644 index 00000000000..e25ba9b2f9d --- /dev/null +++ b/docs/source/guide/tutorials/base/how_to_train/action_classification.rst @@ -0,0 +1,270 @@ +Action Classification model +================================ + +This live example shows how to easily train, validate, optimize and export classification model on the `HMDB51 `_. +To learn more about Action Classification task, refer to :doc:`../../../explanation/algorithms/action/action_classification`. + +.. note:: + To learn more about managing the training process of the model including additional parameters and modification, refer to :doc:`./detection`. + +The process has been tested on the following configuration. + +- Ubuntu 20.04 +- NVIDIA GeForce RTX 3090 +- Intel(R) Core(TM) i9-10980XE +- CUDA Toolkit 11.6 + +.. note:: + + To learn more about the model, algorithm and dataset format, refer to :doc:`action classification explanation <../../../explanation/algorithms/action/action_classification>`. + + +************************* +Setup virtual environment +************************* + +1. You can follow the installation process from a :doc:`quick start guide <../../../get_started/installation>` +to create a universal virtual environment for OpenVINO™ Training Extensions. + +2. Activate your virtual +environment: + +.. 
code-block:: shell + + .otx/bin/activate + # or by this line, if you created an environment, using tox + . venv/otx/bin/activate + +*************************** +Dataset preparation +*************************** + +According to the `documentation `_ provided by mmaction2, you need to ensure that the `HMDB51 `_ dataset is structured as follows: + +.. code-block:: + + training_extensions + ├── data + │ ├── hmdb51 + │ │ ├── hmdb51_{train,val}_split_{1,2,3}_rawframes.txt + │ │ ├── hmdb51_{train,val}_split_{1,2,3}_videos.txt + │ │ ├── annotations + │ │ ├── videos + │ │ │ ├── brush_hair + │ │ │ │ ├── April_09_brush_hair_u_nm_np1_ba_goo_0.avi + │ │ │ ├── wave + │ │ │ │ ├── 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0.avi + │ │ ├── rawframes + │ │ │ ├── brush_hair + │ │ │ │ ├── April_09_brush_hair_u_nm_np1_ba_goo_0 + │ │ │ │ │ ├── img_00001.jpg + │ │ │ │ │ ├── img_00002.jpg + │ │ │ │ │ ├── ... + │ │ │ │ │ ├── flow_x_00001.jpg + │ │ │ │ │ ├── flow_x_00002.jpg + │ │ │ │ │ ├── ... + │ │ │ │ │ ├── flow_y_00001.jpg + │ │ │ │ │ ├── flow_y_00002.jpg + │ │ │ ├── ... + │ │ │ ├── wave + │ │ │ │ ├── 20060723sfjffbartsinger_wave_f_cm_np1_ba_med_0 + │ │ │ │ ├── ... + │ │ │ │ ├── winKen_wave_u_cm_np1_ri_bad_1 + +Once you have the dataset structured properly, copy ``mmaction2/data`` folder, which contains hmdb51 dataset, to ``training_extensions/data``. +Then, you can now convert it to the `CVAT `_ format using the following command: + +.. code-block:: shell + + Need to update! + +The resulting folder structure will be as follows: + +.. code-block:: + + hmdb51 + ├── rawframes + ├── videos + ├── annotations + └── CVAT + ├── train (3570 videos) + │ ├── Video_0 + │ │ ├── annotations.xml + │ │ └── images [101 frames] + │ ├── Video_1 + │ │ ├── annotations.xml + │ │ └── images [105 frames] + │ └── Video_2 + │ ├── annotations.xml + │ └── images [64 frames] + │ + └── valid (1530 videos) + ├── Video_0 + │ ├── annotations.xml + │ └── images [85 frames] + ├── Video_1 + │ ├── annotations.xml + │ └── images [89 frames] + └── Video_2 + ├── annotations.xml + └── images [60 frames] + +********* +Training +********* + +1. You need to choose, which action classification model you want to train. +To see the list of supported templates, run the following command: + +.. note:: + + OpenVINO™ Training Extensions supports X3D and MoViNet template now, other architecture will be supported in future. + +.. code-block:: + + (otx) ...$ otx find --task action_classification + + +-----------------------+--------------------------------------+---------+-----------------------------------------------------------------------+ + | TASK | ID | NAME | BASE PATH | + +-----------------------+--------------------------------------+---------+-----------------------------------------------------------------------+ + | ACTION_CLASSIFICATION | Custom_Action_Classification_X3D | X3D | ../otx/algorithms/action/configs/classification/x3d/template.yaml | + | ACTION_CLASSIFICATION | Custom_Action_Classification_MoViNet | MoViNet | ../otx/algorithms/action/configs/classification/movinet/template.yaml | + +-----------------------+--------------------------------------+---------+-----------------------------------------------------------------------+ + +All commands will be run on the X3D model. It's a light model, that achieves competitive accuracy while keeping the inference fast. + +2. Prepare an OpenVINO™ Training Extensions workspace for +the action classification task by running the following command: + +.. 
code-block:: + + (otx) ...$ otx build --task action_classification --train-data-roots data/hmdb51/CVAT/train/ --val-data-roots data/hmdb51/CVAT/valid + [*] Workspace Path: otx-workspace-ACTION_CLASSIFICATION + [*] Load Model Template ID: Custom_Action_Classification_X3D + [*] Load Model Name: X3D + [*] - Updated: otx-workspace-ACTION_CLASSIFICATION/model.py + [*] - Updated: otx-workspace-ACTION_CLASSIFICATION/data_pipeline.py + [*] Update data configuration file to: otx-workspace-ACTION_CLASSIFICATION/data.yaml + + (otx) ...$ cd ./otx-workspace-ACTION_CLASSIFICATION + +It will create **otx-workspace-ACTION_CLASSIFICATION** with all necessary configs for X3D and prepare ``data.yaml`` to simplify CLI commands. + + +3. To begin training, simply run ``otx train`` +from **within the workspace directory**: + +.. code-block:: + + (otx) ...$ otx train + +That's it! The training will return artifacts: ``weights.pth`` and ``label_schema.json``, which are needed as input for the further commands: ``export``, ``eval``, ``optimize``, etc. + +The training time highly relies on the hardware characteristics. For example, the training took about 10 minutes on a single NVIDIA GeForce RTX 3090. + +After that, you have the PyTorch action classification model trained with OpenVINO™ Training Extensions, which you can use for evaluation, export, optimization and deployment. + +*********** +Validation +*********** + +1. To evaluate the trained model on a specific dataset, use the ``otx eval`` command with +the following arguments: + +The eval function receives test annotation information and model snapshot, trained in the previous step. +Keep in mind that ``label_schema.json`` file contains meta information about the dataset and it should be in the same folder as the model snapshot. + +``otx eval`` will output a frame-wise accuracy for action classification. Note, that top-1 accuracy during training is video-wise accuracy. + +2. The command below will run validation on the dataset +and save performance results in ``outputs/performance.json`` file: + +.. code-block:: + + (otx) ...$ otx eval --test-data-roots ../data/hmdb51/CVAT/valid \ + --load-weights models/weights.pth \ + --output outputs + +You will get a similar validation output: + +.. code-block:: + + ... + + 2023-02-22 00:08:45,156 - mmaction - INFO - Model architecture: X3D + 2023-02-22 00:08:56,766 - mmaction - INFO - Inference completed + 2023-02-22 00:08:56,766 - mmaction - INFO - called evaluate() + 2023-02-22 00:08:59,469 - mmaction - INFO - Final model performance: Performance(score: 0.6646406490691239, dashboard: (3 metric groups)) + 2023-02-22 00:08:59,470 - mmaction - INFO - Evaluation completed + Performance(score: 0.6646406490691239, dashboard: (3 metric groups)) + +********* +Export +********* + +1. ``otx export`` exports a trained Pytorch `.pth` model to the OpenVINO™ Intermediate Representation (IR) format. +It allows running the model on the Intel hardware much more efficiently, especially on the CPU. Also, the resulting IR model is required to run PTQ optimization. IR model consists of two files: ``openvino.xml`` for weights and ``openvino.bin`` for architecture. + +2. Run the command line below to export the trained model +and save the exported model to the ``openvino`` folder. + +.. code-block:: + + (otx) ...$ otx export --load-weights models/weights.pth \ + --output openvino + + ... 
+ 2023-02-21 22:54:32,518 - mmaction - INFO - Model architecture: X3D + Successfully exported ONNX model: /tmp/OTX-task-a7wekgbc/openvino.onnx + mo --input_model=/tmp/OTX-task-a7wekgbc/openvino.onnx --mean_values=[0.0, 0.0, 0.0] --scale_values=[255.0, 255.0, 255.0] --output_dir=/tmp/OTX-task-a7wekgbc --output=logits --data_type=FP32 --source_layout=??c??? --input_shape=[1, 1, 3, 8, 224, 224] + [ WARNING ] Use of deprecated cli option --data_type detected. Option use in the following releases will be fatal. + [ INFO ] The model was converted to IR v11, the latest model format that corresponds to the source DL framework input/output format. While IR v11 is backwards compatible with OpenVINO Inference Engine API v1.0, please use API v2.0 (as of 2022.1) to take advantage of the latest improvements in IR v11. + Find more information about API v2.0 and IR v11 at https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html + [ SUCCESS ] Generated IR version 11 model. + [ SUCCESS ] XML file: /tmp/OTX-task-a7wekgbc/openvino.xml + [ SUCCESS ] BIN file: /tmp/OTX-task-a7wekgbc/openvino.bin + 2023-02-21 22:54:35,424 - mmaction - INFO - Exporting completed + + +3. Check the accuracy of the IR optimimodel and the consistency between the exported model and the PyTorch model, +using ``otx eval`` and passing the IR model path to the ``--load-weights`` parameter. + +.. code-block:: + + (otx) ...$ otx eval --test-data-roots ../data/hmdb51/CVAT/valid \ + --load-weights openvino/openvino.xml \ + --output outputs/openvino + + ... + + Performance(score: 0.6357698983041397, dashboard: (3 metric groups)) + + +************* +Optimization +************* + +1. You can further optimize the model with ``otx optimize``. +Currently, quantization jobs that include PTQ is supported for X3D template. MoViNet will be supported in near future. + +The optimized model will be quantized to ``INT8`` format. +Refer to :doc:`optimization explanation <../../../explanation/additional_features/models_optimization>` section for more details on model optimization. + +2. Example command for optimizing +OpenVINO™ model (.xml) with OpenVINO™ PTQ. + +.. code-block:: + + (otx) ...$ otx optimize --load-weights openvino/openvino.xml \ + --output ptq_model + + ... + + Performance(score: 0.6252587703095486, dashboard: (3 metric groups)) + +Keep in mind that PTQ will take some time (generally less than NNCF optimization) without logging to optimize the model. + +3. Now, you have fully trained, optimized and exported an +efficient model representation ready-to-use action classification model. + +The examples are provided with an object detection model, but it is easy to apply them for action classification by substituting the object detection model with classification one. diff --git a/docs/source/guide/tutorials/base/how_to_train/action_detection.rst b/docs/source/guide/tutorials/base/how_to_train/action_detection.rst new file mode 100644 index 00000000000..6103735beb0 --- /dev/null +++ b/docs/source/guide/tutorials/base/how_to_train/action_detection.rst @@ -0,0 +1,225 @@ +Action Detection model +================================ + +This live example shows how to easily train and validate for spatio-temporal action detection model on the subset of `JHMDB `_. +To learn more about Action Detection task, refer to :doc:`../../../explanation/algorithms/action/action_detection`. + +.. note:: + + To learn deeper how to manage training process of the model including additional parameters and its modification, refer to :doc:`./detection`. 
+ +The process has been tested on the following configuration. + +- Ubuntu 20.04 +- NVIDIA GeForce RTX 3090 +- Intel(R) Core(TM) i9-10980XE +- CUDA Toolkit 11.1 + +************************* +Setup virtual environment +************************* + +1. You can follow the installation process from a :doc:`quick start guide <../../../get_started/installation>` +to create a universal virtual environment for OpenVINO™ Training Extensions. + +2. Activate your virtual +environment: + +.. code-block:: + + .otx/bin/activate + # or by this line, if you created an environment, using tox + . venv/otx/bin/activate + + +*************************** +Dataset preparation +*************************** + +Although we offer conversion codes from `ava dataset format `_ to `cvat dataset format `_ from `this code `_, for easy beginning you can download subset of JHMDB dataset, which already transformed to CVAT format from `this link `_. + +If you download data from link and extract to ``training_extensions/data`` folder(you should make data folder at first), you can see the structure below: + +.. code-block:: + + training_extensions + └── data + └── JHMDB_5% + ├── train + │ └── brush_hair_Brushing_Hair_with_Beth_brush_hair_h_nm_np1_le_goo_0 + │ ├── annotations.xml + │ └── images [40 frames] + │ + │── test + │ └── brush_hair_Aussie_Brunette_Brushing_Long_Hair_brush_hair_u_nm_np1_fr_med_0 + │ ├── annotations.xml + │ └── images [40 frames] + │ + │── train.pkl + └── test.pkl + + +********* +Training +********* + +1. First of all, you need to choose which action detection model you want to train. +The list of supported templates for action detection is available with the command line below: + +.. note:: + + The characteristics and detailed comparison of the models could be found in :doc:`Explanation section <../../../explanation/algorithms/action/action_detection>`. + +.. code-block:: + + (otx) ...$ otx find --task action_detection + + +------------------+---------------------------------------+---------------+-------------------------------------------------------------------------+ + | TASK | ID | NAME | BASE PATH | + +------------------+---------------------------------------+---------------+-------------------------------------------------------------------------+ + | ACTION_DETECTION | Custom_Action_Detection_X3D_FAST_RCNN | X3D_FAST_RCNN | src/otx/algorithms/action/configs/detection/x3d_fast_rcnn/template.yaml | + +------------------+---------------------------------------+---------------+-------------------------------------------------------------------------+ + +To have a specific example in this tutorial, all commands will be run on the X3D_FAST_RCNN model. It's a light model, that achieves competitive accuracy while keeping the inference fast. + +2. Next, we need to create workspace +for various tasks we provide. + +Let's prepare an OpenVINO™ Training Extensions action detection workspace running the following command: + +.. 
code-block:: + + (otx) ...$ otx build x3d_fast_rcnn --train-data-roots ./data/JHMDB_5%/train --val-data-roots ./data/JHMDB_5%/test + + [*] Workspace Path: otx-workspace-ACTION_DETECTION + [*] Load Model Template ID: Custom_Action_Detection_X3D_FAST_RCNN + [*] Load Model Name: X3D_FAST_RCNN + [*] - Updated: otx-workspace-ACTION_DETECTION/model.py + [*] - Updated: otx-workspace-ACTION_DETECTION/data_pipeline.py + [*] Update data configuration file to: otx-workspace-ACTION_DETECTION/data.yaml + + (otx) ...$ cd ./otx-workspace-ACTION_DETECTION + +It will create **otx-workspace-ACTION_DETECTION** with all necessary configs for X3D_FAST_RCNN, prepared ``data.yaml`` to simplify CLI commands launch and splitted dataset. + +3. To start training we need to call ``otx train`` +command in our workspace: + +.. code-block:: + + (otx) ...$ otx train + +That's it! The training will return artifacts: ``weights.pth`` and ``label_schema.json``, which are needed as input for the further commands: ``export``, ``eval``, ``optimize``, etc. + +The training time highly relies on the hardware characteristics, for example on 1 NVIDIA GeForce RTX 3090 the training took about 70 minutes. + +After that, we have the PyTorch action detection model trained with OpenVINO™ Training Extensions. + +*********** +Validation +*********** + +1. ``otx eval`` runs evaluation of a trained +model on a specific dataset. + +The eval function receives test annotation information and model snapshot, trained in the previous step. +Please note, ``label_schema.json`` file contains meta information about the dataset and it should be located in the same folder as the model snapshot. + +``otx eval`` will output a mAP score for spatio-temporal action detection. + +2. The command below will run validation on our dataset +and save performance results in ``outputs/performance.json`` file: + +.. code-block:: + + (otx) ...$ otx eval --test-data-roots ../data/JHMDB_5%/test \ + --load-weights models/weights.pth \ + --output outputs + +We will get a similar to this validation output after some validation time (about 2 minutes): + +.. code-block:: + + 2023-02-21 22:42:14,540 - mmaction - INFO - Loaded model weights from Task Environment + 2023-02-21 22:42:14,540 - mmaction - INFO - Model architecture: X3D_FAST_RCNN + 2023-02-21 22:42:14,739 - mmaction - INFO - Patching pre proposals... + 2023-02-21 22:42:14,749 - mmaction - INFO - Done. + 2023-02-21 22:44:24,345 - mmaction - INFO - Inference completed + 2023-02-21 22:44:24,347 - mmaction - INFO - called evaluate() + 2023-02-21 22:44:26,349 - mmaction - INFO - Final model performance: Performance(score: 0.5086285195277019, dashboard: (1 metric groups)) + 2023-02-21 22:44:26,349 - mmaction - INFO - Evaluation completed + Performance(score: 0.5086285195277019, dashboard: (1 metric groups)) + +.. note:: + + Currently we don't support export and optimize task in action detection. We will support these features very near future. + + +********* +Export +********* + +1. ``otx export`` exports a trained Pytorch `.pth` model to the OpenVINO™ Intermediate Representation (IR) format. +It allows running the model on the Intel hardware much more efficiently, especially on the CPU. Also, the resulting IR model is required to run PTQ optimization. IR model consists of two files: ``openvino.xml`` for weights and ``openvino.bin`` for architecture. + +2. Run the command line below to export the trained model +and save the exported model to the ``openvino`` folder. + +.. 
code-block:: + + (otx) ...$ otx export + + 2023-03-24 15:03:35,993 - mmdeploy - INFO - Export PyTorch model to ONNX: /tmp/OTX-task-ffw8llin/openvino.onnx. + 2023-03-24 15:03:44,450 - mmdeploy - INFO - Args for Model Optimizer: mo --input_model="/tmp/OTX-task-ffw8llin/openvino.onnx" --output_dir="/tmp/OTX-task-ffw8llin/" --output="bboxes,labels" --input="input" --input_shape="[1, 3, 32, 256, 256]" --mean_values="[123.675, 116.28, 103.53]" --scale_values="[58.395, 57.12, 57.375]" --source_layout=bctwh + 2023-03-24 15:03:46,707 - mmdeploy - INFO - [ INFO ] The model was converted to IR v11, the latest model format that corresponds to the source DL framework input/output format. While IR v11 is backwards compatible with OpenVINO Inference Engine API v1.0, please use API v2.0 (as of 2022.1) to take advantage of the latest improvements in IR v11. + Find more information about API v2.0 and IR v11 at https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html + [ SUCCESS ] Generated IR version 11 model. + [ SUCCESS ] XML file: /tmp/OTX-task-ffw8llin/openvino.xml + [ SUCCESS ] BIN file: /tmp/OTX-task-ffw8llin/openvino.bin + +2023-03-24 15:03:46,707 - mmdeploy - INFO - Successfully exported OpenVINO model: /tmp/OTX-task-ffw8llin/openvino.xml +2023-03-24 15:03:46,756 - mmaction - INFO - Exporting completed + + +3. Check the accuracy of the IR model and the consistency between the exported model and the PyTorch model, +using ``otx eval`` and passing the IR model path to the ``--load-weights`` parameter. + +.. code-block:: + + (otx) ...$ otx eval --test-data-roots ../data/JHMDB_5%/test \ + --load-weights model-exported/openvino.xml \ + --save-performance model-exported/performance.json + + ... + + Performance(score: 0.47351524879614754, dashboard: (3 metric groups)) + + +************* +Optimization +************* + +1. You can further optimize the model with ``otx optimize``. +Currently, only PTQ is supported for action detection. NNCF will be supported in near future. + +The optimized model will be quantized to ``INT8`` format. +Refer to :doc:`optimization explanation <../../../explanation/additional_features/models_optimization>` section for more details on model optimization. + +2. Example command for optimizing +OpenVINO™ model (.xml) with OpenVINO™ PTQ. + +.. code-block:: + + (otx) ...$ otx optimize --load-weights openvino/openvino.xml \ + --save-model-to ptq_model + + ... + + [*] Update data configuration file to: data.yaml + Statistics collection: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [04:16<00:00, 1.17it/s]Biases correction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 168/168 [00:15<00:00, 10.63it/s][>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 1572/1572, 7.3 task/s, elapsed: 216s, ETA: 0s + Performance(score: 0.4621155288822204, dashboard: (1 metric groups)) + +Keep in mind that PTQ will take some time (generally less than NNCF optimization) without logging to optimize the model. + +3. Now, you have fully trained, optimized and exported an +efficient model representation ready-to-use action detection model. 
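+
+If you also want to double-check the accuracy of the optimized model, the same ``otx eval`` pattern used for the exported IR above should apply. The command below is only a sketch and assumes the previous step saved the optimized IR as ``ptq_model/openvino.xml``:
+
+.. code-block::
+
+    (otx) ...$ otx eval --test-data-roots ../data/JHMDB_5%/test \
+                        --load-weights ptq_model/openvino.xml \
+                        --save-performance ptq_model/performance.json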
diff --git a/docs/source/guide/tutorials/base/how_to_train/anomaly_detection.rst b/docs/source/guide/tutorials/base/how_to_train/anomaly_detection.rst new file mode 100644 index 00000000000..7f15cbdf329 --- /dev/null +++ b/docs/source/guide/tutorials/base/how_to_train/anomaly_detection.rst @@ -0,0 +1,398 @@ +Anomaly Detection Tutorial +================================ + +This tutorial demonstrates how to train, evaluate, and deploy a classification, detection, or segmentation model for anomaly detection in industrial or medical applications. +Read :doc:`../../../explanation/algorithms/anomaly/index` for more information about the Anomaly tasks. + +.. note:: + To learn more about managing the training process of the model including additional parameters and its modification, refer to :doc:`./detection`. + +The process has been tested with the following configuration: + +- Ubuntu 20.04 +- NVIDIA GeForce RTX 3090 +- Intel(R) Core(TM) i9-11900 +- CUDA Toolkit 11.8 + + +***************************** +Setup the Virtual environment +***************************** + +1. To create a universal virtual environment for OpenVINO™ Training Extensions, +please follow the installation process in the :doc:`quick start guide <../../../get_started/installation>`. + +2. Activate your virtual +environment: + +.. code-block:: shell + + .otx/bin/activate + # or by this line, if you created an environment, using tox + . venv/otx/bin/activate + +************************** +Dataset Preparation +************************** + +1. For this example, we will use the `MVTec `_ dataset. +You can download the dataset from the link above. We will use the ``bottle`` category for this tutorial. + +2. This is how it might look like in your +file system: + +.. code-block:: + + datasets/MVTec/bottle + ├── ground_truth + │ ├── broken_large + │ │ ├── 000_mask.png + │ │ ├── 001_mask.png + │ │ ├── 002_mask.png + │ │ ... + │ ├── broken_small + │ │ ├── 000_mask.png + │ │ ├── 001_mask.png + │ │ ... + │ └── contamination + │ ├── 000_mask.png + │ ├── 001_mask.png + │ ... + ├── license.txt + ├── readme.txt + ├── test + │ ├── broken_large + │ │ ├── 000.png + │ │ ├── 001.png + │ │ ... + │ ├── broken_small + │ │ ├── 000.png + │ │ ├── 001.png + │ │ ... + │ ├── contamination + │ │ ├── 000.png + │ │ ├── 001.png + │ │ ... + │ └── good + │ ├── 000.png + │ ├── 001.png + │ ... + └── train + └── good + ├── 000.png + ├── 001.png + ... + +*************************** +Training +*************************** + +1. For this example let's look at the +anomaly detection tasks + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx find --task ANOMALY_DETECTION + ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Task ┃ Model Name ┃ Recipe Path ┃ + ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ ANOMALY_DETECTION │ stfpm │ src/otx/recipe/anomaly_detection/stfpm.yaml │ + │ ANOMALY_DETECTION │ padim │ src/otx/recipe/anomaly_detection/padim.yaml │ + └───────────────────┴────────────┴─────────────────────────────────────────────┘ + + .. tab-item:: API + + .. code-block:: python + + from otx.engine.utils.api import list_models + + model_lists = list_models(task="ANOMALY_DETECTION") + print(model_lists) + ''' + ['stfpm', 'padim'] + ''' + +You can see two anomaly detection models, STFPM and PADIM. For more detail on each model, refer to Anomalib's `STFPM `_ and `PADIM `_ documentation. + +2. Let's proceed with PADIM for +this example. + +.. tab-set:: + + .. 
tab-item:: CLI (auto-config) + + .. code-block:: shell + + (otx) ...$ otx train --data_root datasets/MVTec/bottle \ + --task ANOMALY_DETECTION + + .. tab-item:: CLI (with config) + + .. code-block:: shell + + (otx) ...$ otx train --config src/otx/recipe/anomaly_detection/padim.yaml \ + --data_root datasets/MVTec/bottle + + .. tab-item:: API (from_config) + + .. code-block:: python + + from otx.engine import Engine + + data_root = "datasets/MVTec/bottle" + recipe = "src/otx/recipe/anomaly_detection/padim.yaml" + + engine = Engine.from_config( + config_path=recipe, + data_root=data_root, + work_dir="otx-workspace", + ) + + engine.train(...) + + .. tab-item:: API + + .. code-block:: python + + from otx.engine import Engine + + data_root = "datasets/MVTec/bottle" + + engine = Engine( + model="padim", + data_root=data_root, + task="ANOMALY_DETECTION", + work_dir="otx-workspace", + ) + + engine.train(...) + + +3. ``(Optional)`` Additionally, we can tune training parameters such as batch size, learning rate, patience epochs. +Learn more about specific parameters using ``otx train --help -v`` or ``otx train --help -vv``. + +For example, to decrease the batch size to 4, fix the number of epochs to 100, extend the command line above with the following line. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... --data.config.train_subset.batch_size 4 \ + --max_epochs 100 + + .. tab-item:: API + + .. code-block:: python + + from otx.core.config.data import DataModuleConfig, SubsetConfig + from otx.core.data.module import OTXDataModule + from otx.engine import Engine + + data_config = DataModuleConfig(..., train_subset=SubsetConfig(..., batch_size=4)) + datamodule = OTXDataModule(..., config=data_config) + + engine = Engine(..., datamodule=datamodule) + + engine.train(max_epochs=100) + +4. The training result ``checkpoints/*.ckpt`` file is located in ``{work_dir}`` folder, +while training logs can be found in the ``{work_dir}/{timestamp}`` dir. + +This will start training and generate artifacts for commands such as ``export`` and ``optimize``. You will notice the ``otx-workspace`` directory in your current working directory. This is where all the artifacts are stored. + +************** +Evaluation +************** + +Now we have trained the model, let's see how it performs on a specific dataset. In this example, we will use the same dataset to generate evaluation metrics. To perform evaluation you need to run the following commands: + +.. tab-set:: + + .. tab-item:: CLI (with work_dir) + + .. code-block:: shell + + (otx) ...$ otx test --work_dir otx-workspace + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Test metric ┃ DataLoader 0 ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ image_AUROC │ 0.8 │ + │ image_F1Score │ 0.8 │ + │ pixel_AUROC │ 0.8 │ + │ pixel_F1Score │ 0.8 │ + │ test/data_time │ 0.6517705321311951 │ + │ test/iter_time │ 0.6630784869194031 │ + └───────────────────────────┴───────────────────────────┘ + + .. tab-item:: CLI (with config) + + .. 
code-block:: shell + + (otx) ...$ otx test --config src/otx/recipe/anomaly_detection/padim.yaml \ + --data_root datasets/MVTec/bottle \ + --checkpoint otx-workspace/20240313_042421/checkpoints/epoch_010.ckpt + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Test metric ┃ DataLoader 0 ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ image_AUROC │ 0.8 │ + │ image_F1Score │ 0.8 │ + │ pixel_AUROC │ 0.8 │ + │ pixel_F1Score │ 0.8 │ + │ test/data_time │ 0.6517705321311951 │ + │ test/iter_time │ 0.6630784869194031 │ + └───────────────────────────┴───────────────────────────┘ + + .. tab-item:: API + + .. code-block:: python + + engine.test() + + +The primary metric here is the f-measure computed against the ground-truth bounding boxes. It is also called the local score. In addition, f-measure is also used to compute the global score. The global score is computed based on the global label of the image. That is, the image is anomalous if it contains at least one anomaly. This global score is stored as an additional metric. + +.. note:: + + All task types report Image-level F-measure as the primary metric. In addition, both localization tasks (anomaly detection and anomaly segmentation) also report localization performance (F-measure for anomaly detection and Dice-coefficient for anomaly segmentation). + +******* +Export +******* + +1. ``otx export`` exports a trained Pytorch `.pth` model to the OpenVINO™ Intermediate Representation (IR) format. +It allows running the model on the Intel hardware much more efficient, especially on the CPU. Also, the resulting IR model is required to run PTQ optimization. IR model consists of 2 files: ``exported_model.xml`` for weights and ``exported_model.bin`` for architecture. + +2. We can run the below command line to export the trained model +and save the exported model to the ``openvino`` folder: + +.. tab-set:: + + .. tab-item:: CLI (with work_dir) + + .. code-block:: shell + + (otx) ...$ otx export --work_dir otx-workspace + ... + Elapsed time: 0:00:06.588245 + + .. tab-item:: CLI (with config) + + .. code-block:: shell + + (otx) ...$ otx export ... --checkpoint otx-workspace/20240313_042421/checkpoints/epoch_010.ckpt + ... + Elapsed time: 0:00:06.588245 + + .. tab-item:: API + + .. code-block:: python + + engine.export() + +Now that we have the exported model, let's check its performance using ``otx test``: + +.. tab-set:: + + .. tab-item:: CLI (with work_dir) + + .. code-block:: shell + + (otx) ...$ otx test --work_dir otx-workspace \ + --checkpoint otx-workspace/20240313_052847/exported_model.xml \ + --engine.device cpu + ... + + .. tab-item:: CLI (with config) + + .. code-block:: shell + + (otx) ...$ otx test --config src/otx/recipe/anomaly_detection/padim.yamll \ + --data_root data/wgisd \ + --checkpoint otx-workspace/20240312_052847/exported_model.xml \ + --engine.device cpu + ... + + .. tab-item:: API + + .. code-block:: python + + exported_model = engine.export() + engine.test(checkpoint=exported_model) + + +************ +Optimization +************ + +Anomaly tasks can be optimized either in PTQ or NNCF format. The model will be quantized to ``INT8`` format. +For more information refer to the :doc:`optimization explanation <../../../explanation/additional_features/models_optimization>` section. + + +1. Let's start with PTQ +optimization. + +.. tab-set:: + + .. tab-item:: CLI + + .. 
code-block:: shell + + (otx) ...$ otx optimize --work_dir otx-workspace \ + --checkpoint otx-workspace/20240312_052847/exported_model.xml + + ... + Statistics collection ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 30/30 • 0:00:14 • 0:00:00 + Applying Fast Bias correction ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 58/58 • 0:00:02 • 0:00:00 + Elapsed time: 0:00:24.958733 + + .. tab-item:: API + + .. code-block:: python + + ckpt_path = "otx-workspace/20240312_052847/exported_model.xml" + engine.optimize(checkpoint=ckpt_path) + +Please note, that PTQ will take some time without logging to optimize the model. + +3. Finally, we can also evaluate the optimized model by passing +it to the ``otx test`` function. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx test --work_dir otx-workspace \ + --checkpoint otx-workspace/20240313_055042/optimized_model.xml \ + --engine.device cpu + + ... + Elapsed time: 0:00:10.260521 + + .. tab-item:: API + + .. code-block:: python + + ckpt_path = "otx-workspace/20240313_055042/optimized_model.xml" + engine.test(checkpoint=ckpt_path) + + +******************************* +Segmentation and Classification +******************************* + +While the above example shows Anomaly Detection, you can also train Anomaly Segmentation and Classification models. +To see what tasks are available, you can pass ``ANOMALY_SEGMENTATION`` and ``ANOMALY_CLASSIFICATION`` to ``otx find`` mentioned in the `Training`_ section. You can then use the same commands to train, evaluate, export and optimize the models. + +.. note:: + + The Segmentation and Detection tasks also require that the ``ground_truth`` masks be present to ensure that the localization metrics are computed correctly. + The ``ground_truth`` masks are not required for the Classification task. + diff --git a/docs/source/guide/tutorials/base/how_to_train/classification.rst b/docs/source/guide/tutorials/base/how_to_train/classification.rst new file mode 100644 index 00000000000..1316c903826 --- /dev/null +++ b/docs/source/guide/tutorials/base/how_to_train/classification.rst @@ -0,0 +1,264 @@ +Classification model +================================ + +This live example shows how to easily train, validate, optimize and export classification model on the `flowers dataset `_ from TensorFlow. +To learn more about Classification task, refer to :doc:`../../../explanation/algorithms/classification/index`. + +.. note:: + + To learn deeper how to manage training process of the model including additional parameters and its modification, refer to :doc:`./detection`. + +The process has been tested on the following configuration. + +- Ubuntu 20.04 +- NVIDIA GeForce RTX 3090 +- Intel(R) Core(TM) i9-10980XE +- CUDA Toolkit 11.8 + +.. note:: + + While this example shows how to work with :doc:`multi-class classification <../../../explanation/algorithms/classification/multi_class_classification>`, it is easy to extend it for the :doc:`multi-label <../../../explanation/algorithms/classification/multi_label_classification>` or :doc:`hierarchical <../../../explanation/algorithms/classification/hierarhical_classification>` classification. + Substitute the dataset with a multi-label or hierarchical one. Everything else remains the same. + + +************************* +Setup virtual environment +************************* + +1. 
You can follow the installation process from a :doc:`quick start guide <../../../get_started/installation>` +to create a universal virtual environment for OpenVINO™ Training Extensions. + +2. Activate your virtual +environment: + +.. code-block:: shell + + .otx/bin/activate + # or by this line, if you created an environment, using tox + . venv/otx/bin/activate + +*************************** +Dataset preparation +*************************** + +Download and prepare a `flowers dataset `_ +with the following command: + +.. code-block:: shell + + cd data + wget http://download.tensorflow.org/example_images/flower_photos.tgz + tar -xzvf flower_photos.tgz + cd .. + +| + +.. image:: ../../../../../utils/images/flowers_example.jpg + :width: 600 + +| + +This dataset contains images of 5 different flower categories and is stored in the ImageNet format which is supported by OpenVINO™ Training Extensions: + +.. code-block:: + + flower_photos + ├── daisy + ├── dandelion + ├── roses + ├── sunflowers + ├── tulips + + +********* +Training +********* + +1. First of all, you need to choose which classification model you want to train. +The list of supported templates for classification is available with the command line below. + +.. note:: + + The characteristics and detailed comparison of the models could be found in :doc:`Explanation section <../../../explanation/algorithms/classification/multi_class_classification>`. + + You also can modify the architecture of supported models with various backbones. To do that, please refer to the :doc:`advanced tutorial for model customization <../../advanced/backbones>`. + +.. code-block:: shell + + (otx) ...$ otx find --task MULTI_CLSS_CLS + ┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Task ┃ Model Name ┃ Recipe Path ┃ + ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ MULTI_CLASS_CLS │ openvino_model │ src/otx/recipe/classification/multi_class_cls/openvino_model.yaml │ + │ MULTI_CLASS_CLS │ tv_efficientnet_b0 │ src/otx/recipe/classification/multi_class_cls/tv_efficientnet_b0.yaml │ + │ MULTI_CLASS_CLS │ tv_resnet_50 │ src/otx/recipe/classification/multi_class_cls/tv_resnet_50.yaml │ + │ MULTI_CLASS_CLS │ efficientnet_v2_light │ src/otx/recipe/classification/multi_class_cls/efficientnet_v2_light.yaml │ + │ MULTI_CLASS_CLS │ tv_efficientnet_b3 │ src/otx/recipe/classification/multi_class_cls/tv_efficientnet_b3.yaml │ + │ MULTI_CLASS_CLS │ efficientnet_b0_light │ src/otx/recipe/classification/multi_class_cls/efficientnet_b0_light.yaml │ + │ MULTI_CLASS_CLS │ tv_efficientnet_v2_l │ src/otx/recipe/classification/multi_class_cls/tv_efficientnet_v2_l.yaml │ + │ MULTI_CLASS_CLS │ tv_efficientnet_b1 │ src/otx/recipe/classification/multi_class_cls/tv_efficientnet_b1.yaml │ + │ MULTI_CLASS_CLS │ tv_mobilenet_v3_small │ src/otx/recipe/classification/multi_class_cls/tv_mobilenet_v3_small.yaml │ + │ MULTI_CLASS_CLS │ otx_mobilenet_v3_large │ src/otx/recipe/classification/multi_class_cls/otx_mobilenet_v3_large.yaml │ + │ MULTI_CLASS_CLS │ otx_deit_tiny │ src/otx/recipe/classification/multi_class_cls/otx_deit_tiny.yaml │ + │ MULTI_CLASS_CLS │ tv_efficientnet_b4 │ src/otx/recipe/classification/multi_class_cls/tv_efficientnet_b4.yaml │ + │ MULTI_CLASS_CLS │ otx_efficientnet_v2 │ src/otx/recipe/classification/multi_class_cls/otx_efficientnet_v2.yaml │ + │ MULTI_CLASS_CLS │ mobilenet_v3_large_light │ 
src/otx/recipe/classification/multi_class_cls/mobilenet_v3_large_light.yaml │ + │ MULTI_CLASS_CLS │ otx_efficientnet_b0 │ src/otx/recipe/classification/multi_class_cls/otx_efficientnet_b0.yaml │ + │ MULTI_CLASS_CLS │ otx_dino_v2 │ src/otx/recipe/classification/multi_class_cls/otx_dino_v2.yaml │ + │ MULTI_CLASS_CLS │ otx_dino_v2_linear_probe │ src/otx/recipe/classification/multi_class_cls/otx_dino_v2_linear_probe.yaml │ + └─────────────────┴──────────────────────────┴────────────────────────────────────────────────────────────────────────────────┘ + +To have a specific example in this tutorial, all commands will be run on the :ref:`otx_mobilenet_v3_large ` model. It's a light model, that achieves competitive accuracy while keeping the inference fast. + +2. Next, you need to create train/validation sets. OpenVINO™ Training Extensions supports auto-split functionality for the multi-class classification. +For other classification types you need to prepare splits in advance. + +.. note:: + + Currently, OpenVINO™ Training Extensions supports auto-split only for multi-class classification. For the multi-label and hierarchical tasks you need to prepare data splits in advance. + +Let's prepare an OpenVINO™ Training Extensions classification workspace running the following command: + +.. code-block:: shell + + (otx) ...$ otx train --config src/otx/recipe/classification/multi_class_cls/otx_mobilenet_v3_large.yaml --data_root data/flower_photos --print_config + + data_root: data/flower_photos + work_dir: otx-regression + callback_monitor: val/accuracy + disable_infer_num_classes: false + engine: + task: MULTI_CLASS_CLS + device: auto + data: + ... + +3. To start training you need to call ``otx train`` + +.. code-block:: shell + + (otx) ...$ otx train --config src/otx/recipe/classification/multi_class_cls/otx_mobilenet_v3_large.yaml --data_root data/flower_photos + +That's it! The training will return artifacts: ``weights.pth`` and ``label_schema.json``, which are needed as input for the further commands: ``export``, ``eval``, ``optimize``, etc. + +The training time highly relies on the hardware characteristics, for example on 1 NVIDIA GeForce RTX 3090 the training took about 8 minutes. + +After that, you have the PyTorch classification model trained with OpenVINO™ Training Extensions, which you can use for evaluation, export, optimization and deployment. + +.. note:: + If you specified ``--workspace``, you also can visualize the training using ``Tensorboard`` as these logs are located in ``/tf_logs``. + +*********** +Validation +*********** + +1. ``otx eval`` runs evaluation of a trained +model on a specific dataset. + +The eval function receives test annotation information and model snapshot, trained in the previous step. +Please note, ``label_schema.json`` file contains meta information about the dataset and it should be located in the same folder as the model snapshot. + +``otx eval`` will calculate a top-1 accuracy score for multi-class classification. + +2. The command below will run validation on our dataset +and save performance results in ``performance.json`` file: + +.. code-block:: + + (otx) ...$ otx eval --test-data-roots splitted_dataset/val \ + --load-weights models/weights.pth \ + --output outputs + +You will get a similar validation output: + +.. code-block:: + + ... + + 2023-02-03 23:43:29,514 | INFO : run task done. 
+ 2023-02-03 23:43:35,859 | INFO : called evaluate() + 2023-02-03 23:43:35,870 | INFO : Accuracy after evaluation: 0.9659400544959128 + 2023-02-03 23:43:35,871 | INFO : Evaluation completed + Performance(score: 0.9659400544959128, dashboard: (3 metric groups)) + +********* +Export +********* + +1. ``otx export`` exports a trained Pytorch `.pth` model to the OpenVINO™ Intermediate Representation (IR) format. +It allows running the model on the Intel hardware much more efficient, especially on the CPU. Also, the resulting IR model is required to run PTQ optimization. IR model consists of 2 files: ``openvino.xml`` for weights and ``openvino.bin`` for architecture. + +2. You can run the below command line to export the trained model +and save the exported model to the ``openvino_model`` folder: + +.. code-block:: + + (otx) ...$ otx export --load-weights models/weights.pth \ + --output openvino_model + + ... + + 2023-02-02 03:23:03,057 | INFO : run task done. + 2023-02-02 03:23:03,064 | INFO : Exporting completed + + +3. You can check the accuracy of the IR model and the consistency between the exported model and the PyTorch model, +using ``otx eval`` and passing the IR model path to the ``--load-weights`` parameter. + +.. code-block:: + + (otx) ...$ otx eval --test-data-roots splitted_dataset/val \ + --load-weights openvino_model/openvino.xml \ + --output openvino_model + + ... + + Performance(score: 0.9659400544959128, dashboard: (3 metric groups)) + + +************* +Optimization +************* + +1. You can further optimize the model with ``otx optimize``. +It uses NNCF or PTQ depending on the model and transforms it to ``INT8`` format. + +Please, refer to :doc:`optimization explanation <../../../explanation/additional_features/models_optimization>` section for more details on model optimization. + +2. Command example for optimizing +a PyTorch model (`.pth`) with OpenVINO™ NNCF. + +.. code-block:: + + (otx) ...$ otx optimize --load-weights models/weights.pth --output nncf_model + + ... + + INFO:nncf:Loaded 983/983 parameters + 2023-02-04 00:06:11,725 | INFO : run task done. + 2023-02-04 00:06:16,924 | INFO : called evaluate() + 2023-02-04 00:06:16,935 | INFO : Accuracy after evaluation: 0.9591280653950953 + 2023-02-04 00:06:16,936 | INFO : Evaluation completed + Performance(score: 0.9591280653950953, dashboard: (3 metric groups)) + +The optimization time relies on the hardware characteristics, for example on 1 NVIDIA GeForce RTX 3090 and Intel(R) Core(TM) i9-10980XE it took about 10 minutes. + +3. Command example for optimizing +OpenVINO™ model (.xml) with OpenVINO™ PTQ. + +.. code-block:: + + (otx) ...$ otx optimize --load-weights openvino_model/openvino.xml \ + --output ptq_model + + ... + + Performance(score: 0.9577656675749319, dashboard: (3 metric groups)) + +Please note, that PTQ will take some time (generally less than NNCF optimization) without logging to optimize the model. + +4. Now you have fully trained, optimized and exported an +efficient model representation ready-to-use classification model. + +The following tutorials provide further steps on how to :doc:`deploy <../deploy>` and use your model in the :doc:`demonstration mode <../demo>` and visualize results. +The examples are provided with an object detection model, but it is easy to apply them for classification by substituting the object detection model with classification one. 
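+
+In addition to the OTX CLI, the exported IR can be loaded directly with the OpenVINO™ Python runtime. The snippet below is a rough single-image inference sketch, not a substitute for the deployment tutorial above; the NCHW input layout, the resize-only preprocessing and the image path are assumptions that should be checked against the exported model.
+
+.. code-block:: python
+
+    import cv2
+    import numpy as np
+    from openvino.runtime import Core
+
+    core = Core()
+    model = core.read_model("openvino_model/openvino.xml")
+    compiled = core.compile_model(model, "CPU")
+
+    # Assumed NCHW layout: query the expected spatial size from the model
+    _, _, height, width = compiled.input(0).shape
+
+    image = cv2.imread("flower_photos/daisy/example.jpg")   # placeholder path
+    blob = cv2.resize(image, (width, height)).transpose(2, 0, 1)
+    blob = blob[np.newaxis].astype(np.float32)
+
+    # Run inference and take the class with the highest score
+    scores = compiled([blob])[compiled.output(0)]
+    print("Predicted class index:", int(np.argmax(scores)))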
diff --git a/docs/source/guide/tutorials/base/how_to_train/detection.rst b/docs/source/guide/tutorials/base/how_to_train/detection.rst new file mode 100644 index 00000000000..5be5dc518fa --- /dev/null +++ b/docs/source/guide/tutorials/base/how_to_train/detection.rst @@ -0,0 +1,573 @@ +Object Detection model +====================== + +This tutorial reveals end-to-end solution from installation to model export and optimization for object detection task on a specific example. + +To learn more about Object Detection task, refer to :doc:`../../../explanation/algorithms/object_detection/object_detection`. + +On this page, we show how to train, validate, export and optimize ATSS model on WGISD public dataset. + +To have a specific example in this tutorial, all commands will be run on the ATSS model. It's a medium model, that achieves relatively high accuracy while keeping the inference fast. + +The process has been tested on the following configuration. + +- Ubuntu 20.04 +- NVIDIA GeForce RTX 3090 +- Intel(R) Core(TM) i9-11900 +- CUDA Toolkit 11.8 + + + +************************* +Setup virtual environment +************************* + +1. You can follow the installation process from a :doc:`quick start guide <../../../get_started/installation>` +to create a universal virtual environment for OpenVINO™ Training Extensions. + +2. Activate your virtual +environment: + +.. code-block:: shell + + .otx/bin/activate + # or by this line, if you created an environment, using tox + . venv/otx/bin/activate + + +.. _wgisd_dataset_descpiption: + +*************************** +Dataset preparation +*************************** + +.. note:: + + Currently, we support the following object detection dataset formats: + + - `COCO `_ + - `Pascal-VOC `_ + - `YOLO `_ + +1. Clone a repository with +`WGISD dataset `_. + +.. code-block:: shell + + mkdir data ; cd data + git clone https://github.com/thsant/wgisd.git + cd wgisd + git checkout 6910edc5ae3aae8c20062941b1641821f0c30127 + + +This dataset contains images of grapevines with the annotation for different varieties of grapes. + +- ``CDY`` - Chardonnay +- ``CFR`` - Cabernet Franc +- ``CSV`` - Cabernet Sauvignon +- ``SVB`` - Sauvignon Blanc +- ``SYH`` - Syrah + +It's a great example to start with. The model achieves high accuracy right from the beginning of the training due to relatively large and focused objects. Also, these objects are distinguished by a person, so we can check inference results just by looking at images. + +| + +.. image:: ../../../../../utils/images/wgisd_gt_sample.jpg + :width: 600 + :alt: this image uploaded from this `source `_ + +| + +2. To run the training using :doc:`auto-configuration feature <../../../explanation/additional_features/auto_configuration>`, +we need to reformat the dataset according to this structure: + +.. code-block:: shell + + wgisd + ├── annotations/ + ├── instances_train.json + ├── instances_val.json + (Optional) + └── instances_test.json + ├──images/ + (The split on folders is optional) + ├── train + ├── val + └── test + (There may be more extra unrelated folders) + +We can do that by running these commands: + +.. code-block:: shell + + # format images folder + mv data images + + # format annotations folder + mv coco_annotations annotations + + # rename annotations to meet *_train.json pattern + mv annotations/train_bbox_instances.json annotations/instances_train.json + mv annotations/test_bbox_instances.json annotations/instances_val.json + cp annotations/instances_val.json annotations/instances_test.json + + cd ../.. 
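+
+Before moving on to training, it can be useful to sanity-check that the reformatted annotation files are valid COCO JSON. The snippet below is an optional helper; the paths assume the layout shown above:
+
+.. code-block:: python
+
+    import json
+    from pathlib import Path
+
+    root = Path("data/wgisd")
+    for split in ("train", "val", "test"):
+        ann_file = root / "annotations" / f"instances_{split}.json"
+        with open(ann_file) as f:
+            coco = json.load(f)
+        # Every COCO detection file should provide these three top-level keys
+        print(
+            f"{split}: {len(coco['images'])} images, "
+            f"{len(coco['annotations'])} annotations, "
+            f"{len(coco['categories'])} categories"
+        )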
+ +********* +Training +********* + +1. First of all, you need to choose which object detection model you want to train. +The list of supported templates for object detection is available with the command line below. + +.. note:: + + The characteristics and detailed comparison of the models could be found in :doc:`Explanation section <../../../explanation/algorithms/object_detection/object_detection>`. + + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx find --task DETECTION --pattern atss + ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Task ┃ Model Name ┃ Recipe Path ┃ + ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ DETECTION │ atss_mobilenetv2_tile │ src/otx/recipe/detection/atss_mobilenetv2_tile.yaml │ + │ DETECTION │ atss_r50_fpn │ src/otx/recipe/detection/atss_r50_fpn.yaml │ + │ DETECTION │ atss_resnext101 │ src/otx/recipe/detection/atss_resnext101.yaml │ + │ DETECTION │ atss_mobilenetv2 │ src/otx/recipe/detection/atss_mobilenetv2.yaml │ + └───────────┴───────────────────────┴────────────────────────────────────────────────────────────────┘ + + .. tab-item:: API + + .. code-block:: python + + from otx.engine.utils.api import list_models + + model_lists = list_models(task="DETECTION", pattern="atss") + print(model_lists) + ''' + [ + 'atss_r50_fpn', + 'atss_mobilenetv2', + 'atss_mobilenetv2_tile', + 'atss_resnext101', + ] + ''' + +.. _detection_workspace: + +2. On this step we will configure configuration +with: + +- all necessary configs for atss_mobilenetv2 +- train/validation sets, based on provided annotation. + +It may be counterintuitive, but for ``--data_root`` we need to pass the path to the dataset folder root (in our case it's ``data/wgisd``) instead of the folder with validation images. +This is because the function automatically detects annotations and images according to the expected folder structure we achieved above. + +Let's check the object detection configuration running the following command: + +.. code-block:: shell + + # or its config path + (otx) ...$ otx train --config src/otx/recipe/detection/atss_mobilenetv2.yaml --data_root data/wgisd --print_config + + ... + data_root: data/wgisd + work_dir: otx-workspace + callback_monitor: val/map_50 + disable_infer_num_classes: false + engine: + task: DETECTION + device: auto + data: + ... + +.. note:: + + If you want to get configuration as yaml file, please use ``--print_config`` parameter and ``> configs.yaml``. + + .. code-block:: shell + + (otx) ...$ otx train --config src/otx/recipe/detection/atss_mobilenetv2.yaml --data_root data/wgisd --print_config > configs.yaml + # Update configs.yaml & Train configs.yaml + (otx) ...$ otx train --config configs.yaml + + +3. ``otx train`` trains a model (a particular model template) +on a dataset and results: + +Here are the main outputs can expect with CLI: +- ``{work_dir}/{timestamp}/checkpoints/epoch_*.ckpt`` - a model checkpoint file. +- ``{work_dir}/{timestamp}/configs.yaml`` - The configuration file used in the training can be reused to reproduce the training. +- ``{work_dir}/.latest`` - The results of each of the most recently executed subcommands are soft-linked. This allows you to skip checkpoints and config file entry as a workspace. + +.. tab-set:: + + .. tab-item:: CLI (auto-config) + + .. code-block:: shell + + (otx) ...$ otx train --data_root data/wgisd + + .. tab-item:: CLI (with config) + + .. 
code-block:: shell
+
+            (otx) ...$ otx train --config src/otx/recipe/detection/atss_mobilenetv2.yaml --data_root data/wgisd
+
+    .. tab-item:: API (from_config)
+
+        .. code-block:: python
+
+            from otx.engine import Engine
+
+            data_root = "data/wgisd"
+            recipe = "src/otx/recipe/detection/atss_mobilenetv2.yaml"
+
+            engine = Engine.from_config(
+                      config_path=recipe,
+                      data_root=data_root,
+                      work_dir="otx-workspace",
+                  )
+
+            engine.train(...)
+
+    .. tab-item:: API
+
+        .. code-block:: python
+
+            from otx.engine import Engine
+
+            data_root = "data/wgisd"
+
+            engine = Engine(
+                        model="atss_mobilenetv2",
+                        data_root=data_root,
+                        work_dir="otx-workspace",
+                    )
+
+            engine.train(...)
+
+
+4. ``(Optional)`` Additionally, we can tune training parameters such as batch size, learning rate, patience epochs or warm-up iterations.
+Learn more about specific parameters using ``otx train --help -v`` or ``otx train --help -vv``.
+
+For example, to decrease the batch size to 4 and fix the number of epochs to 100, extend the command line above with the following options.
+
+.. tab-set::
+
+    .. tab-item:: CLI
+
+        .. code-block:: shell
+
+            (otx) ...$ otx train ... --data.config.train_subset.batch_size 4 \
+                                     --max_epochs 100
+
+    .. tab-item:: API
+
+        .. code-block:: python
+
+            from otx.core.config.data import DataModuleConfig, SubsetConfig
+            from otx.core.data.module import OTXDataModule
+            from otx.engine import Engine
+
+            data_config = DataModuleConfig(..., train_subset=SubsetConfig(..., batch_size=4))
+            datamodule = OTXDataModule(..., config=data_config)
+
+            engine = Engine(..., datamodule=datamodule)
+
+            engine.train(max_epochs=100)
+
+
+5. The training result ``checkpoints/*.ckpt`` file is located in the ``{work_dir}`` folder,
+while training logs can be found in the ``{work_dir}/{timestamp}`` dir.
+
+.. note::
+    We can also visualize the training using ``Tensorboard``, as these logs are located in ``{work_dir}/{timestamp}/tensorboard``.
+
+.. code-block::
+
+    otx-workspace
+    ├── outputs/
+        ├── 20240403_134256/
+            ├── csv/
+            ├── checkpoints/
+            |   └── epoch_*.ckpt
+            ├── tensorboard/
+            └── configs.yaml
+        └── .latest
+            └── train/
+    ...
+
+The training time depends heavily on the hardware characteristics; for example, on a single NVIDIA GeForce RTX 3090 the training took about 3 minutes.
+
+After that, we have the PyTorch object detection model trained with OpenVINO™ Training Extensions, which we can use for evaluation, export, optimization and deployment.
+
+***********
+Evaluation
+***********
+
+1. ``otx test`` runs evaluation of a
+trained model on a particular dataset.
+
+The test function receives the test annotation information and the model snapshot trained in the previous step.
+
+The default metric is the mAP_50 measure.
+
+2. That's how we can evaluate the snapshot in the ``otx-workspace``
+folder on the WGISD dataset and save results to ``otx-workspace``:
+
+.. tab-set::
+
+    .. tab-item:: CLI (with work_dir)
+
+        .. code-block:: shell
+
+            (otx) ...$ otx test --work_dir otx-workspace
+            ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+            ┃        Test metric        ┃       DataLoader 0        ┃
+            ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+            │       test/data_time      │    0.025369757786393166   │
+            │        test/map_50        │     0.8693901896476746    │
+            │       test/iter_time      │    0.08180806040763855    │
+            └───────────────────────────┴───────────────────────────┘
+
+    .. tab-item:: CLI (with config)
+
+        .. code-block:: shell
+
+            (otx) ...$ otx test --config src/otx/recipe/detection/atss_mobilenetv2.yaml \
+                                --data_root data/wgisd \
+                                --checkpoint otx-workspace/20240312_051135/checkpoints/epoch_033.ckpt
+            ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+            ┃        Test metric        ┃       DataLoader 0        ┃
+            ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+            │       test/data_time      │    0.025369757786393166   │
+            │        test/map_50        │     0.8693901896476746    │
+            │       test/iter_time      │    0.08180806040763855    │
+            └───────────────────────────┴───────────────────────────┘
+
+    .. tab-item:: API
+
+        .. code-block:: python
+
+            engine.test()
+
+
+3. The output of ``{work_dir}/{timestamp}/csv/version_0/metrics.csv`` consists of
+a dict with the target metric name and its value.
+
+
+*********
+Export
+*********
+
+1. ``otx export`` exports a trained PyTorch model to the OpenVINO™ Intermediate Representation (IR) format.
+This allows the model to run efficiently on Intel hardware, especially on CPU, using the OpenVINO™ runtime.
+The resulting IR model is also required to run PTQ optimization in the section below. An IR model consists of 2 files: ``exported_model.xml`` for the network structure and ``exported_model.bin`` for the weights.
+
+2. That's how we can export the trained model ``{work_dir}/{timestamp}/checkpoints/epoch_*.ckpt``
+from the previous section and save the exported model to the ``{work_dir}/{timestamp}/`` folder.
+
+.. tab-set::
+
+    .. tab-item:: CLI (with work_dir)
+
+        .. code-block:: shell
+
+            (otx) ...$ otx export --work_dir otx-workspace
+            ...
+            Elapsed time: 0:00:06.588245
+
+    .. tab-item:: CLI (with config)
+
+        .. code-block:: shell
+
+            (otx) ...$ otx export ... --checkpoint otx-workspace/20240312_051135/checkpoints/epoch_033.ckpt
+            ...
+            Elapsed time: 0:00:06.588245
+
+    .. tab-item:: API
+
+        .. code-block:: python
+
+            engine.export()
+
+
+3. We can check the accuracy of the IR model and the consistency between the exported model and the PyTorch model,
+using ``otx test`` and passing the IR model path to the ``--checkpoint`` parameter.
+
+.. tab-set::
+
+    .. tab-item:: CLI (with work_dir)
+
+        .. code-block:: shell
+
+            (otx) ...$ otx test --work_dir otx-workspace \
+                                --checkpoint otx-workspace/20240312_052847/exported_model.xml \
+                                --engine.device cpu
+            ...
+            ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+            ┃        Test metric        ┃       DataLoader 0        ┃
+            ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+            │          test/map         │     0.5444773435592651    │
+            │        test/map_50        │     0.8693901896476746    │
+            │        test/map_75        │     0.5761404037475586    │
+            │       test/map_large      │      0.561242401599884    │
+            │      test/map_medium      │     0.2926788330078125    │
+            │     test/map_per_class    │            -1.0           │
+            │       test/map_small      │            -1.0           │
+            │         test/mar_1        │    0.055956535041332245   │
+            │        test/mar_10        │    0.45759353041648865    │
+            │        test/mar_100       │     0.6809769868850708    │
+            │   test/mar_100_per_class  │            -1.0           │
+            │       test/mar_large      │     0.6932432055473328    │
+            │      test/mar_medium      │    0.46584922075271606    │
+            │       test/mar_small      │            -1.0           │
+            └───────────────────────────┴───────────────────────────┘
+
+    .. tab-item:: CLI (with config)
+
+        .. code-block:: shell
+
+            (otx) ...$ otx test --config src/otx/recipe/detection/atss_mobilenetv2.yaml \
+                                --data_root data/wgisd \
+                                --checkpoint otx-workspace/20240312_052847/exported_model.xml \
+                                --engine.device cpu
+            ...
+ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Test metric ┃ DataLoader 0 ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ test/map │ 0.5444773435592651 │ + │ test/map_50 │ 0.8693901896476746 │ + │ test/map_75 │ 0.5761404037475586 │ + │ test/map_large │ 0.561242401599884 │ + │ test/map_medium │ 0.2926788330078125 │ + │ test/map_per_class │ -1.0 │ + │ test/map_small │ -1.0 │ + │ test/mar_1 │ 0.055956535041332245 │ + │ test/mar_10 │ 0.45759353041648865 │ + │ test/mar_100 │ 0.6809769868850708 │ + │ test/mar_100_per_class │ -1.0 │ + │ test/mar_large │ 0.6932432055473328 │ + │ test/mar_medium │ 0.46584922075271606 │ + │ test/mar_small │ -1.0 │ + └───────────────────────────┴───────────────────────────┘ + + .. tab-item:: API + + .. code-block:: python + + exported_model = engine.export() + engine.test(checkpoint=exported_model) + + +4. ``Optional`` Additionally, we can tune confidence threshold via the command line. +Learn more about template-specific parameters using ``otx export --help``. + +For example, If you want to get the ONNX model format you can run it like below. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx export ... --checkpoint otx-workspace/20240312_051135/checkpoints/epoch_033.ckpt --export_format ONNX + + .. tab-item:: API + + .. code-block:: python + + engine.export(..., export_format="ONNX") + +If you also want to export ``saliency_map``, a feature related to explain, and ``feature_vector`` information for XAI, you can do the following. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx export ... --checkpoint otx-workspace/20240312_051135/checkpoints/epoch_033.ckpt --explain True + + .. tab-item:: API + + .. code-block:: python + + engine.export(..., explain=True) + + +************* +Optimization +************* + +1. We can further optimize the model with ``otx optimize``. +It uses PTQ depending on the model and transforms it to ``INT8`` format. + +``PTQ`` optimization is used for models exported in the OpenVINO™ IR format. It decreases the floating-point precision to integer precision of the exported model by performing the post-training optimization. + +To learn more about optimization, refer to `NNCF repository `_. + +2. Command example for optimizing OpenVINO™ model (.xml) +with OpenVINO™ PTQ. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx optimize --work_dir otx-workspace \ + --checkpoint otx-workspace/20240312_052847/exported_model.xml + + ... + Statistics collection ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 30/30 • 0:00:14 • 0:00:00 + Applying Fast Bias correction ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 58/58 • 0:00:02 • 0:00:00 + Elapsed time: 0:00:24.958733 + + .. tab-item:: API + + .. code-block:: python + + ckpt_path = "otx-workspace/20240312_052847/exported_model.xml" + engine.optimize(checkpoint=ckpt_path) + + +The optimization time highly relies on the hardware characteristics, for example on 1 NVIDIA GeForce RTX 3090 it took about 10 minutes. +Please note, that PTQ will take some time without logging to optimize the model. + +3. Finally, we can also evaluate the optimized model by passing +it to the ``otx test`` function. + +.. tab-set:: + + .. tab-item:: CLI + + .. 
code-block:: shell
+
+            (otx) ...$ otx test --work_dir otx-workspace \
+                                --checkpoint otx-workspace/20240312_055042/optimized_model.xml \
+                                --engine.device cpu
+
+            ...
+            ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+            ┃        Test metric        ┃       DataLoader 0        ┃
+            ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+            │        test/map_50        │     0.8693901896476746    │
+            └───────────────────────────┴───────────────────────────┘
+            Elapsed time: 0:00:10.260521
+
+    .. tab-item:: API
+
+        .. code-block:: python
+
+            ckpt_path = "otx-workspace/20240312_055042/optimized_model.xml"
+            engine.test(checkpoint=ckpt_path)
+
+Now we have a fully trained, optimized and exported object detection model that is ready to use.
diff --git a/docs/source/guide/tutorials/base/how_to_train/index.rst b/docs/source/guide/tutorials/base/how_to_train/index.rst
new file mode 100644
index 00000000000..edf5816d514
--- /dev/null
+++ b/docs/source/guide/tutorials/base/how_to_train/index.rst
@@ -0,0 +1,56 @@
+How to train, validate, export and optimize the model
+================================================================
+
+.. grid:: 1 2 2 3
+   :margin: 1 1 0 0
+   :gutter: 1
+
+   .. grid-item-card:: Classification
+      :link: classification
+      :link-type: doc
+      :text-align: center
+
+   .. grid-item-card:: Detection
+      :link: detection
+      :link-type: doc
+      :text-align: center
+
+   .. grid-item-card:: Instance Segmentation
+      :link: instance_segmentation
+      :link-type: doc
+      :text-align: center
+
+   .. grid-item-card:: Semantic Segmentation
+      :link: semantic_segmentation
+      :link-type: doc
+      :text-align: center
+
+   .. grid-item-card:: Anomaly Task
+      :link: anomaly_detection
+      :link-type: doc
+      :text-align: center
+
+   .. grid-item-card:: Action Classification
+      :link: action_classification
+      :link-type: doc
+      :text-align: center
+
+   .. grid-item-card:: Action Detection
+      :link: action_detection
+      :link-type: doc
+      :text-align: center
+
+   .. grid-item-card:: Visual Prompting
+      :text-align: center
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   classification
+   detection
+   instance_segmentation
+   semantic_segmentation
+   anomaly_detection
+   action_classification
+   action_detection
diff --git a/docs/source/guide/tutorials/base/how_to_train/instance_segmentation.rst b/docs/source/guide/tutorials/base/how_to_train/instance_segmentation.rst
new file mode 100644
index 00000000000..69e699ac2ae
--- /dev/null
+++ b/docs/source/guide/tutorials/base/how_to_train/instance_segmentation.rst
@@ -0,0 +1,479 @@
+Instance Segmentation model
+================================
+
+This tutorial walks through an end-to-end workflow for the instance segmentation task, from installation to model export and optimization, on a specific example.
+On this page, we show how to train, validate, export and optimize a Mask-RCNN model on a toy dataset.
+
+To learn more about the Instance Segmentation task, refer to :doc:`../../../explanation/algorithms/segmentation/instance_segmentation`.
+
+
+.. note::
+
+  To learn more about managing the training process of the model, including additional parameters and their modification, refer to :doc:`./detection`.
+
+The process has been tested on the following configuration.
+
+- Ubuntu 20.04
+- NVIDIA GeForce RTX 3090
+- Intel(R) Core(TM) i9-11900
+- CUDA Toolkit 11.8
+
+*************************
+Setup virtual environment
+*************************
+
+1. You can follow the installation process from a :doc:`quick start guide <../../../get_started/installation>`
+to create a universal virtual environment for OpenVINO™ Training Extensions.
+
+2. Activate your virtual
+environment:
+
+.. code-block:: shell
+
+    source .otx/bin/activate
+    # or by this line, if you created an environment, using tox
+    . venv/otx/bin/activate
+
+
+***************************
+Dataset preparation
+***************************
+
+.. note::
+
+  Currently, we support the following instance segmentation dataset formats:
+
+  - `COCO `_
+
+
+1. Clone a repository with
+`WGISD dataset `_.
+
+.. code-block:: shell
+
+    mkdir data ; cd data
+    git clone https://github.com/thsant/wgisd.git
+    cd wgisd
+    git checkout 6910edc5ae3aae8c20062941b1641821f0c30127
+
+
+This dataset contains images of grapevines with annotations for different varieties of grapes.
+
+- ``CDY`` - Chardonnay
+- ``CFR`` - Cabernet Franc
+- ``CSV`` - Cabernet Sauvignon
+- ``SVB`` - Sauvignon Blanc
+- ``SYH`` - Syrah
+
+|
+
+.. image:: ../../../../../utils/images/wgisd_dataset_sample.jpg
+  :width: 600
+  :alt: this image uploaded from this `source `_
+
+|
+
+2. Check the file structure of the downloaded dataset;
+we will need the following file structure:
+
+.. code-block:: shell
+
+    wgisd
+    ├── annotations/
+        ├── instances_train.json
+        ├── instances_val.json
+        (Optional)
+        └── instances_test.json
+    ├── images/
+        (Optional)
+        ├── train
+        ├── val
+        └── test
+    (There may be more extra unrelated folders)
+
+We can reorganize the dataset to match it by running these commands:
+
+.. code-block:: shell
+
+    # format images folder
+    mv data images
+
+    # format annotations folder
+    mv coco_annotations annotations
+
+    # rename annotations to meet *_train.json pattern
+    mv annotations/train_polygons_instances.json annotations/instances_train.json
+    mv annotations/test_polygons_instances.json annotations/instances_val.json
+    cp annotations/instances_val.json annotations/instances_test.json
+
+    cd ../..
+
+.. note::
+  We can also use this dataset in the detection tutorial; refer to :doc:`./detection`.
+
+*********
+Training
+*********
+
+1. First of all, you need to choose which instance segmentation model you want to train.
+The list of supported templates for instance segmentation is available with the command line below.
+
+.. note::
+
+  The characteristics and detailed comparison of the models could be found in :doc:`Explanation section <../../../explanation/algorithms/segmentation/instance_segmentation>`.
+
+
+.. tab-set::
+
+    .. tab-item:: CLI
+
+        ..
code-block:: shell + + (otx) ...$ otx find --task INSTANCE_SEGMENTATION + + ┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Task ┃ Model Name ┃ Recipe Path ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ INSTANCE_SEGMENTATION │ openvino_model │ src/otx/recipe/instance_segmentation/openvino_model.yaml │ + │ INSTANCE_SEGMENTATION │ maskrcnn_r50 │ src/otx/recipe/instance_segmentation/maskrcnn_r50.yaml │ + │ INSTANCE_SEGMENTATION │ maskrcnn_r50_tile │ src/otx/recipe/instance_segmentation/maskrcnn_r50_tile.yaml │ + │ INSTANCE_SEGMENTATION │ maskrcnn_swint │ src/otx/recipe/instance_segmentation/maskrcnn_swint.yaml │ + │ INSTANCE_SEGMENTATION │ maskrcnn_efficientnetb2b │ src/otx/recipe/instance_segmentation/maskrcnn_efficientnetb2b.yaml │ + │ INSTANCE_SEGMENTATION │ rtmdet_inst_tiny │ src/otx/recipe/instance_segmentation/rtmdet_inst_tiny.yaml │ + │ INSTANCE_SEGMENTATION │ maskrcnn_efficientnetb2b_tile │ src/otx/recipe/instance_segmentation/maskrcnn_efficientnetb2b_tile.yaml │ + │ INSTANCE_SEGMENTATION │ maskrcnn_swint_tile │ src/otx/recipe/instance_segmentation/maskrcnn_swint_tile.yaml │ + └───────────────────────┴───────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────┘ + + .. tab-item:: API + + .. code-block:: python + + from otx.engine.utils.api import list_models + + model_lists = list_models(task="INSTANCE_SEGMENTATION") + print(model_lists) + ''' + [ + 'maskrcnn_swint', + 'maskrcnn_r50', + 'maskrcnn_r50_tile', + 'rtmdet_inst_tiny', + 'maskrcnn_swint_tile', + 'maskrcnn_efficientnetb2b_tile', + 'openvino_model', + 'maskrcnn_efficientnetb2b', + ] + ''' + +2. On this step we will configure configuration +with: + +- all necessary configs for maskrcnn_r50 +- train/validation sets, based on provided annotation. + +It may be counterintuitive, but for ``--data_root`` we need to pass the path to the dataset folder root (in our case it's ``data/wgisd``) instead of the folder with validation images. +This is because the function automatically detects annotations and images according to the expected folder structure we achieved above. + +Let's check the object detection configuration running the following command: + +.. code-block:: shell + + # or its config path + (otx) ...$ otx train --config src/otx/recipe/instance_segmentation/maskrcnn_r50.yaml --data_root data/wgisd --print_config + + ... + data_root: data/wgisd + work_dir: otx-workspace + callback_monitor: val/map_50 + disable_infer_num_classes: false + engine: + task: INSTANCE_SEGMENTATION + device: auto + data: + ... + +.. note:: + + If you want to get configuration as yaml file, please use ``--print_config`` parameter and ``> configs.yaml``. + + .. code-block:: shell + + (otx) ...$ otx train --config src/otx/recipe/instance_segmentation/maskrcnn_r50.yaml --data_root data/wgisd --print_config > configs.yaml + # Update configs.yaml & Train configs.yaml + (otx) ...$ otx train --config configs.yaml + +3. To start training we need to call ``otx train`` + +Here are the main outputs can expect with CLI: +- ``{work_dir}/{timestamp}/checkpoints/epoch_*.ckpt`` - a model checkpoint file. +- ``{work_dir}/{timestamp}/configs.yaml`` - The configuration file used in the training can be reused to reproduce the training. 
+- ``{work_dir}/.latest`` - The results of each of the most recently executed subcommands are soft-linked. This allows you to skip checkpoints and config file entry as a workspace. + +.. tab-set:: + + .. tab-item:: CLI (auto-config) + + .. code-block:: shell + + (otx) ...$ otx train --data_root data/wgisd --task INSTANCE_SEGMENTATION + + .. tab-item:: CLI (with config) + + .. code-block:: shell + + (otx) ...$ otx train --config src/otx/recipe/instance_segmentation/maskrcnn_r50.yaml --data_root data/wgisd + + .. tab-item:: API (from_config) + + .. code-block:: python + + from otx.engine import Engine + + data_root = "data/wgisd" + recipe = "src/otx/recipe/instance_segmentation/maskrcnn_r50.yaml" + + engine = Engine.from_config( + config_path=recipe, + data_root=data_root, + work_dir="otx-workspace", + ) + + engine.train(...) + + .. tab-item:: API + + .. code-block:: python + + from otx.engine import Engine + + data_root = "data/wgisd" + + engine = Engine( + model="maskrcnn_r50", + task="INSTANCE_SEGMENTATION", + data_root=data_root, + work_dir="otx-workspace", + ) + + engine.train(...) + +.. note:: + + Because the dataset structure is mostly the same as detection, INSTANCE_SEGMENTATION requires the task type to be specified to enable auto-configuration. + +The training time highly relies on the hardware characteristics, for example on 1 NVIDIA GeForce RTX 3090 the training took about 10 minutes with full dataset. + +4. ``(Optional)`` Additionally, we can tune training parameters such as batch size, learning rate, patience epochs or warm-up iterations. +Learn more about template-specific parameters using ``otx train params --help``. + +It can be done by manually updating parameters in the ``template.yaml`` file in your workplace or via the command line. + +For example, to decrease the batch size to 4, fix the number of epochs to 100 and disable early stopping, extend the command line above with the following line. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train ... --data.config.train_subset.batch_size 4 \ + --max_epochs 100 + + .. tab-item:: API + + .. code-block:: python + + from otx.core.config.data import DataModuleConfig, SubsetConfig + from otx.core.data.module import OTXDataModule + from otx.engine import Engine + + data_config = DataModuleConfig(..., train_subset=SubsetConfig(..., batch_size=4)) + datamodule = OTXDataModule(..., config=data_config) + + engine = Engine(..., datamodule=datamodule) + + engine.train(max_epochs=100) + + +5. The training result ``checkpoints/*.ckpt`` file is located in ``{work_dir}`` folder, +while training logs can be found in the ``{work_dir}/{timestamp}`` dir. + +.. note:: + We also can visualize the training using ``Tensorboard`` as these logs are located in ``{work_dir}/{timestamp}/tensorboard``. + +.. code-block:: + + otx-workspace + └── outputs/ + ├── 20240403_134256/ + | ├── csv/ + | ├── checkpoints/ + | | └── epoch_*.pth + | ├── tensorboard/ + | └── configs.yaml + └── .latest + └── train/ + ... + +After that, we have the PyTorch instance segmentation model trained with OpenVINO™ Training Extensions, which we can use for evaluation, export, optimization and deployment. + +*********** +Validation +*********** + +1. ``otx test`` runs evaluation of a trained +model on a specific dataset. + +The test function receives test annotation information and model snapshot, trained in the previous step. + +``otx test`` will output a mAP_50 for instance segmentation. + +2. 
The command below will run validation on our dataset +and save performance results in ``otx-workspace``: + +.. tab-set:: + + .. tab-item:: CLI (with work_dir) + + .. code-block:: shell + + (otx) ...$ otx test --work_dir otx-workspace + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Test metric ┃ DataLoader 0 ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ test/data_time │ 0.0007903117220848799 │ + │ test/iter_time │ 0.062202490866184235 │ + │ test/map │ 0.33679962158203125 │ + │ test/map_50 │ 0.5482384562492371 │ + │ test/map_75 │ 0.37118086218833923 │ + └───────────────────────────┴───────────────────────────┘ + + .. tab-item:: CLI (with config) + + .. code-block:: shell + + (otx) ...$ otx test --config src/otx/recipe/instance_segmentation/maskrcnn_r50.yaml \ + --data_root data/wgisd \ + --checkpoint otx-workspace/20240312_051135/checkpoints/epoch_059.ckpt + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Test metric ┃ DataLoader 0 ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ test/data_time │ 0.0007903117220848799 │ + │ test/iter_time │ 0.062202490866184235 │ + │ test/map │ 0.33679962158203125 │ + │ test/map_50 │ 0.5482384562492371 │ + │ test/map_75 │ 0.37118086218833923 │ + └───────────────────────────┴───────────────────────────┘ + + .. tab-item:: API + + .. code-block:: python + + engine.test() + + +3. The output of ``{work_dir}/{timestamp}/csv/version_0/metrics.csv`` consists of +a dict with target metric name and its value. + + +********* +Export +********* + +1. ``otx export`` exports a trained Pytorch `.pth` model to the +OpenVINO™ Intermediate Representation (IR) format. + +It allows running the model on the Intel hardware much more efficient, especially on the CPU. Also, the resulting IR model is required to run PTQ optimization. IR model consists of 2 files: ``exported_model.xml`` for weights and ``exported_model.bin`` for architecture. + +2. We can run the below command line to export the trained model +and save the exported model to the ``{work_dir}/{timestamp}/`` folder. + +.. tab-set:: + + .. tab-item:: CLI (with work_dir) + + .. code-block:: shell + + (otx) ...$ otx export --work_dir otx-workspace + ... + Elapsed time: 0:00:06.588245 + + .. tab-item:: CLI (with config) + + .. code-block:: shell + + (otx) ...$ otx export ... --checkpoint otx-workspace/20240312_051135/checkpoints/epoch_033.ckpt + ... + Elapsed time: 0:00:06.588245 + + .. tab-item:: API + + .. code-block:: python + + engine.export() + + +************* +Optimization +************* + +1. We can further optimize the model with ``otx optimize``. +It uses NNCF or PTQ depending on the model and transforms it to ``INT8`` format. + +Please, refer to :doc:`optimization explanation <../../../explanation/additional_features/models_optimization>` section to get the intuition of what we use under the hood for optimization purposes. + +2. Command example for optimizing +OpenVINO™ model (.xml) with OpenVINO™ PTQ. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx optimize --work_dir otx-workspace \ + --checkpoint otx-workspace/20240312_052847/exported_model.xml + + ... 
+ Statistics collection ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 30/30 • 0:00:14 • 0:00:00 + Applying Fast Bias correction ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 58/58 • 0:00:02 • 0:00:00 + Elapsed time: 0:00:24.958733 + + .. tab-item:: API + + .. code-block:: python + + ckpt_path = "otx-workspace/20240312_052847/exported_model.xml" + engine.optimize(checkpoint=ckpt_path) + +Please note, that PTQ will take some time (generally less than NNCF optimization) without logging to optimize the model. + +3. Finally, we can also evaluate the optimized model by passing +it to the ``otx test`` function. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx test --work_dir otx-workspace \ + --checkpoint otx-workspace/20240312_055042/optimized_model.xml \ + --engine.device cpu + + ... + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ Test metric ┃ DataLoader 0 ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ test/map_50 │ 0.5482384562492371 │ + └───────────────────────────┴───────────────────────────┘ + Elapsed time: 0:00:10.260521 + + .. tab-item:: API + + .. code-block:: python + + ckpt_path = "otx-workspace/20240312_055042/optimized_model.xml" + engine.test(checkpoint=ckpt_path) + +3. Now we have fully trained, optimized and exported an +efficient model representation ready-to-use instance segmentation model. diff --git a/docs/source/guide/tutorials/base/how_to_train/semantic_segmentation.rst b/docs/source/guide/tutorials/base/how_to_train/semantic_segmentation.rst new file mode 100644 index 00000000000..07a9731940b --- /dev/null +++ b/docs/source/guide/tutorials/base/how_to_train/semantic_segmentation.rst @@ -0,0 +1,251 @@ +Semantic Segmentation model +================================ + +This tutorial demonstrates how to train and optimize a semantic segmentation model using the VOC2012 dataset from the PASCAL Visual Object Classes Challenge 2012. +The trained model will be used to segment images by assigning a label to each pixel of the input image. + +To learn more about Segmentation task, refer to :doc:`../../../explanation/algorithms/segmentation/semantic_segmentation`. + +.. note:: + To learn more about managing the training process of the model including additional parameters and its modification, refer to :doc:`./detection`. + +The process has been tested on the following configuration. + +- Ubuntu 20.04 +- NVIDIA GeForce RTX 3090 +- Intel(R) Core(TM) i9-11900 +- CUDA Toolkit 11.8 + +************************* +Setup virtual environment +************************* + +1. You can follow the installation process from a :doc:`quick start guide <../../../get_started/installation>` +to create a universal virtual environment for OpenVINO™ Training Extensions. + +2. Activate your virtual +environment: + +.. code-block:: shell + + .otx/bin/activate + # or by this line, if you created an environment, using tox + . venv/otx/bin/activate + +*************************** +Dataset preparation +*************************** + +Download and prepare `VOC2012 dataset `_ with the following command: + +.. code-block:: + + cd data + wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar + tar -xvf VOCtrainval_11-May-2012.tar + cd .. + +.. 
image:: ../../../../../utils/images/voc_example.png + :width: 600 + +The dataset contains a set of RGB images with 20 semantic labels such as aeroplane, bicycle, bird, car, person, etc. The images are stored in the following format: + +.. code-block:: + + VOC2012 + ├── Annotations + ├── ImageSets + ├── JPEGImages + ├── SegmentationClass + ├── SegmentationObject + + + +********* +Training +********* + +1. First of all, you need to choose which semantic segmentation model you want to train. +The list of supported templates for semantic segmentation is available with the command line below. + +.. note:: + + The characteristics and detailed comparison of the models could be found in :doc:`Explanation section <../../../explanation/algorithms/segmentation/semantic_segmentation>`. + + We also can modify the architecture of supported models with various backbones, please refer to the :doc:`advanced tutorial for model customization <../../advanced/backbones>`. + +.. code-block:: + + (otx) ...$ otx find --task segmentation + + +--------------+-----------------------------------------------------+--------------------+------------------------------------------------------------------------------+ + | TASK | ID | NAME | BASE PATH | + +--------------+-----------------------------------------------------+--------------------+------------------------------------------------------------------------------+ + | SEGMENTATION | Custom_Semantic_Segmentation_Lite-HRNet-18_OCR | Lite-HRNet-18 | src/otx/algorithms/segmentation/configs/ocr_lite_hrnet_18/template.yaml | + | SEGMENTATION | Custom_Semantic_Segmentation_Lite-HRNet-18-mod2_OCR | Lite-HRNet-18-mod2 | src/otx/algorithms/segmentation/configs/ocr_lite_hrnet_18_mod2/template.yaml | + | SEGMENTATION | Custom_Semantic_Segmentation_Lite-HRNet-s-mod2_OCR | Lite-HRNet-s-mod2 | src/otx/algorithms/segmentation/configs/ocr_lite_hrnet_s_mod2/template.yaml | + | SEGMENTATION | Custom_Semantic_Segmentation_Lite-HRNet-x-mod3_OCR | Lite-HRNet-x-mod3 | src/otx/algorithms/segmentation/configs/ocr_lite_hrnet_x_mod3/template.yaml | + +--------------+-----------------------------------------------------+--------------------+------------------------------------------------------------------------------+ + +.. note:: + + We do not attach an OCR head for supported models in default. We remain the suffix '_OCR' in ID just for backward compatibility. + +To have a specific example in this tutorial, all commands will be run on the :ref:`Lite-HRNet-18-mod2 ` model. It's a light model, that achieves competitive accuracy while keeping the inference fast. + + +2. Next, we need to create train/validation sets. +OpenVINO™ Training Extensions supports auto-split functionality for semantic segmentation. + +.. note:: + + Currently, OpenVINO™ Training Extensions supports auto-split only for public VOC dataset format in semantic segmentation. We should specify the validation roots in the argument ``--val-data-roots`` when using other supported segmentation dataset. To learn about dataset formats for semantic segmentation, please refer to the :doc:`explanation section <../../../explanation/algorithms/segmentation/semantic_segmentation>`. + +Let's prepare an OpenVINO™ Training Extensions semantic segmentation workspace running the following command: + +.. code-block:: + + (otx) ...$ otx build --train-data-roots data/VOCdevkit/VOC2012 --model Lite-HRNet-18-mod2 + + [*] Load Model Template ID: Custom_Semantic_Segmentation_Lite-HRNet-18-mod2_OCR + [*] Load Model Name: Lite-HRNet-18-mod2 + + ... 
+ + [*] Update data configuration file to: otx-workspace-SEGMENTATION/data.yaml + + (otx) ...$ cd ./otx-workspace-SEGMENTATION + +It will create **otx-workspace-SEGMENTATION** with all necessary configs for Lite-HRNet-18-mod2, prepared ``data.yaml`` to simplify CLI commands launch and splitted dataset. + +3. To start training we need to call ``otx train`` +command in our workspace: + +.. code-block:: + + (otx) ...$ otx train + +That's it! The training will return artifacts: ``weights.pth`` and ``label_schema.json``, which are needed as input for the further commands: ``export``, ``eval``, ``optimize``, etc. + +After that, we have the PyTorch model trained with OpenVINO™ Training Extensions, which we can use for evaluation, export, optimization and deployment. + +*********** +Validation +*********** + +1. ``otx eval`` runs evaluation of a trained +model on a specific dataset. +The eval function receives test annotation information and model snapshot, trained in the previous step. +Please note, ``label_schema.json`` file contains meta information about the dataset and it should be located in the same folder as the model snapshot. + +``otx eval`` will output a ``mDice`` score for semantic segmentation. + +2. The command below will run validation on our splitted dataset. We can use other test dataset as well by specifying the path where test data exists in argument ``--test-data-roots``. +By running this example command, the performance results evaluated by our splitted validation dataset are saved in ``performance.json`` file: + +.. code-block:: + + (otx) ...$ otx eval --test-data-roots splitted_dataset/val \ + --load-weights models/weights.pth \ + --output outputs + +Finally, we get the validation output: + +.. code-block:: + + ... + + 2023-02-21 18:09:56,134 | INFO : run task done. + 2023-02-21 18:09:57,807 | INFO : called evaluate() + 2023-02-21 18:09:57,807 | INFO : Computing mDice + 2023-02-21 18:09:58,508 | INFO : mDice after evaluation: 0.9659400544959128 + Performance(score: 0.9659400544959128, dashboard: (1 metric groups)) + +In ``outputs/performance.json`` file, the validation output score is saved as: + +.. code-block:: + + {"Dice Average": 0.9659400544959128} + + +********* +Export +********* + +1. ``otx export`` exports a trained Pytorch `.pth` model to the OpenVINO™ Intermediate Representation (IR) format. +It allows running the model on the Intel hardware much more efficient, especially on the CPU. Also, the resulting IR model is required to run PTQ optimization. IR model consists of 2 files: ``openvino.xml`` for weights and ``openvino.bin`` for architecture. + +2. We can run the below command line to export the trained model +and save the exported model to the ``openvino_model`` folder. + +.. code-block:: + + (otx) ...$ otx export --load-weights models/weights.pth \ + --output openvino_model + + ... + + 2023-02-02 03:23:03,057 | INFO : run task done. + 2023-02-02 03:23:03,064 | INFO : Exporting completed + + +3. We can check the ``mDice`` score of the IR model and the consistency between the exported model and the PyTorch model, +using ``otx eval`` and passing the IR model path to the ``--load-weights`` parameter. + +.. code-block:: + + (otx) ...$ otx eval --test-data-roots splitted_dataset/val \ + --load-weights openvino_model/openvino.xml \ + --output openvino_model + + ... + + Performance(score: 0.9659400544959128, dashboard: (1 metric groups)) + + +************* +Optimization +************* + +1. We can further optimize the model with ``otx optimize``. 
+It uses NNCF or PTQ depending on the model and transforms it to ``INT8`` format. + +Please, refer to :doc:`optimization explanation <../../../explanation/additional_features/models_optimization>` section to get the intuition of what we use under the hood for optimization purposes. + +2. Command example for optimizing +a PyTorch model (`.pth`) with OpenVINO™ NNCF. + +.. code-block:: + + (otx) ...$ otx optimize --load-weights models/weights.pth --output nncf_model + + ... + + INFO:nncf:Loaded 5286/5286 parameters + 2023-02-21 18:09:56,134 | INFO : run task done. + 2023-02-21 18:09:57,807 | INFO : called evaluate() + 2023-02-21 18:09:57,807 | INFO : Computing mDice + 2023-02-21 18:09:58,508 | INFO : mDice after evaluation: 0.9659400544959128 + Performance(score: 0.9659400544959128, dashboard: (1 metric groups)) + +The optimization time relies on the hardware characteristics, for example on 1 NVIDIA GeForce RTX 3090 and Intel(R) Core(TM) i9-10980XE it took about 15 minutes. + +3. Command example for optimizing +OpenVINO™ model (.xml) with OpenVINO™ PTQ. + +.. code-block:: + + (otx) ...$ otx optimize --load-weights openvino_model/openvino.xml \ + --output ptq_model + + ... + + Performance(score: 0.9577656675749319, dashboard: (1 metric groups)) + +Please note, that PTQ will take some time (generally less than NNCF optimization) without logging to optimize the model. + +4. Now we have fully trained, optimized and exported an +efficient model representation ready-to-use semantic segmentation model. + +The following tutorials provide further steps on how to :doc:`deploy <../deploy>` and use your model in the :doc:`demonstration mode <../demo>` and visualize results. +The examples are provided with an object detection model, but it is easy to apply them for semantic segmentation by substituting the object detection model with segmentation one. \ No newline at end of file diff --git a/docs/source/guide/tutorials/base/index.rst b/docs/source/guide/tutorials/base/index.rst new file mode 100644 index 00000000000..a852f3f6517 --- /dev/null +++ b/docs/source/guide/tutorials/base/index.rst @@ -0,0 +1,22 @@ +Base Tutorials +============== + +.. grid:: + :gutter: 1 + + .. grid-item-card:: Train to Export Model + :link: how_to_train/index + :link-type: doc + :text-align: center + + .. grid-item-card:: Explain Model + :link: explain + :link-type: doc + :text-align: center + +.. toctree:: + :maxdepth: 2 + :hidden: + + how_to_train/index + explain diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000000..40a5d5809a3 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,15 @@ +Welcome to Intel OpenVINO Training Extensions's develop documentation! +====================================================================== + +.. 
toctree:: + :hidden: + + guide/index + +****************** +Indices and tables +****************** + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/utils/images/anomaly_tasks.png b/docs/utils/images/anomaly_tasks.png new file mode 100644 index 00000000000..d576c04622f Binary files /dev/null and b/docs/utils/images/anomaly_tasks.png differ diff --git a/docs/utils/images/car_tree_bug_gt_sample.png b/docs/utils/images/car_tree_bug_gt_sample.png new file mode 100644 index 00000000000..fb66333c6c4 Binary files /dev/null and b/docs/utils/images/car_tree_bug_gt_sample.png differ diff --git a/docs/utils/images/dota_tiling_example.jpg b/docs/utils/images/dota_tiling_example.jpg new file mode 100644 index 00000000000..b260de0b3fd Binary files /dev/null and b/docs/utils/images/dota_tiling_example.jpg differ diff --git a/docs/utils/images/draem.png b/docs/utils/images/draem.png new file mode 100644 index 00000000000..a791020b63b Binary files /dev/null and b/docs/utils/images/draem.png differ diff --git a/docs/utils/images/explain_wgisd.png b/docs/utils/images/explain_wgisd.png new file mode 100644 index 00000000000..26069336840 Binary files /dev/null and b/docs/utils/images/explain_wgisd.png differ diff --git a/docs/utils/images/flowers.jpg b/docs/utils/images/flowers.jpg new file mode 100644 index 00000000000..cfc2653b2b2 Binary files /dev/null and b/docs/utils/images/flowers.jpg differ diff --git a/docs/utils/images/flowers_example.jpg b/docs/utils/images/flowers_example.jpg new file mode 100644 index 00000000000..07b6bece429 Binary files /dev/null and b/docs/utils/images/flowers_example.jpg differ diff --git a/docs/utils/images/instance_seg_example.png b/docs/utils/images/instance_seg_example.png new file mode 100644 index 00000000000..c3fc68c786a Binary files /dev/null and b/docs/utils/images/instance_seg_example.png differ diff --git a/docs/utils/images/label_tree.png b/docs/utils/images/label_tree.png new file mode 100644 index 00000000000..59511ec0af2 Binary files /dev/null and b/docs/utils/images/label_tree.png differ diff --git a/docs/utils/images/otx-logo-black.png b/docs/utils/images/otx-logo-black.png new file mode 100644 index 00000000000..e58e2ab4dbf Binary files /dev/null and b/docs/utils/images/otx-logo-black.png differ diff --git a/docs/utils/images/otx-logo.png b/docs/utils/images/otx-logo.png new file mode 100644 index 00000000000..d7d843752f1 Binary files /dev/null and b/docs/utils/images/otx-logo.png differ diff --git a/docs/utils/images/padim.png b/docs/utils/images/padim.png new file mode 100644 index 00000000000..a7b3d7c6081 Binary files /dev/null and b/docs/utils/images/padim.png differ diff --git a/docs/utils/images/semantic_seg_example.png b/docs/utils/images/semantic_seg_example.png new file mode 100644 index 00000000000..b010818a50e Binary files /dev/null and b/docs/utils/images/semantic_seg_example.png differ diff --git a/docs/utils/images/stfpm.png b/docs/utils/images/stfpm.png new file mode 100644 index 00000000000..6d135663f2a Binary files /dev/null and b/docs/utils/images/stfpm.png differ diff --git a/docs/utils/images/voc_example.png b/docs/utils/images/voc_example.png new file mode 100644 index 00000000000..b306aa73ef1 Binary files /dev/null and b/docs/utils/images/voc_example.png differ diff --git a/docs/utils/images/vpm_ref_prediction.png b/docs/utils/images/vpm_ref_prediction.png new file mode 100644 index 00000000000..04d6bfa008d Binary files /dev/null and b/docs/utils/images/vpm_ref_prediction.png differ diff --git 
a/docs/utils/images/vpm_ref_result.png b/docs/utils/images/vpm_ref_result.png new file mode 100644 index 00000000000..d652d0a5dcd Binary files /dev/null and b/docs/utils/images/vpm_ref_result.png differ diff --git a/docs/utils/images/vpm_tgt_prediction.png b/docs/utils/images/vpm_tgt_prediction.png new file mode 100644 index 00000000000..c258a6ca07e Binary files /dev/null and b/docs/utils/images/vpm_tgt_prediction.png differ diff --git a/docs/utils/images/wgisd_dataset_sample.jpg b/docs/utils/images/wgisd_dataset_sample.jpg new file mode 100644 index 00000000000..3bf361dc113 Binary files /dev/null and b/docs/utils/images/wgisd_dataset_sample.jpg differ diff --git a/docs/utils/images/wgisd_gt_sample.jpg b/docs/utils/images/wgisd_gt_sample.jpg new file mode 100644 index 00000000000..8ee564f8a25 Binary files /dev/null and b/docs/utils/images/wgisd_gt_sample.jpg differ diff --git a/docs/utils/images/wgisd_pr2_sample.jpg b/docs/utils/images/wgisd_pr2_sample.jpg new file mode 100644 index 00000000000..b5846906759 Binary files /dev/null and b/docs/utils/images/wgisd_pr2_sample.jpg differ diff --git a/docs/utils/images/wgisd_pr_sample.jpg b/docs/utils/images/wgisd_pr_sample.jpg new file mode 100644 index 00000000000..0841dea4867 Binary files /dev/null and b/docs/utils/images/wgisd_pr_sample.jpg differ diff --git a/docs/utils/images/xai_cls.jpg b/docs/utils/images/xai_cls.jpg new file mode 100644 index 00000000000..602d77b2eb2 Binary files /dev/null and b/docs/utils/images/xai_cls.jpg differ diff --git a/docs/utils/images/xai_det.jpg b/docs/utils/images/xai_det.jpg new file mode 100644 index 00000000000..fabecf82f57 Binary files /dev/null and b/docs/utils/images/xai_det.jpg differ diff --git a/docs/utils/images/xai_example.jpg b/docs/utils/images/xai_example.jpg new file mode 100644 index 00000000000..5e6a981ca0c Binary files /dev/null and b/docs/utils/images/xai_example.jpg differ diff --git a/pyproject.toml b/pyproject.toml index 29ec1343a8e..0e977f25f66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ docs = [ "pydata-sphinx-theme==0.12.0", "sphinx-tabs", "sphinx-panels", + "sphinx-design", "sphinx-copybutton==0.5.0", "sphinx-autoapi", "sphinxemoji", diff --git a/tox.ini b/tox.ini index c1c7ab0e9f8..2fb0c99e73c 100644 --- a/tox.ini +++ b/tox.ini @@ -74,6 +74,9 @@ commands = deps = {[testenv:unit-test-py310]deps} .[docs] +commands_pre = + ; [TODO]: Needs to be fixed so that this is not duplicated for each test run + otx install -v change_dir = {toxinidir}/docs allowlist_externals = make