diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d8020c4a..dd5d6f81 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -14,7 +14,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pipenv + pip install "pipenv<2024.1.0" pipenv sync --dev - name: Test with pytest run: | diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..d42ec65b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.8.11-slim-bullseye +COPY --from=openjdk:11-jre-slim /usr/local/openjdk-11 /usr/local/openjdk-11 +ENV JAVA_HOME=/usr/local/openjdk-11 +COPY requirements.txt . +RUN pip install --upgrade pip==23.3 +RUN pip install -r requirements.txt +CMD pytest -v ptls_tests/ \ No newline at end of file diff --git a/DockerfilePaper b/DockerfilePaper new file mode 100644 index 00000000..98ced1c3 --- /dev/null +++ b/DockerfilePaper @@ -0,0 +1,25 @@ +FROM nvidia/cuda:11.1.1-runtime-ubuntu18.04 + +RUN apt-get update -y && \ + apt-get install -y libblas3 liblapack3 liblapack-dev libblas-dev gfortran libatlas-base-dev cmake + +RUN apt-get install -y software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y python3.7 python3.7-dev python3-pip && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 2 + +RUN python3 -m pip install -U pip + +RUN python3 -m pip install 'setuptools==60.5.0' 'Cython==0.29.26' 'typing_extensions==4.0.1' +RUN python3 -m pip install 'numpy==1.21.5' +RUN python3 -m pip install 'pythran' 'pybind11' +RUN python3 -m pip install 'scipy==1.7.3' +RUN python3 -m pip install 'luigi>=3.0.0' 'scikit-learn==1.0.2' 'pyarrow==6.0.1' 'pyspark==3.4.2' 'tqdm==4.62.3' \ + 'pandas==1.3.5' 'duckdb' 'pytest' 'pylint' 'coverage' 'pyhocon' +RUN python3 -m pip install 'torch==1.12.1' 'pytorch-lightning==1.6.5' 'torchmetrics==0.9.2' \ + 'hydra-core>=1.1.2' 'hydra-optuna-sweeper>=1.2.0' 'tensorboard==2.3.0' \ + 'omegaconf' 'transformers' 'lightgbm' 'wandb' + +RUN python3 -m pip cache purge +RUN apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/README.md b/README.md index 6194f607..4f76be92 100644 --- a/README.md +++ b/README.md @@ -50,18 +50,52 @@ pytest ## Demo notebooks -- Supervised model training [notebook](demo/supervised-sequence-to-target.ipynb) -- Self-supervided training and embeddings for downstream task [notebook](demo/coles-emb.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dllllb/pytorch-lifestream/blob/master/demo/coles-emb.ipynb) -- Self-supervided training and embeddings for clients' transactions [notebook](demo/transaction-emb.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dllllb/pytorch-lifestream/blob/master/demo/transaction-emb.ipynb) -- Self-supervided embeddings in CatBoost [notebook](demo/coles-catboost.ipynb) -- Self-supervided training and fine-tuning [notebook](demo/coles-finetune.ipynb) -- Self-supervised TrxEncoder only training with Masked Language Model task and fine-tuning [notebook](demo/mlm-emb.ipynb) -- Pandas data preprocessing options [notebook](demo/preprocessing-demo.ipynb) -- PySpark and Parquet for data preprocessing [notebook](demo/pyspark-parquet.ipynb) -- Fast inference on large dataset [notebook](demo/extended_inference.ipynb) -- Supervised multilabel classification [notebook](demo/multilabel-classification.ipynb) -- Text 
features demo:
-  - Using pretrained encoder to text features [notebook](demo/coles-pretrained-embeddings.ipynb)
+Learn event sequence deep learning analysis with Pytorch-Lifestream.
+
+We have collected a set of topics related to the processing of event sequences. Most themes are supported by demo code using the ptls library. We recommend studying the topics sequentially. However, if you are familiar with some areas, you can skip them and take only the relevant topics.
+
+| ix | Topic | Description | Demo |
+| ---- | --------------------------------------- | --------------------------------------- | ----- |
+| 1. | Prerequisites | | |
+| 1.1. | PyTorch | Deep Learning framework | https://pytorch.org/ |
+| 1.2. | PyTorch-Lightning | NN training framework | https://lightning.ai/ |
+| 1.3. | (optional) Hydra | Configuration framework | https://hydra.cc/ and [demo/Hydra CoLES Training.ipynb](./demo/Hydra%20CoLES%20Training.ipynb) |
+| 1.4. | pandas | Data preprocessing | https://pandas.pydata.org/ |
+| 1.5. | (optional) PySpark | Big Data preprocessing | [https://spark.apache.org/](https://spark.apache.org/docs/latest/api/python/index.html) |
+| 2. | Event sequences | Problem statement and classical methods | |
+| 2.1. | Event sequence for global problems | e.g. event sequence classification | TBD |
+| 2.2. | Event sequence for local problems | e.g. next event prediction | TBD |
+| 3. | Supervised neural networks | Supervised learning for event sequence classification | [demo/supervised-sequence-to-target.ipynb](./demo/supervised-sequence-to-target.ipynb) |
+| 3.1. | Network Types | Different networks for sequences | |
+| 3.1.1. | Recurrent neural networks | | TBD based on `supervised-sequence-to-target.ipynb` |
+| 3.1.2. | (optional) Convolutional neural networks | | TBD based on `supervised-sequence-to-target.ipynb` |
+| 3.1.3. | Transformers | | [demo/supervised-sequence-to-target-transformer.ipynb](demo/supervised-sequence-to-target-transformer.ipynb) |
+| 3.2. | Problem types | Different problem types for sequences | |
+| 3.2.1. | Global problems | Binary, multilabel, regression, ... | TBD based on [demo/multilabel-classification.ipynb](demo/multilabel-classification.ipynb) |
+| 3.2.2. | Local problems | Next event prediction | [demo/event-sequence-local-embeddings.ipynb](demo/event-sequence-local-embeddings.ipynb) |
+| 4. | Unsupervised learning | Pretrain self-supervised model with some proxy task | TBD based on [demo/coles-emb.ipynb](./demo/coles-emb.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dllllb/pytorch-lifestream/blob/master/demo/coles-emb.ipynb) |
+| 4.1. | (optional) Word2vec | Context based methods | |
+| 4.2. | MLM, RTD, GPT | Event based methods | Self-supervised training and embeddings for clients' transactions [notebook](demo/event-sequence-local-embeddings.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dllllb/pytorch-lifestream/blob/master/demo/event-sequence-local-embeddings.ipynb) |
+| 4.3. | NSP, SOP | Sequence based methods | [demo/nsp-sop-emb.ipynb](demo/nsp-sop-emb.ipynb) |
+| 5. | Contrastive and non-contrastive learning | Latent representation-based losses | TBD based on [demo/coles-emb.ipynb](./demo/coles-emb.ipynb) |
+| 5.1. | CoLES | | [demo/coles-emb.ipynb](./demo/coles-emb.ipynb) |
+| 5.2. | VICReg | | TBD based on [demo/coles-emb.ipynb](./demo/coles-emb.ipynb) |
+| 5.3. | CPC | | TBD based on [demo/coles-emb.ipynb](./demo/coles-emb.ipynb) |
+| 5.4. | MLM, TabFormer and others | Self-supervised TrxEncoder only training with Masked Language Model | [demo/mlm-emb.ipynb](./demo/mlm-emb.ipynb) [demo/tabformer-emb.ipynb](demo/tabformer-emb.ipynb) |
+| 6. | Pretrained model usage | | |
+| 6.1. | Downstream model on frozen embeddings | | TBD based on [demo/coles-emb.ipynb](./demo/coles-emb.ipynb) |
+| 6.2. | CatBoost embeddings features | | [demo/coles-catboost.ipynb](demo/coles-catboost.ipynb) |
+| 6.3. | Model finetuning | | [demo/coles-finetune.ipynb](./demo/coles-finetune.ipynb) |
+| 7. | Preprocessing options | Data preparation demos | [demo/preprocessing-demo.ipynb](demo/preprocessing-demo.ipynb) |
+| 7.1. | ptls-format parquet data loading | PySpark and Parquet for data preprocessing | [demo/pyspark-parquet.ipynb](demo/pyspark-parquet.ipynb) |
+| 7.2. | Fast inference for big datasets | | [demo/extended_inference.ipynb](demo/extended_inference.ipynb) |
+| 8. | Features special types | | |
+| 8.1. | Using pretrained encoder for text features | | [demo/coles-pretrained-embeddings.ipynb](demo/coles-pretrained-embeddings.ipynb) |
+| 8.2. | Multi source models | | [demo/CoLES-demo-multimodal-unsupervised.ipynb](demo/CoLES-demo-multimodal-unsupervised.ipynb) |
+| 9. | Trx Encoding options | | |
+| 9.1. | Basic options | | TBD |
+| 9.2. | Transaction Quantization | | TBD |
+| 9.3. | Transaction BPE | | TBD |
 
 ## Docs
diff --git a/demo/coles-emb.ipynb b/demo/coles-emb.ipynb
index ec9429fb..be6c3c04 100644
--- a/demo/coles-emb.ipynb
+++ b/demo/coles-emb.ipynb
@@ -17,11 +17,7 @@
    "source": [
     "import sys\n",
     "if 'google.colab' in str(get_ipython()):\n",
-    "    ! {sys.executable} -m pip install pytorch-lifestream\n",
-    "    ! {sys.executable} -m pip install -U 'torch<2' # downgrade for ptls==0.5.x\n",
-    "    ! {sys.executable} -m pip install -U 'pytorch-lightning<2' # downgrade for ptls==0.5.x\n",
-    "    ! {sys.executable} -m pip install -U 'torchvision<0.15.1' # downgrade for ptls==0.5.x\n",
-    "    ! {sys.executable} -m pip install -U 'torchaudio<2' # downgrade for ptls==0.5.x"
+    "    ! {sys.executable} -m pip install pytorch-lifestream"
   ]
  },
 {
@@ -432,7 +428,8 @@
    "\n",
    "trainer = pl.Trainer(\n",
    "    max_epochs=15,\n",
-   "    gpus=1 if torch.cuda.is_available() else 0,\n",
+   "    accelerator=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
+   "    devices=1 if torch.cuda.is_available() else \"auto\",\n",
    "    enable_progress_bar=False,\n",
    ")"
   ]
diff --git a/demo/event-sequence-local-embeddings.ipynb b/demo/event-sequence-local-embeddings.ipynb
index bf4c6f8e..b84d1afb 100644
--- a/demo/event-sequence-local-embeddings.ipynb
+++ b/demo/event-sequence-local-embeddings.ipynb
@@ -33,10 +33,6 @@
    "import sys\n",
    "if 'google.colab' in str(get_ipython()):\n",
    "    ! {sys.executable} -m pip install pytorch-lifestream\n",
-   "    ! {sys.executable} -m pip install -U 'torch<2' # downgrade for ptls==0.5.x\n",
-   "    ! {sys.executable} -m pip install -U 'pytorch-lightning<2' # downgrade for ptls==0.5.x\n",
-   "    ! {sys.executable} -m pip install -U 'torchvision<0.15.1' # downgrade for ptls==0.5.x\n",
-   "    ! {sys.executable} -m pip install -U 'torchaudio<2' # downgrade for ptls==0.5.x\n",
    "\n",
    "clear_output()"
  ],
@@ -672,8 +668,6 @@
    "output_type": "stream",
    "name": "stderr",
    "text": [
-    "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:478: LightningDeprecationWarning: Setting `Trainer(gpus=1)` is deprecated in v1.7 and will be removed in v2.0.
Please use `Trainer(accelerator='gpu', devices=1)` instead.\n",
-    "  rank_zero_deprecation(\n",
     "INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True\n",
     "INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores\n",
     "INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs\n",
@@ -689,7 +683,8 @@
    "\n",
    "trainer = pl.Trainer(\n",
    "    max_epochs=50,\n",
-   "    gpus=1 if torch.cuda.is_available() else 0,\n",
+   "    accelerator=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
+   "    devices=1 if torch.cuda.is_available() else \"auto\",\n",
    "    enable_progress_bar=False,\n",
    ")"
  ],
@@ -922,7 +917,10 @@
 {
   "cell_type": "code",
   "source": [
-   "predict = pl.Trainer(gpus=1).predict(inference_module, inference_dl)"
+   "predict = pl.Trainer(\n",
+   "    accelerator=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
+   "    devices=1 if torch.cuda.is_available() else \"auto\"\n",
+   ").predict(inference_module, inference_dl)"
  ],
  "metadata": {
   "id": "yH5PKWyxN7Xk",
diff --git a/demo/nsp-sop-emb.ipynb b/demo/nsp-sop-emb.ipynb
index 97056a57..d497bf3e 100644
--- a/demo/nsp-sop-emb.ipynb
+++ b/demo/nsp-sop-emb.ipynb
@@ -17,11 +17,7 @@
   "source": [
    "import sys\n",
    "if 'google.colab' in str(get_ipython()):\n",
-   "    ! {sys.executable} -m pip install pytorch-lifestream\n",
-   "    ! {sys.executable} -m pip install -U 'torch<2' # downgrade for ptls==0.5.x\n",
-   "    ! {sys.executable} -m pip install -U 'pytorch-lightning<2' # downgrade for ptls==0.5.x\n",
-   "    ! {sys.executable} -m pip install -U 'torchvision<0.15.1' # downgrade for ptls==0.5.x\n",
-   "    ! {sys.executable} -m pip install -U 'torchaudio<2' # downgrade for ptls==0.5.x"
+   "    ! {sys.executable} -m pip install pytorch-lifestream"
  ]
 },
 {
@@ -429,7 +425,8 @@
    "\n",
    "trainer = pl.Trainer(\n",
    "    max_epochs=15,\n",
-   "    gpus=1 if torch.cuda.is_available() else 0,\n",
+   "    accelerator=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
+   "    devices=1 if torch.cuda.is_available() else \"auto\",\n",
    "    enable_progress_bar=True,\n",
    ")"
  ]
@@ -811,7 +808,8 @@
    "\n",
    "trainer = pl.Trainer(\n",
    "    max_epochs=15,\n",
-   "    gpus=1 if torch.cuda.is_available() else 0,\n",
+   "    accelerator=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
+   "    devices=1 if torch.cuda.is_available() else \"auto\",\n",
    "    enable_progress_bar=True,\n",
    ")"
  ]
diff --git a/docs/data_load/datasets.md b/docs/data_load/datasets.md
index ee39baf0..2013101f 100644
--- a/docs/data_load/datasets.md
+++ b/docs/data_load/datasets.md
@@ -115,7 +115,7 @@ They takes `i_filters` as list of `iterable_processing` objects.
 
 ### Augmentations
 
-Sometimes we have to change an items from train data. This is `augmentations`.
+Sometimes we have to change items from train data. This is what `augmentations` do.
 They are in `ptls.data_load.augmentations`.
 
 Example:
@@ -138,11 +138,11 @@ Here `RandomSlice` augmentation take a random slice from source record.
 
 | Place it be before persist stage to run it once and save total cpu resource | Don't place it before persist stage because it kills the random |
 | Can delete items | Can not delete items |
 | Can yield new items | Can not create new items |
-| Works a generator and requires iterable processing | Works as a function can be both map or iterable |
+| Works as a generator and requires iterable processing | Works as a function; suits both map and iterable usage |
 
 ## In memory data
 
-In memory data is common case. Data can a list or generator with feature dicts.
+In memory data is a common case. Data can be a list or generator with feature dicts.
 
 ```python
 import torch
@@ -184,7 +184,7 @@ def data_gen(n):
 
 Both datasets support any kind of input: list or generator.
 As all datasets supports tha same format (list or generator) as input and output they can be chained.
-This make sense for some cases.
+This makes sense for some cases.
 
 Data pipelines:
 
@@ -237,7 +237,7 @@ for batch in dl:
 
 ## Parquet file read
 
-For large amount of data `pyspark` is possible engine to prepare data and convert it in feature dict format.
+For large amounts of data, `pyspark` is a possible engine to prepare data and convert it into the feature dict format.
 See `demo/pyspark-parquet.ipynb` with example of data preprocessing with `pyspark` and parquet file preparation.
 
 `ptls.data_load.datasets.ParquetDataset` is a dataset which reads parquet files with feature dicts.
@@ -249,8 +249,8 @@ See `demo/pyspark-parquet.ipynb` with example of data preprocessing with `pyspar
 - looks like a generator
 - supports `i_filters`
 
-You can feed `ParquetDataset` directly fo dataloader for `iterable` way of usage.
-Cou can combine `ParquetDataset` with `MemoryMapDataset` to `map` way of usage.
+You can feed `ParquetDataset` directly to a dataloader for the `iterable` way of usage.
+You can combine `ParquetDataset` with `MemoryMapDataset` for the `map` way of usage.
 
 `ParquetDataset` requires parquet file names.
 Usually `spark` saves many parquet files for one dataset, depending on the number of partitions.
@@ -264,7 +264,7 @@ Many files for one dataset allows you to:
 
 `ptls.data_load.datasets.PersistDataset` store items from source dataset to the memory.
 
-If you source data is iterator (like python generator or `ParquetDataset`)
+If your source data is an iterator (like a python generator or `ParquetDataset`)
 all `i_filters` will be called each time when you access the data.
 Persist the data into memory and `i_filters` will be called once.
 Much memory may be used to store all dataset items.
diff --git a/docs/data_load/date_pipeline.md b/docs/data_load/date_pipeline.md
index d6c0d750..3c422204 100644
--- a/docs/data_load/date_pipeline.md
+++ b/docs/data_load/date_pipeline.md
@@ -1,6 +1,6 @@
 # Data pipeline
 
-All process support `map` and `iterable` data.
+All processes support `map` and `iterable` data.
 
 There are steps in pipeline:
 
diff --git a/docs/feature_naming.md b/docs/feature_naming.md
index 3508a755..3b4375ce 100644
--- a/docs/feature_naming.md
+++ b/docs/feature_naming.md
@@ -2,19 +2,19 @@
 
 ## Feature types
 
-Information about transaction features are stored as array in dictionary.
+Information about transaction features is stored as arrays in a dictionary.
 
-There are feature types:
+These are the feature types:
 
-- Sequential feature - is a `np.ndarray` or `torch.tensor` of shape `(seq_len,)`
+- Sequential feature - is a `np.ndarray` or a `torch.tensor` of shape `(seq_len,)`
     - for categorical features contains category indexes with type `long`
     - for numerical features contains feature value with type `float`
 - Scalar values. It can be `target`, `id`, `labels` or `scalar features`.
-Types are depends on purpose. Type should be compatible with torch if value will be fed into neural network
+Types depend on purpose. The type should be compatible with torch if the value will be fed into a neural network.
 - Array values. It also can be `target`, `id`, `labels` or `vector features`.
 Type is `np.ndarray` or `torch.tensor`.
 
-Sequential features correspond user's transactions.
+Sequential features correspond to a user's transactions.
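+For illustration, a single user's record in this format might look like the following minimal sketch (the feature names and values here are invented for the example):
+
+```python
+import torch
+
+rec = {
+    'mcc_code': torch.tensor([1, 4, 2, 1]).long(),  # categorical sequential feature, shape (seq_len,)
+    'amount': torch.tensor([0.1, 2.0, 0.3, 1.5]),   # numerical sequential feature, shape (seq_len,)
+    'client_id': 42,                                # scalar value
+}
+```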
 The length of each user's sequential feature is equal to the length of the entire sequence.
 The order of each user's sequential feature is the same as sequence order.
 Sequential feature length `seq_len` may vary from user to user.
@@ -24,18 +24,18 @@
 Array features have a constant shape. This shape is the same for all users.
 
 This why we use `pad_sequence` which align length for sequential features
 and `stack` for array features during batch collection.
 
-`ptls` extract only sequential features for unsupervised task and additional target for the supervised task.
-Other fields used during preprocessing and inference.
+`ptls` extracts only sequential features for the unsupervised task and an additional target for the supervised task.
+Other fields are used during preprocessing and inference.
 
 ## Feature names
 
-The main purpose of the feature naming convention is sequential and array features distinguish.
-They both are `np.ndarray` or `torch.tensor` and we can't use data type for distinguish.
+The main purpose of the feature naming convention is to distinguish between sequential and array features.
+They both are `np.ndarray` or `torch.tensor` so we can't use the data type to distinguish them.
 
-It's important to know feature type because:
+It's important to know the feature type because:
 
 - sequential align lengths with `pad_sequence`, arrays use `stack` during batch collection.
-- only sequential features used to get length of entire sequence
+- only sequential features can be used to get the length of the entire sequence
 - only sequential features are augmented by timeline modifications like slice, trx dropout or shuffle
 
 We introduce naming rules to solve type discrimination problems.
@@ -62,7 +62,7 @@ x = {
 
 `target` prefix are mandatory only for array features.
 
-Sometimes we need a time sequence. It used fo trx correct order, for time features and for some splits.
+Sometimes we need a time sequence. It is used for correct trx order, for time features and for some splits.
 We expect that transaction timestamp stored in `event_time` field.
 
 ## Naming rules
@@ -95,7 +95,7 @@ print(dataset[0])
 
 ## Code usage
 
-Need to take into account the type of features and the use of naming rules is in the classes:
+These are the classes where it is necessary to take into account the feature types and the naming rules:
 
 - `ptls.data_load.feature_dict.FeatureDict`
 - `ptls.data_load.padded_batch.PaddedBatch`
diff --git a/docs/frames/coles.md b/docs/frames/coles.md
index f1c15224..c5ad58e0 100644
--- a/docs/frames/coles.md
+++ b/docs/frames/coles.md
@@ -5,11 +5,11 @@ Original paper: [CoLES: Contrastive Learning for Event Sequences with Self-Super
 
 CoLES is a framework that learn neural network to compress sequential data into a single embedding.
 
 Imagine a credit card transaction history that can be an example of user behavioral.
-Each user have his own behavioral patterns which are projected to his transaction history.
-Repeatability of behavioral patterns lead to repeatability in transaction history.
+Each user has his own behavioral patterns which are projected to his transaction history.
+Repeatability of behavioral patterns leads to repeatability in transaction history.
 
 CoLES exploit repeatability of patterns to make embedding. It samples a few subsequences from original sequence
-and calculates an embeddings for each of them. Embeddings are assigned to his user.
+and calculates an embedding for each of them. Embeddings are assigned to the corresponding user.
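+As a rough sketch of this sampling step (assuming the `SampleSlices` splitter from `ptls.frames.coles.split_strategy`; the parameter values are arbitrary):
+
+```python
+import torch
+from ptls.frames.coles.split_strategy import SampleSlices
+
+# Sample 5 subsequences, each of 10 to 20 events, from one user's history.
+splitter = SampleSlices(split_count=5, cnt_min=10, cnt_max=20)
+
+event_time = torch.arange(100)        # one user's ordered event timestamps
+indexes = splitter.split(event_time)  # list of 5 index arrays, one per subsequence
+subsequences = [event_time[ix] for ix in indexes]
+```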
 Subsequences represent the same user and contain the same behavioral patterns.
 CoLES catch these patterns by making closer users embeddings.
 It also tries to distance different users embeddings.
@@ -52,8 +52,8 @@ Notes:
 
 - there can be many types of class labels, this can be targets from supervised task.
 Labels for each class are provided by `ColesSupervisedDataset`.
-- class labels can be missed. Auxiliary loss are calculated only for labeled data.
-CoLES loss are calculated for all data.
+- class labels may be missing. Auxiliary loss is calculated only for labeled data.
+CoLES loss is calculated for all data.
 - auxiliary loss is `l_loss` attribute of `ColesSupervisedModule` constructor.
 
@@ -68,7 +68,7 @@
 Use `ptls.frames.coles.ColesDataset` or `ptls.frames.coles.ColesIterableDataset`
 It's parametrised by `splitter` as `ColesDataset`.
 `ColesSupervisedDataset` requires a list of columns where target labels are stored (`cols_classes` attribute).
-It used to provide these labels to dataloader.
+It is used to provide these labels to the dataloader.
 
 ## Coles losses and sampling strategies
 Use classes from:
@@ -80,9 +80,9 @@ Usage recommendations:
 
 - Auxiliary class labels don't change because they are client related.
 This means that you can use losses with memory to learn class centers in embedding space
 for `l_loss` in `ColesSupervisedModule`.
-Losses without memory calculates class center for batch.
+Losses without memory calculate class centers within a batch.
 - Don't use losses with memory as CoLES loss, cause Coles labels valid only in batch.
-CoLES labels is arange over batch, so e.g. 0-label correspond different clients in different batches.
+CoLES labels are an `arange` over the batch, so e.g. the 0-label corresponds to different clients in different batches.
 
 ## Head selection
 
diff --git a/docs/frames/common_usage.md b/docs/frames/common_usage.md
index ec2e02f1..790c4c24 100644
--- a/docs/frames/common_usage.md
+++ b/docs/frames/common_usage.md
@@ -1,9 +1,9 @@
 # `ptls.frames` usage
 
-`frames` means frameworks. They are collects a popular technics to train a models.
+`frames` means frameworks. They are collections of popular model training techniques.
 Each framework is a `LightningModule`. It means that you can train it with `pytorch_lightning.Trainer`.
 Frameworks consume data in a special format, so a `LightningDataModule` required.
-So there are three `pytorch_lightning` entities a required:
+So there are three `pytorch_lightning` entities required:
 
 - model
 - data
@@ -19,17 +19,17 @@ We make a special `torch.nn.Dataset` implementation for each framework. All of t
 
 - consume `map` or `iterable` input as dict of feature arrays
 - compatible with `ptls.frames.PtlsDataModule`
 
-Model is usually `seq_encoder` with `head` optional.
+Model is usually a `seq_encoder` with an optional `head`.
 We provide a model to framework assigned `LightningModule`.
 
 ## Example
 
-This example is for CoLES framework. You can try an others with the same way.
+This example is for the CoLES framework. You can try the others the same way.
 See module list in `ptls.frames` submodules.
 Check docstring for precise parameter tuning.
 
 ### Data generation
 
-We make a small test dataset. In real life you can use a many ways to load a data. See `ptls.data_load`.
+We make a small test dataset. In real life you can use many ways to load data. See `ptls.data_load`.
 
 ```python
 import torch
@@ -104,8 +104,8 @@ datamodule = PtlsDataModule(
 
 ### Model creation
 
-We have to create `seq_cncoder` that transform sequences to embedding
-and create `CoLESModule` that will train `seq_cncoder`.
+We have to create `seq_encoder` that transforms sequences to embedding
+and create `CoLESModule` that will train `seq_encoder`.
 
 ```python
 import torch.optim
@@ -161,13 +161,13 @@ Now `coles_module` with `seq_encoder` are trained.
 
 This demo shows how to make embedding with pretrained `seq_encoder`.
 
-`pytorch_lightning.Trainer` have `predict` method that calls `seq_encoder.forward`.
+`pytorch_lightning.Trainer` has a `predict` method that calls `seq_encoder.forward`.
 `predict` requires `LightningModule` but `seq_encoder` is `torch.nn.Module`.
-We should cover `seq_encoder` to `LightningModule`.
+We should wrap `seq_encoder` into a `LightningModule`.
 We can use `CoLESModule` or any other module if available.
 In this example we can use `coles_module` object.
 Sometimes we have only `seq_encoder`, e.g. loaded from disk.
-`CoLESModule` have a little overhead. There are head, loss and metrics inside.
+`CoLESModule` has a little overhead: there are a head, a loss and metrics inside.
 Other way is using lightweight `ptls.frames.supervised.SequenceToTarget` module.
 It can run inference with only `seq_encoder`.
 
diff --git a/docs/index.md b/docs/index.md
index 1950b28f..aff108f2 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -43,7 +43,7 @@ CoLES, SimCLR, CPC, VICReg, ...
   - Use one of the available `ptls.data_load.datasets` to define input for the models.
 2. **Choose framework for encoder train**.
   - There are both supervised of unsupervised frameworks in `ptls.frames`.
-  - Keep in mind that each framework requires his own batch format.
+  - Keep in mind that each framework requires its own batch format.
     Tools for batch collate can be found in the selected framework package.
 3. **Build encoder**.
   - All parts are available in `ptls.nn`.
diff --git a/docs/nn/seq_encoder.md b/docs/nn/seq_encoder.md
index c0af0fd1..53224306 100644
--- a/docs/nn/seq_encoder.md
+++ b/docs/nn/seq_encoder.md
@@ -5,6 +5,7 @@ All classes from `ptls.nn.seq_encoder` also available in `ptls.nn`
 `ptls.nn.seq_encoder` takes into account sequential structure and the links between transactions.
 
 There are 2 types of seq encoders:
+
 - required embeddings as input
 - requires raw features as input
 
@@ -53,7 +54,7 @@ trx_encoder = TrxEncoder(
 seq_encoder = RnnEncoder(input_size=trx_encoder.output_size, hidden_size=16)
 
 z = trx_encoder(x)
-y = seq_encoder(z)  # embeddings wor each transaction
+y = seq_encoder(z)  # embeddings for each transaction
 
 seq_encoder.is_reduce_sequence = True
 h = seq_encoder(z)  # embeddings for sequences, aggregate all transactions in one embedding
@@ -61,7 +62,7 @@ assert y.payload.size() == (3, 8, 16)
 assert h.size() == (3, 16)
 ```
 
-Usually `seq_encoder` used with preliminary `trx_encoder`. It's possible to pack them to `torch.nn.Sequential`.
+Usually `seq_encoder` is used with a preliminary `trx_encoder`. It's possible to pack them to `torch.nn.Sequential`.
 It's possible to add more layers between `trx_encoder` and `seq_encoder` (linear, normalisation, convolutions, ...).
 They should work with PaddedBatch. Examples will be presented later.
 Such layers also works after `seq_encoder`
@@ -128,7 +129,7 @@ config = """
 model = hydra.utils.instantiate(OmegaConf.create(config))['model']
 ```
 
-The second config are simpler. Both of configs make an identical model. You can check:
+The second config is simpler. Both configs make an identical model. You can check:
 ```python
 x = PaddedBatch(
     payload={
@@ -145,10 +146,10 @@ y = model(x)
 
 ## AggFeatureSeqEncoder
 
 `ptls.nn.AggFeatureSeqEncoder`.
-It looks like seq_encoder. It take raw features at input and provide reduced representation at output.
+It looks like seq_encoder. It takes raw features as input and provides a reduced representation at output.
 This encoder creates features, which are good for boosting model. This is a strong baseline for many tasks.
-`AggFeatureSeqEncoder` eat the same input as other seq_encoders, and it can easily be replaced
-by rnn of transformer seq encoder. It use gpu and works fast. It haven't parameters for learn.
+`AggFeatureSeqEncoder` takes the same input as other seq_encoders, and it can easily be replaced
+by an rnn or transformer seq encoder. It uses gpu and works fast. It has no learnable parameters.
 
 Possible pipeline:
 ```python
@@ -157,7 +158,7 @@ agg_embeddings = trainer.predict(seq_encoder, dataloader)
 catboost_model.fit(agg_embeddings, target)
 ```
 
-We plain to split `AggFeatureSeqEncoder` into components which will be compatible with other ptls-layers.
+We plan to split `AggFeatureSeqEncoder` into components which will be compatible with other ptls-layers.
 It will be possible to choose flexible between `TrxEncoder` with `AggSeqEncoder` and `OheEncoder` with `RnnEncoder`.
 
diff --git a/docs/nn/trx_encoder.md b/docs/nn/trx_encoder.md
index 21915ebd..07344578 100644
--- a/docs/nn/trx_encoder.md
+++ b/docs/nn/trx_encoder.md
@@ -16,7 +16,7 @@ x = PaddedBatch(
     length=torch.Tensor([2, 8, 5]).long()
 )
 ```
-And se can define a TrxEncoder
+And we can define a TrxEncoder
 ```python
 model = TrxEncoder(
     embeddings={
@@ -30,7 +30,7 @@ We should provide feature description to `TrxEncoder`.
 Dictionary size and embedding size for categorical features.
 Scaler name for numerical features. `identity` means no rescaling.
 
-`TrxEncoder` concatenate all feature embeddings, sow output embedding size will be `6 + 2 + 1`.
+`TrxEncoder` concatenates all feature embeddings, so the output embedding size will be `6 + 2 + 1`.
 You may get output size from `TrxEncoder` with property:
 ```python
 >>> model.output_size
diff --git a/docs/ptls_preprocessing.md b/docs/ptls_preprocessing.md
index 84fce62f..ef36c653 100644
--- a/docs/ptls_preprocessing.md
+++ b/docs/ptls_preprocessing.md
@@ -15,7 +15,7 @@ Use `pandas` for a small dataset and `pyspark` for a large one.
 3. Prepare `event_time` column. Convert it to a timestamp for a date and time, or use any sortable format otherwise.
 4. Fit and transform categorical features, from categorical values to embedding indexes.
 5. Check numeric feature column types
-6. Split and groups dataframe by users. One row was one transaction, one row became a user with a list of transactions.
+6. Split and group dataframe by users. Before this operation a row represents a single transaction; after it, a row represents a user and contains a list of all transactions by that user.
 7. Join user-level columns: target, labels, features.
 8. Done. Use data from memory or save it to parquet format.
 9. Save fitted preprocessing for future usage.
diff --git a/docs/sequential_data_definition.md b/docs/sequential_data_definition.md
index 5909d571..adb26f3f 100644
--- a/docs/sequential_data_definition.md
+++ b/docs/sequential_data_definition.md
@@ -33,12 +33,12 @@ We sort events by `date_time` for each user to assure correct event order.
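+For example, with `pandas` this ordering step is a per-user sort (a minimal sketch; the values are invented):
+
+```python
+import pandas as pd
+
+df = pd.DataFrame({
+    'client_id': [1, 1, 1, 2, 2],
+    'date_time': pd.to_datetime([
+        '2021-01-03', '2021-01-01', '2021-01-02', '2021-01-05', '2021-01-04']),
+    'mcc_code': [5411, 5912, 4814, 5411, 6011],
+    'amount': [12.0, 3.5, 1.2, 7.1, 50.0],
+})
+
+# Keep each user's events in chronological order.
+df = df.sort_values(['client_id', 'date_time']).reset_index(drop=True)
+```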
 Each event (transaction) are described by categorical field `mcc_code`, numerical field `amount`, and time field `date_time`.
 These fields allow to distinguish events, vectorize them na use as a features.
 
-`pytorch-lifeatream` supports this format of data and provides the tools to process it throw the pipeline.
+`pytorch-lifestream` supports this data format and provides the tools to process it through a pipeline.
 Data can be `pandas.DataFrame` or `pyspark.DataFrame`.
 
 ### Data collected in lists
 Table data should be converted to format more convenient for neural network feeding.
-There are steps:
+Here are the steps:
 
 1. Feature field transformation: encoding categorical features, amount normalizing, missing values imputing.
 This works like sklearn fit-transform preprocessors.
 2. We transfer flat table with events to set of users with event collections.
 3. Split events by feature fields. Features are stored as 1d-arrays. Sequence orders are kept.
 
-Previous example with can be presented as (feature transformation missed for visibility):
+Previous example (credit card transaction history) can be presented as follows (feature transformation omitted for clarity):
 
 ```
 [
@@ -74,10 +74,10 @@ This is a main input data format in `pytorch-lifeatream`. Supported:
 
 - in-memory augmentations and transformations
 
 ## Dataset
-`pytorch-lifeatream` provide multiple `torch.Dataset` implementations.
-Dataset item present single user information and can be a combination of:
+`pytorch-lifestream` provides multiple `torch.Dataset` implementations.
+A dataset item presents a single user's information and can be a combination of:
 
-- `record` - is a dictionary where kees are feature names and values are 1d-tensors with feature sequences.
+- `record` - is a dictionary where keys are feature names and values are 1d-tensors with feature sequences.
 Similar as data collected in lists.
 - `id` - how to identify a sequence
 - `target` - target value for supervised learning
@@ -91,11 +91,11 @@ X = dataset[0]
 
 ## DataLoader
 The main feature of `pytorch-lifestream` dataloader is customized `collate_fn`, provided to `torch.DataLoader` class.
 `collate_fn` collects single records of dictionaries to batch.
-Usually `collate_fn` pad and pack sequences into 2d tensors with shape `(B, T)`, where `B` - is sample num and `T` is max sequence length.
+Usually `collate_fn` pads and packs sequences into 2d tensors with shape `(B, T)`, where `B` is the sample count and `T` is the max sequence length.
 Each feature packed separately.
 
-Output is `PaddedBatch` type which collect together packed sequences and lengths.
-`PaddedBatch` compatible with all `pytorch-lifestream` modules.
+Output is the `PaddedBatch` type which collects together packed sequences and lengths.
+`PaddedBatch` is compatible with all `pytorch-lifestream` modules.
 
 Input and output example:
 ```python
diff --git a/docs/tuning.md b/docs/tuning.md
index 2ebdbcab..cbe4f8c7 100644
--- a/docs/tuning.md
+++ b/docs/tuning.md
@@ -1,17 +1,17 @@
 # Hyperparameters tuning
 
 We propose a demo for hyperparameters tuning with `hydra`, `optuna` and `tensorboard`.
-This is console application located in `demo/hparam_tuning`.
+This is a console application located in `demo/hparam_tuning`.
 
 # Intro
 
 After we build a network architecture we should tune hyperparameters.
-Automated tuning have a benefits:
+Automated tuning has benefits:
 
 - Automated iterations over hparam set is faster than manual choice
 - Automated iterations requires less operational costs
 - All results logged and can be inspected together
-- Iteration count limit allow measure quality improvement with fixed resources.
+- Iteration count limit allows measuring the quality improvement with fixed resources.
 - hparam optimisation tools implement effective strategy of parameter choice
 
 Keep in mind that is just a tool for hparam iteration.
diff --git a/ptls-banner.png b/ptls-banner.png
index 43a4be4e..9030977d 100644
Binary files a/ptls-banner.png and b/ptls-banner.png differ
diff --git a/ptls/data_load/datasets/duckdb_dataset.py b/ptls/data_load/datasets/duckdb_dataset.py
index 13d910fb..e8ac3059 100644
--- a/ptls/data_load/datasets/duckdb_dataset.py
+++ b/ptls/data_load/datasets/duckdb_dataset.py
@@ -48,6 +48,7 @@ def __execute_query(self):
             SELECT {self.col_id}, {fields}
             FROM {self.data_read_func}
             GROUP BY {self.col_id}
+            ORDER BY {self.col_id}
         """
 
         rel = duckdb.sql(query)
diff --git a/ptls/data_load/utils.py b/ptls/data_load/utils.py
index ea925bb8..b7e3f7a5 100644
--- a/ptls/data_load/utils.py
+++ b/ptls/data_load/utils.py
@@ -35,13 +35,15 @@ def collate_feature_dict(batch):
     lengths = torch.LongTensor([len(rec[seq_col]) for rec in batch])
     new_x = {}
     for k, v in new_x_.items():
-        if type(v[0]) is torch.Tensor:
+        if isinstance(v[0], torch.Tensor):
             if k.startswith('target'):
                 new_x[k] = torch.stack(v, dim=0)
             else:
                 new_x[k] = torch.nn.utils.rnn.pad_sequence(v, batch_first=True)
-        elif type(v[0]) is np.ndarray:
+        elif isinstance(v[0], np.ndarray):
             new_x[k] = v  # list of arrays[object]
+        elif isinstance(v[0], list):
+            new_x[k] = np.array(v, dtype=object)
         else:
             v = np.array(v)
             if v.dtype.kind == 'i':
diff --git a/ptls/frames/abs_module.py b/ptls/frames/abs_module.py
index 3529084b..4ae1be93 100644
--- a/ptls/frames/abs_module.py
+++ b/ptls/frames/abs_module.py
@@ -73,7 +73,7 @@ def validation_step(self, batch, _):
         y_h, y = self.shared_step(*batch)
         self._validation_metric(y_h, y)
 
-    def validation_epoch_end(self, outputs):
+    def on_validation_epoch_end(self):
         self.log(f'valid/{self.metric_name}', self._validation_metric.compute(), prog_bar=True)
         self._validation_metric.reset()
 
diff --git a/ptls/frames/bert/modules/mlm_module.py b/ptls/frames/bert/modules/mlm_module.py
index d2e000d6..4d9a82f4 100644
--- a/ptls/frames/bert/modules/mlm_module.py
+++ b/ptls/frames/bert/modules/mlm_module.py
@@ -173,10 +173,10 @@ def validation_step(self, batch, batch_idx):
         loss_mlm = self.loss_mlm(z_trx, is_train_step=False)
         self.valid_mlm_loss(loss_mlm)
 
-    def training_epoch_end(self, _):
+    def on_train_epoch_end(self):
         self.log(f'mlm/train_mlm_loss', self.train_mlm_loss, prog_bar=False)
         # self.train_mlm_loss reset not required here
 
-    def validation_epoch_end(self, _):
+    def on_validation_epoch_end(self):
         self.log(f'mlm/valid_mlm_loss', self.valid_mlm_loss, prog_bar=True)
         # self.valid_mlm_loss reset not required here
 
diff --git a/ptls/frames/bert/modules/mlm_nsp_module.py b/ptls/frames/bert/modules/mlm_nsp_module.py
index 849d96a0..f848b008 100644
--- a/ptls/frames/bert/modules/mlm_nsp_module.py
+++ b/ptls/frames/bert/modules/mlm_nsp_module.py
@@ -225,11 +225,11 @@ def validation_step(self, batch, batch_idx):
         self.valid_nsp_loss(loss_nsp)
         self.valid_mlm_loss(loss_mlm)
 
-    def training_epoch_end(self, _):
+    def on_train_epoch_end(self):
         self.log(f'mlm/train_mlm_loss', self.train_mlm_loss, prog_bar=False)
self.log(f'nsp/train_nsp_loss', self.train_nsp_loss, prog_bar=False) - def validation_epoch_end(self, _): + def on_validation_epoch_end(self): self.log(f'mlm/valid_mlm_loss', self.valid_mlm_loss, prog_bar=True) self.log(f'nsp/valid_nsp_loss', self.valid_nsp_loss, prog_bar=False) diff --git a/ptls/frames/coles/coles_supervised_module.py b/ptls/frames/coles/coles_supervised_module.py index f33b814b..7b0c1762 100644 --- a/ptls/frames/coles/coles_supervised_module.py +++ b/ptls/frames/coles/coles_supervised_module.py @@ -124,6 +124,6 @@ def validation_step(self, batch, _): y_h, y, l = self.shared_step(*batch) self._validation_metric(y_h, y) - def validation_epoch_end(self, outputs): + def on_validation_epoch_end(self): self.log(f'valid/{self.metric_name}', self._validation_metric.compute(), prog_bar=True) self._validation_metric.reset() diff --git a/ptls/frames/coles/losses/binomial_deviance_loss.py b/ptls/frames/coles/losses/binomial_deviance_loss.py index 0fc88074..ea2b46e7 100644 --- a/ptls/frames/coles/losses/binomial_deviance_loss.py +++ b/ptls/frames/coles/losses/binomial_deviance_loss.py @@ -12,7 +12,7 @@ class BinomialDevianceLoss(nn.Module): """ def __init__(self, pair_selector, alpha=1, beta=1, C=1): - super(BinomialDevianceLoss, self).__init__() + super().__init__() self.alpha = alpha self.beta = beta self.C = C diff --git a/ptls/frames/coles/losses/complex_loss.py b/ptls/frames/coles/losses/complex_loss.py index f9b7496a..aaa0baf6 100644 --- a/ptls/frames/coles/losses/complex_loss.py +++ b/ptls/frames/coles/losses/complex_loss.py @@ -6,7 +6,7 @@ class ComplexLoss(torch.nn.Module): """ def __init__(self, ml_loss, aug_loss, ml_loss_weight=1.): - super(ComplexLoss, self).__init__() + super().__init__() self.aug_loss = aug_loss self.ml_loss = ml_loss self.ml_loss_weight = ml_loss_weight diff --git a/ptls/frames/coles/losses/contrastive_loss.py b/ptls/frames/coles/losses/contrastive_loss.py index 56d2313e..2d0b9143 100644 --- a/ptls/frames/coles/losses/contrastive_loss.py +++ b/ptls/frames/coles/losses/contrastive_loss.py @@ -14,7 +14,7 @@ class ContrastiveLoss(nn.Module): """ def __init__(self, margin, sampling_strategy, distributed_mode = False, do_loss_mult = False): - super(ContrastiveLoss, self).__init__() + super().__init__() self.margin = margin self.pair_selector = sampling_strategy self.distributed_mode = distributed_mode diff --git a/ptls/frames/coles/losses/histogram_loss.py b/ptls/frames/coles/losses/histogram_loss.py index a7773ebf..a107c8a0 100644 --- a/ptls/frames/coles/losses/histogram_loss.py +++ b/ptls/frames/coles/losses/histogram_loss.py @@ -14,7 +14,7 @@ class HistogramLoss(torch.nn.Module): """ def __init__(self, num_steps=100): - super(HistogramLoss, self).__init__() + super().__init__() self.step = 2 / (num_steps - 1) self.eps = 1 / num_steps self.t = torch.arange(-1, 1+self.step, self.step).view(-1, 1) diff --git a/ptls/frames/coles/losses/margin_loss.py b/ptls/frames/coles/losses/margin_loss.py index 7756792b..1b4d8537 100644 --- a/ptls/frames/coles/losses/margin_loss.py +++ b/ptls/frames/coles/losses/margin_loss.py @@ -13,7 +13,7 @@ class MarginLoss(torch.nn.Module): """ def __init__(self, pair_selector, margin=1, beta=1.2): - super(MarginLoss, self).__init__() + super().__init__() self.margin = margin self.beta = beta self.pair_selector = pair_selector diff --git a/ptls/frames/coles/losses/triplet_loss.py b/ptls/frames/coles/losses/triplet_loss.py index 68defd26..9d9711ae 100644 --- a/ptls/frames/coles/losses/triplet_loss.py +++ 
b/ptls/frames/coles/losses/triplet_loss.py
@@ -11,7 +11,7 @@ class TripletLoss(nn.Module):
     """
 
     def __init__(self, margin, triplet_selector):
-        super(TripletLoss, self).__init__()
+        super().__init__()
         self.margin = margin
         self.triplet_selector = triplet_selector
 
diff --git a/ptls/frames/coles/losses/vicreg_loss.py b/ptls/frames/coles/losses/vicreg_loss.py
index 4bde2a19..b2e7ba21 100644
--- a/ptls/frames/coles/losses/vicreg_loss.py
+++ b/ptls/frames/coles/losses/vicreg_loss.py
@@ -8,7 +8,7 @@ class VicregLoss(torch.nn.Module):
     """
 
     def __init__(self, sim_coeff, std_coeff, cov_coeff):
-        super(VicregLoss, self).__init__()
+        super().__init__()
 
         self.sim_coeff = sim_coeff
         self.std_coeff = std_coeff
diff --git a/ptls/frames/coles/multimodal_dataset.py b/ptls/frames/coles/multimodal_dataset.py
index abb0e851..0ff3c111 100644
--- a/ptls/frames/coles/multimodal_dataset.py
+++ b/ptls/frames/coles/multimodal_dataset.py
@@ -4,8 +4,6 @@ from collections import defaultdict
 from ptls.data_load.feature_dict import FeatureDict
 from ptls.data_load.padded_batch import PaddedBatch
-from ptls.frames.coles import MultiModalSortTimeSeqEncoderContainer
-
 
 def collate_feature_dict(batch):
     new_x_ = defaultdict(list)
@@ -79,7 +77,8 @@ def __init__(
         col_id:
             column name with user_id
         source_names:
-            column name with name sources
+            column name with source names; must be specified in the same order as trx_encoders in
+            ptls.frames.coles.multimodal_module.MultiModalSortTimeSeqEncoderContainer
         col_time:
             column name with event_time
         """
@@ -155,4 +154,4 @@ def collate_fn(self, batch, return_dct_labels=False):
 
 class MultiModalIterableDataset(MultiModalDataset, torch.utils.data.IterableDataset):
-    pass
\ No newline at end of file
+    pass
diff --git a/ptls/frames/coles/multimodal_module.py b/ptls/frames/coles/multimodal_module.py
index c2c73918..56d20f35 100644
--- a/ptls/frames/coles/multimodal_module.py
+++ b/ptls/frames/coles/multimodal_module.py
@@ -1,17 +1,45 @@
+from typing import Dict
 import torch
+
 from ptls.data_load.padded_batch import PaddedBatch
+from ptls.nn.trx_encoder import TrxEncoder
+from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
 
 
 class MultiModalSortTimeSeqEncoderContainer(torch.nn.Module):
-    def __init__(self,
-                 trx_encoders,
-                 seq_encoder_cls,
-                 input_size,
-                 is_reduce_sequence=True,
-                 col_time='event_time',
-                 **seq_encoder_params
-                 ):
+    """Container for multimodal event sequences.
+
+    It is used when there is data on sequences of events of different modalities.
+    Subsequences are selected for each modality.
+    The modalities are then merged together, taking into account the time order.
+    For each modality, its own trx_encoder is used, after which the resulting event embeddings are fed to seq_encoder.
+
+    Parameters
+        trx_encoders:
+            Dict with trx encoders for each modality.
+        seq_encoder_cls:
+            Class of model which calculates embeddings for the original raw transaction sequences.
+            `seq_encoder` is trained by `CoLESModule` to get better representations of input sequences.
+            ptls.nn.seq_encoder.rnn_encoder.RnnEncoder can be used.
+        input_size:
+            Size of transaction embeddings.
+            Each trx_encoder should have the same linear_projection_size.
+        col_time:
+            A column containing the time of events in the data to be merged.
+ + An example of use can be found at the link: + https://github.com/dllllb/pytorch-lifestream/blob/main/ptls_tests/test_frames/test_coles/test_multimodal_coles_module.py + """ + + def __init__( + self, + trx_encoders: Dict[str, TrxEncoder], + seq_encoder_cls: AbsSeqEncoder, + input_size: int, + is_reduce_sequence: bool = True, + col_time: str = 'event_time', + **seq_encoder_params + ): super().__init__() - self.trx_encoders = torch.nn.ModuleDict(trx_encoders) self.seq_encoder = seq_encoder_cls( input_size=input_size, @@ -20,6 +48,7 @@ def __init__(self, ) self.col_time = col_time + self.input_size = input_size @property def is_reduce_sequence(self): @@ -32,12 +61,8 @@ def is_reduce_sequence(self, value): @property def embedding_size(self): return self.seq_encoder.embedding_size - - def get_tensor_by_indices(self, tensor, indices): - batch_size = tensor.shape[0] - return tensor[:, indices, :][torch.arange(batch_size), torch.arange(batch_size), :, :] - def merge_by_time(self, x): + def merge_by_time(self, x: Dict[str, torch.Tensor]): device = list(x.values())[1][0].device batch, batch_time = torch.tensor([], device=device), torch.tensor([], device=device) for source_batch in x.values(): @@ -47,14 +72,15 @@ def merge_by_time(self, x): batch_time[batch_time == 0] = float('inf') indices_time = torch.argsort(batch_time, dim=1) - batch = self.get_tensor_by_indices(batch, indices_time) + indices_time = indices_time.unsqueeze(-1).expand(-1, -1, self.input_size) + batch = torch.gather(batch, 1, indices_time) return batch def trx_encoder_wrapper(self, x_source, trx_encoder, col_time): if torch.nonzero(x_source.seq_lens).size()[0] == 0: return x_source.seq_lens, 'None', 'None' return x_source.seq_lens, x_source.payload[col_time], trx_encoder(x_source) - + def multimodal_trx_encoder(self, x): res = {} tmp_el = list(x.values())[0] @@ -68,9 +94,9 @@ def multimodal_trx_encoder(self, x): length = length + source_length return res, length - def forward(self, x): + def forward(self, x, **kwargs): x, length = self.multimodal_trx_encoder(x) x = self.merge_by_time(x) padded_x = PaddedBatch(payload=x, length=length) - x = self.seq_encoder(padded_x) - return x \ No newline at end of file + x = self.seq_encoder(padded_x, **kwargs) + return x diff --git a/ptls/frames/coles/multimodal_supervised_dataset.py b/ptls/frames/coles/multimodal_supervised_dataset.py index 0fdf4f88..014c22fc 100644 --- a/ptls/frames/coles/multimodal_supervised_dataset.py +++ b/ptls/frames/coles/multimodal_supervised_dataset.py @@ -4,7 +4,7 @@ from collections import defaultdict from ptls.data_load.feature_dict import FeatureDict from ptls.frames.coles.multimodal_dataset import collate_feature_dict, collate_multimodal_feature_dict, get_dict_class_labels - + class MultiModalSupervisedDataset(FeatureDict, torch.utils.data.Dataset): def __init__( @@ -14,7 +14,7 @@ def __init__( source_names, col_id='client_id', col_time='event_time', - + target_name = None, target_dtype = None, *args, **kwargs @@ -39,28 +39,28 @@ def __init__( int or float """ super().__init__(*args, **kwargs) - + self.data = data self.col_time = col_time self.col_id = col_id self.source_names = source_names self.source_features = source_features - + self.target_name = target_name self.target_dtype = target_dtype - + def __len__(self): return len(self.data) - + def __getitem__(self, idx): feature_arrays = self.data[idx] return self.split_source(feature_arrays) - + def __iter__(self): for feature_arrays in self.data: split_data = self.split_source(feature_arrays) yield 
split_data - + def split_source(self, feature_arrays): res = defaultdict(dict) for feature_name, feature_array in feature_arrays.items(): @@ -79,12 +79,12 @@ def split_source(self, feature_arrays): for source in res: res1[source] = [res[source]] return res1 - + def get_names(self, feature_name): idx_del = feature_name.find('_') return feature_name[:idx_del], feature_name[idx_del + 1:] - - + + def collate_fn(self, batch, return_dct_labels=False): dict_class_labels = get_dict_class_labels(batch) batch_y = [] @@ -95,6 +95,6 @@ def collate_fn(self, batch, return_dct_labels=False): padded_batch = collate_multimodal_feature_dict(batch) return padded_batch, torch.Tensor(batch_y) - + class MultiModalSupervisedIterableDataset(MultiModalSupervisedDataset, torch.utils.data.IterableDataset): pass \ No newline at end of file diff --git a/ptls/frames/coles/sampling_strategies/all_positive_pair_selector.py b/ptls/frames/coles/sampling_strategies/all_positive_pair_selector.py index 38ce6575..8e5a89ae 100644 --- a/ptls/frames/coles/sampling_strategies/all_positive_pair_selector.py +++ b/ptls/frames/coles/sampling_strategies/all_positive_pair_selector.py @@ -10,7 +10,7 @@ class AllPositivePairSelector(PairSelector): """ def __init__(self, balance=True): - super(AllPositivePairSelector, self).__init__() + super().__init__() self.balance = balance def get_pairs(self, embeddings, labels): diff --git a/ptls/frames/coles/sampling_strategies/all_triplets_selector.py b/ptls/frames/coles/sampling_strategies/all_triplets_selector.py index 03ac9493..ab23ac60 100644 --- a/ptls/frames/coles/sampling_strategies/all_triplets_selector.py +++ b/ptls/frames/coles/sampling_strategies/all_triplets_selector.py @@ -13,7 +13,7 @@ class AllTripletSelector(TripletSelector): """ def __init__(self): - super(AllTripletSelector, self).__init__() + super().__init__() def get_triplets(self, embeddings, labels): np_labels = labels.cpu().data.numpy() diff --git a/ptls/frames/coles/sampling_strategies/distance_weighted_pair_selector.py b/ptls/frames/coles/sampling_strategies/distance_weighted_pair_selector.py index fea5e11e..215d142f 100644 --- a/ptls/frames/coles/sampling_strategies/distance_weighted_pair_selector.py +++ b/ptls/frames/coles/sampling_strategies/distance_weighted_pair_selector.py @@ -35,7 +35,7 @@ class DistanceWeightedPairSelector(PairSelector): """ def __init__(self, batch_k, cutoff=0.5, nonzero_loss_cutoff=1.4, normalize=False): - super(DistanceWeightedPairSelector, self).__init__() + super().__init__() self.batch_k = batch_k self.cutoff = cutoff self.nonzero_loss_cutoff = nonzero_loss_cutoff diff --git a/ptls/frames/coles/sampling_strategies/hard_negative_pair_selector.py b/ptls/frames/coles/sampling_strategies/hard_negative_pair_selector.py index 7f97ae31..d6cc3c17 100644 --- a/ptls/frames/coles/sampling_strategies/hard_negative_pair_selector.py +++ b/ptls/frames/coles/sampling_strategies/hard_negative_pair_selector.py @@ -11,7 +11,7 @@ class HardNegativePairSelector(PairSelector): """ def __init__(self, neg_count=1): - super(HardNegativePairSelector, self).__init__() + super().__init__() self.neg_count = neg_count def get_pairs(self, embeddings, labels): diff --git a/ptls/frames/coles/sampling_strategies/hard_triplet_selector.py b/ptls/frames/coles/sampling_strategies/hard_triplet_selector.py index 7b00ceba..7a8e8b54 100644 --- a/ptls/frames/coles/sampling_strategies/hard_triplet_selector.py +++ b/ptls/frames/coles/sampling_strategies/hard_triplet_selector.py @@ -10,7 +10,7 @@ class 
HardTripletSelector(TripletSelector):
     """
 
     def __init__(self, neg_count=1):
-        super(HardTripletSelector, self).__init__()
+        super().__init__()
         self.neg_count = neg_count
 
     def get_triplets(self, embeddings, labels):
diff --git a/ptls/frames/coles/sampling_strategies/random_negative_triplet_selector.py b/ptls/frames/coles/sampling_strategies/random_negative_triplet_selector.py
index bc56d7fa..47e57149 100644
--- a/ptls/frames/coles/sampling_strategies/random_negative_triplet_selector.py
+++ b/ptls/frames/coles/sampling_strategies/random_negative_triplet_selector.py
@@ -9,7 +9,7 @@ class RandomNegativeTripletSelector(TripletSelector):
     """
 
     def __init__(self, neg_count=1):
-        super(RandomNegativeTripletSelector, self).__init__()
+        super().__init__()
         self.neg_count = neg_count
 
     def get_triplets(self, embeddings, labels):
diff --git a/ptls/frames/coles/sampling_strategies/semi_hard_triplet_selector.py b/ptls/frames/coles/sampling_strategies/semi_hard_triplet_selector.py
index 81f82a4a..fa4ca394 100644
--- a/ptls/frames/coles/sampling_strategies/semi_hard_triplet_selector.py
+++ b/ptls/frames/coles/sampling_strategies/semi_hard_triplet_selector.py
@@ -14,7 +14,7 @@ class SemiHardTripletSelector(TripletSelector):
     """
 
     def __init__(self, neg_count=1):
-        super(SemiHardTripletSelector, self).__init__()
+        super().__init__()
         self.neg_count = neg_count
 
     def get_triplets(self, embeddings, labels):
diff --git a/ptls/frames/gpt/gpt_module.py b/ptls/frames/gpt/gpt_module.py
index 0747dfee..a89b1a18 100644
--- a/ptls/frames/gpt/gpt_module.py
+++ b/ptls/frames/gpt/gpt_module.py
@@ -131,11 +131,11 @@ def validation_step(self, batch, batch_idx):
         loss_gpt = self.loss_gpt(out, labels, is_train_step=False)
         self.valid_gpt_loss(loss_gpt)
 
-    def training_epoch_end(self, _):
+    def on_train_epoch_end(self):
         self.log(f'gpt/train_gpt_loss', self.train_gpt_loss, prog_bar=False, sync_dist=True, rank_zero_only=True)
         # self.train_gpt_loss reset not required here
 
-    def validation_epoch_end(self, _):
+    def on_validation_epoch_end(self):
         self.log(f'gpt/valid_gpt_loss', self.valid_gpt_loss, prog_bar=True, sync_dist=True, rank_zero_only=True)
         # self.valid_gpt_loss reset not required here
 
@@ -189,4 +189,4 @@ def forward(self, batch):
             raise
         if self.model.hparams.norm_predict:
             out = out / (out.pow(2).sum(dim=-1, keepdim=True) + 1e-9).pow(0.5)
-        return out
\ No newline at end of file
+        return out
diff --git a/ptls/frames/supervised/metrics.py b/ptls/frames/supervised/metrics.py
index 0a70d9ce..6cf51317 100644
--- a/ptls/frames/supervised/metrics.py
+++ b/ptls/frames/supervised/metrics.py
@@ -155,7 +155,11 @@ def compute(self):
         b = torch.quantile(y, q, interpolation="nearest")
         y1 = torch.bucketize(y1, b1, out_int32=True)
         y = torch.bucketize(y, b, out_int32=True)
-        return torchmetrics.functional.accuracy(y1, y)
+        num_classes = y.max().item() + 1
+        if hasattr(torchmetrics.functional.classification, "multiclass_accuracy"):
+            return torchmetrics.functional.classification.multiclass_accuracy(y1, y, num_classes=num_classes)
+        else:
+            return torchmetrics.functional.classification.accuracy(y1, y, num_classes=num_classes)
 
 
 class JSDiv(torchmetrics.Metric):
diff --git a/ptls/frames/supervised/seq_to_target.py b/ptls/frames/supervised/seq_to_target.py
index c11f31e3..d3e9fa7e 100644
--- a/ptls/frames/supervised/seq_to_target.py
+++ b/ptls/frames/supervised/seq_to_target.py
@@ -127,7 +127,7 @@ def training_step(self, batch, _):
             mf(y_h, y)
         return loss
 
-    def training_epoch_end(self, outputs):
+    def on_train_epoch_end(self):
         for name, mf in self.train_metrics.items():
             self.log(f'train/{name}', mf.compute(), prog_bar=False)
         for name, mf in self.train_metrics.items():
@@ -140,7 +140,7 @@ def validation_step(self, batch, _):
         for name, mf in self.valid_metrics.items():
             mf(y_h, y)
 
-    def validation_epoch_end(self, outputs):
+    def on_validation_epoch_end(self):
         for name, mf in self.valid_metrics.items():
             self.log(f'valid/{name}', mf.compute(), prog_bar=True)
         for name, mf in self.valid_metrics.items():
@@ -152,7 +152,7 @@ def test_step(self, batch, _):
         for name, mf in self.test_metrics.items():
             mf(y_h, y)
 
-    def test_epoch_end(self, outputs):
+    def on_test_epoch_end(self):
         for name, mf in self.test_metrics.items():
             value = mf.compute().item()
             self.log(f'test/{name}', value, prog_bar=False)
diff --git a/ptls/frames/tabformer/tabformer_module.py b/ptls/frames/tabformer/tabformer_module.py
index 5dd1516b..720bc24c 100644
--- a/ptls/frames/tabformer/tabformer_module.py
+++ b/ptls/frames/tabformer/tabformer_module.py
@@ -206,11 +206,11 @@ def validation_step(self, batch, batch_idx):
         loss_tabformer = self.loss_tabformer(z_trx, tabf_labels, is_train_step=False)
         self.valid_tabformer_loss(loss_tabformer)
 
-    def training_epoch_end(self, _):
+    def on_train_epoch_end(self):
         self.log(f'tabformer/train_tabformer_loss', self.train_tabformer_loss, prog_bar=False)
         # self.train_tabformer_loss reset not required here
 
-    def validation_epoch_end(self, _):
+    def on_validation_epoch_end(self):
         self.log(f'tabformer/valid_tabformer_loss', self.valid_tabformer_loss, prog_bar=True)
         # self.valid_tabformer_loss reset not required here
 
diff --git a/ptls/make_datasets_spark.py b/ptls/make_datasets_spark.py
index 8265b6c2..1a837ece 100644
--- a/ptls/make_datasets_spark.py
+++ b/ptls/make_datasets_spark.py
@@ -4,6 +4,7 @@ import os
 import pickle
 from random import Random
+from typing import List, Optional
 
 import numpy as np
 import pandas as pd
@@ -12,47 +13,94 @@ import pyspark.sql.types as T
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
+from pyspark.sql import DataFrame  # For typing
 
 logger = logging.getLogger(__name__)
 
 
 class DatasetConverter:
+    """
+    Converts datasets from transaction list to features for metric learning.
+
+    The class is designed to be run from command line with arguments.
+    Call python3 make_datasets_spark.py --help to see arguments description.
+    """
     def __init__(self):
         self.config = None
 
-    def parse_args(self, args=None):
+    def parse_args(self, args: Optional[List[str]] = None) -> None:
+        """
+        Parses command line arguments and saves them to self.config.
+
+        Arguments:
+        ----------
+        args: Optional[List[str]]
+            List of arguments to parse. If None, sys.argv is used.
+        """
         parser = argparse.ArgumentParser()
-        parser.add_argument('--data_path', type=os.path.abspath)
-        parser.add_argument('--trx_files', nargs='+')
-        parser.add_argument('--target_files', nargs='*', default=[])
+        parser.add_argument('--data_path', type=os.path.abspath,
+                            help='Path to the directory containing trx files (datasets)')
+        parser.add_argument('--trx_files', nargs='+',
+                            help='List of dataset filenames with transaction features. '
+                                 'Note: target column will be ignored. '
+                                 'Please use --target_files to specify targets.')
+        parser.add_argument('--target_files', nargs='*', default=[],
+                            help='List of target files containing client_id and target columns. '
+                                 'The files can overlap with trx_files or be separate.')
         parser.add_argument('--target_as_array', action='store_true')
         parser.add_argument('--print_dataset_info', action='store_true')
         parser.add_argument('--sample_fraction', type=float, default=None)
         parser.add_argument('--col_client_id', type=str)
-        parser.add_argument('--cols_event_time', nargs='+')
+        parser.add_argument('--cols_event_time', nargs='+',
+                            help='Two arguments: 1) type of time transformation ' \
+                                 '2) time column name.\n' \
+                                 'Possible time transformation types: ' \
+                                 '"#float", "#datetime", "#gender"')
         parser.add_argument('--dict', nargs='*', default=[])
-        parser.add_argument('--cols_category', nargs='*', default=[])
-        parser.add_argument('--cols_log_norm', nargs='*', default=[])
+        parser.add_argument('--cols_category', nargs='*', default=[],
+                            help='List of categorical columns. All categorical ' \
+                                 'features are encoded with embedding indexes. ' \
+                                 'The indexes correspond to frequency rank: ' \
+                                 'all values are sorted by frequency in descending order ' \
+                                 'and numbered in that order, so the most common value ' \
+                                 'is replaced with 1, the second most common with 2, and so on.')
+        parser.add_argument('--cols_log_norm', nargs='*', default=[],
+                            help='List of columns to apply log transformation to. ' \
+                                 'Log transformation is applied as signum(x) * log(|x| + 1)')
        parser.add_argument('--col_target', nargs='*', default=[])
         parser.add_argument('--test_size', default='0.1')
-        parser.add_argument('--salt', type=int, default=42)
-        parser.add_argument('--max_trx_count', type=int, default=5000)
+        parser.add_argument('--salt', type=int, default=42,
+                            help='Random seed for client shuffling')
+        parser.add_argument('--max_trx_count', type=int, default=5000,
+                            help='Per-client transaction limit: transactions ' \
+                                 'exceeding this count will be removed')
         parser.add_argument('--output_train_path', type=os.path.abspath)
         parser.add_argument('--output_test_path', type=os.path.abspath)
         parser.add_argument('--output_test_ids_path', type=os.path.abspath)
         parser.add_argument('--save_partitioned_data', action='store_true')
-        parser.add_argument('--log_file', type=os.path.abspath)
+        parser.add_argument('--log_file', type=os.path.abspath,
+                            help='File to dump logs to. If set, logs will ' \
+                                 'be present in stdout and in the file, otherwise only in stdout. ' \
+                                 'Notice that stdout will always contain both ' \
+                                 'Spark logs and script logs, which makes it hard to read. ' \
+                                 'Thus, log_file is useful for reading the script logs alone.')
+
         args = parser.parse_args(args)
         logger.info('Parsed args:\n' + '\n'.join([f'  {k:15}: {v}' for k, v in vars(args).items()]))
         self.config = args
 
-    def spark_read_file(self, path):
+    def spark_read_file(self, path: str):
+        """
+        Creates a spark.DataFrame from a given file,
+        using the file extension to determine the format.
+        """
         spark = SparkSession.builder.getOrCreate()
 
         ext = os.path.splitext(path)[1]
@@ -66,10 +114,18 @@ def spark_read_file(self, path):
     def path_to_file(self, file_name):
         return os.path.join(self.config.data_path, file_name)
 
-    def load_source_data(self, trx_files):
+    def load_source_data(self, trx_files: List[str]):
         """
-        :param trx_files:
-        :return: spark.DataFrame with `event_time` column of float type
+        Arguments:
+        ----------
+        trx_files: List[str]
+            List of filenames stored in `self.config.data_path`
+            directory to load data from.
+
+        Returns:
+        --------
+        data: spark.DataFrame
+            spark.DataFrame with `event_time` column of float type
         """
         data = None
         for file in trx_files:
@@ -105,7 +161,7 @@ def pd_hist(self, df, name, bins=10):
         df['% of total'] = df['cnt'] / df['cnt'].sum()
         return df
 
-    def get_encoder(self, df, col_name):
+    def get_encoder(self, df: DataFrame, col_name: str) -> DataFrame:
         df = df.withColumn(col_name, F.coalesce(F.col(col_name).cast('string'), F.lit('#EMPTY')))
 
         col_orig = '_orig_' + col_name
@@ -120,6 +176,8 @@ def get_encoder(self, df, col_name):
 
         df_encoder = df_encoder.repartition(1)
         df_encoder.persist()
+
+        # The count() below triggers the computation, since pyspark evaluates lazily.
         _ = df_encoder.count()
 
         return df_encoder
@@ -221,8 +279,8 @@ def join_dict(self, df, df_dict_name, col_id):
         logger.info(f'Join with "{path}" done. New {col_counter} columns joined')
         return df
 
-    def trx_to_features(self, df_data, print_dataset_info,
-                        col_client_id, cols_event_time, cols_category, cols_log_norm, max_trx_count):
+    def trx_to_features(self, df_data, print_dataset_info: bool,
+                        col_client_id, cols_event_time, cols_category, cols_log_norm, max_trx_count: int):
         if print_dataset_info:
             unique_clients = df_data.select(col_client_id).distinct().count()
             logger.info(f'Found {unique_clients} unique clients')
@@ -443,6 +501,10 @@ def logging_config(self):
         logging.basicConfig(level=logging.INFO, format='%(funcName)-20s : %(message)s', handlers=handlers)
 
     def load_transactions(self):
+        """
+        Returns a single spark.DataFrame with transaction
+        data collected from all trx_files.
+        """
         spark = SparkSession.builder.getOrCreate()
 
         source_data = self.load_source_data(trx_files=self.config.trx_files)
diff --git a/ptls/metric_learn/ml_models.py b/ptls/metric_learn/ml_models.py
index b3b8dd25..25ac6791 100644
--- a/ptls/metric_learn/ml_models.py
+++ b/ptls/metric_learn/ml_models.py
@@ -21,7 +21,7 @@ def projection_head(input_size, output_size):
 
 class ModelEmbeddingEnsemble(nn.Module):
     def __init__(self, submodels):
-        super(ModelEmbeddingEnsemble, self).__init__()
+        super().__init__()
         self.models = nn.ModuleList(submodels)
 
     def forward(self, x: PaddedBatch, h_0: torch.Tensor = None):
diff --git a/ptls/nn/binarization.py b/ptls/nn/binarization.py
index 9a43a9e9..79eb4536 100644
--- a/ptls/nn/binarization.py
+++ b/ptls/nn/binarization.py
@@ -18,7 +18,7 @@ def backward(self, grad_outputs):
 
 class BinarizationLayer(nn.Module):
     def __init__(self, hs_from, hs_to):
-        super(BinarizationLayer, self).__init__()
+        super().__init__()
         self.linear = nn.Linear(hs_from, hs_to, bias=False)
 
     def forward(self, x):
diff --git a/ptls/nn/seq_encoder/transformer_encoder.py b/ptls/nn/seq_encoder/transformer_encoder.py
index 24f6de0b..e96fa89a 100644
--- a/ptls/nn/seq_encoder/transformer_encoder.py
+++ b/ptls/nn/seq_encoder/transformer_encoder.py
@@ -16,7 +16,7 @@ def __init__(self,
                  use_start_random_shift=True,
                  max_len=5000,
                  ):
-        super(PositionalEncoding, self).__init__()
+        super().__init__()
         self.use_start_random_shift = use_start_random_shift
         self.max_len = max_len
 
diff --git a/ptls/nn/trx_encoder/float_positional_encoding.py b/ptls/nn/trx_encoder/float_positional_encoding.py
index dacdac1e..993ea95b 100644
--- a/ptls/nn/trx_encoder/float_positional_encoding.py
+++ b/ptls/nn/trx_encoder/float_positional_encoding.py
@@ -6,7 +6,7 @@ class FloatPositionalEncoding(nn.Module):
     def __init__(self, out_size):
-        super(FloatPositionalEncoding, self).__init__()
+        super().__init__()
         self.out_size = out_size
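The frequency-rank category encoding described by the --cols_category help above can be pictured with a small PySpark snippet. This is a minimal sketch of the idea only (ad-hoc column and window names, not the actual get_encoder implementation):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [('food',), ('food',), ('food',), ('taxi',), ('taxi',), ('hotel',)],
    ['mcc_code'])

# Rank values by frequency: the most common maps to 1, the next to 2, ...
counts = df.groupBy('mcc_code').count()
w = Window.orderBy(F.col('count').desc(), F.col('mcc_code'))
encoder = counts.withColumn('mcc_code_idx', F.row_number().over(w)).drop('count')
df_encoded = df.join(encoder, on='mcc_code', how='left')
# food -> 1 (3 rows), taxi -> 2 (2 rows), hotel -> 3 (1 row)

diff --git a/ptls/pl_inference.py b/ptls/pl_inference.py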
index 849f6628..b08f7ab4 100644
--- a/ptls/pl_inference.py
+++ b/ptls/pl_inference.py
@@ -60,9 +60,21 @@ def main(conf: DictConfig):
         batch_size=conf.inference.get('batch_size', 128),
     )
 
-    gpus = 1 if torch.cuda.is_available() else 0
-    gpus = conf.inference.get('gpus', gpus)
-    df_scores = pl.Trainer(gpus=gpus, max_epochs=-1).predict(model, inference_dl)
+    if torch.cuda.is_available():
+        accelerator = "gpu"
+        devices = 1
+    else:
+        accelerator = "cpu"
+        devices = "auto"
+    user_defined_gpus = conf.inference.get("devices", conf.inference.get("gpus", None))
+    if user_defined_gpus is not None:
+        if user_defined_gpus:
+            accelerator = "gpu"
+            devices = user_defined_gpus
+        else:
+            accelerator = "cpu"
+            devices = "auto"
+    df_scores = pl.Trainer(accelerator=accelerator, devices=devices, max_epochs=-1).predict(model, inference_dl)
 
     df_scores = pd.concat(df_scores, axis=0)
     logger.info(f'df_scores examples: {df_scores.shape}:')
diff --git a/ptls/pl_inference_multimodal.py b/ptls/pl_inference_multimodal.py
index 5a1ebbfd..4ee7d3b9 100644
--- a/ptls/pl_inference_multimodal.py
+++ b/ptls/pl_inference_multimodal.py
@@ -75,4 +75,4 @@ def main(conf: DictConfig):
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/ptls/pl_train_module.py b/ptls/pl_train_module.py
index fc652a23..d88f3183 100644
--- a/ptls/pl_train_module.py
+++ b/ptls/pl_train_module.py
@@ -24,7 +24,7 @@ def main(conf: DictConfig):
     _use_best_epoch = _trainer_params.get('use_best_epoch', False)
 
     if 'callbacks' in _trainer_params:
-        logger.warning(f'Overwrite `trainer.callbacks`, was "{_trainer_params.checkpoint_callback}"')
+        logger.warning(f'Overwrite `trainer.callbacks`, was `{_trainer_params.get("enable_checkpointing", _trainer_params.get("checkpoint_callback", None))}`')
     _trainer_params_callbacks = []
 
     if _use_best_epoch:
diff --git a/ptls/preprocessing/pandas/event_time.py b/ptls/preprocessing/pandas/event_time.py
index 58ee11a8..57372ee1 100644
--- a/ptls/preprocessing/pandas/event_time.py
+++ b/ptls/preprocessing/pandas/event_time.py
@@ -5,7 +5,7 @@ def dt_to_timestamp(x: pd.Series):
-    return pd.to_datetime(x).astype('datetime64[s]').astype('int64') // 1000000000
+    return pd.to_datetime(x).astype('datetime64[ns]').astype('int64') // 1000000000
 
 
 def timestamp_to_dt(x: pd.Series):
diff --git a/ptls/preprocessing/pandas_preprocessor.py b/ptls/preprocessing/pandas_preprocessor.py
index 3e6ed5c0..87e04e0f 100644
--- a/ptls/preprocessing/pandas_preprocessor.py
+++ b/ptls/preprocessing/pandas_preprocessor.py
@@ -19,7 +19,7 @@ class PandasDataPreprocessor(DataPreprocessor):
     """Data preprocessor based on pandas.DataFrame
 
     During preprocessing it
-        * transform datetime column to `event_time`
+        * transforms datetime column to `event_time`
         * encodes category columns into indexes;
         * groups flat data by `col_id`;
         * arranges data into list of dicts with features
@@ -41,7 +41,7 @@ class PandasDataPreprocessor(DataPreprocessor):
         - 'none': without transformation, `col_event_time` is in correct format. Used `ColIdentityEncoder`
             Original column is kept by default cause it can be any type and we may use it in the future
     cols_category : list[str]
-        list of category columns. Each can me column name or `ColCategoryTransformer` implementation.
+        list of category columns. Each can be column name or `ColCategoryTransformer` implementation.
     category_transformation: str
         name of transformation for column names from `cols_category`
         - 'frequency': frequency encoding with `FrequencyEncoder`
diff --git a/ptls/preprocessing/pyspark_preprocessor.py b/ptls/preprocessing/pyspark_preprocessor.py
index cb67c388..2a2b2a70 100644
--- a/ptls/preprocessing/pyspark_preprocessor.py
+++ b/ptls/preprocessing/pyspark_preprocessor.py
@@ -26,7 +26,7 @@ class PysparkDataPreprocessor(DataPreprocessor):
     """Data preprocessor based on pyspark.sql.DataFrame
 
     During preprocessing it
-        * transform `cols_event_time` column with date and time
+        * transforms `cols_event_time` column with date and time
         * encodes category columns `cols_category` into ints;
         * apply logarithm transformation to `cols_log_norm' columns;
         * (Optional) select the last `max_trx_count` transactions for each `col_id`;
diff --git a/ptls/swa.py b/ptls/swa.py
index 51cf62b7..07768a4e 100644
--- a/ptls/swa.py
+++ b/ptls/swa.py
@@ -237,7 +237,7 @@ def load_state_dict(self, state_dict):
                           "param_groups": state_dict["param_groups"]}
         opt_state_dict = {"state": state_dict["opt_state"],
                           "param_groups": state_dict["param_groups"]}
-        super(SWA, self).load_state_dict(swa_state_dict)
+        super().load_state_dict(swa_state_dict)
         self.optimizer.load_state_dict(opt_state_dict)
         self.opt_state = self.optimizer.state
 
diff --git a/ptls_tests/test_data_load/test__init__.py b/ptls_tests/test_data_load/test__init__.py
index 40fb5a67..8ace8b2b 100644
--- a/ptls_tests/test_data_load/test__init__.py
+++ b/ptls_tests/test_data_load/test__init__.py
@@ -8,9 +8,9 @@
 
 def test_padded_collate():
     data = [
-        ({'a': torch.LongTensor([1, 2, 3, 4])}, torch.LongTensor([0])),
-        ({'a': torch.LongTensor([1, 2])}, torch.LongTensor([0])),
-        ({'a': torch.LongTensor([1])}, torch.LongTensor([1])),
+        ({'a': torch.tensor([1, 2, 3, 4])}, torch.tensor(0)),
+        ({'a': torch.tensor([1, 2])}, torch.tensor(0)),
+        ({'a': torch.tensor([1])}, torch.tensor(1)),
     ]
 
     tt = torch.LongTensor([
diff --git a/ptls_tests/test_frames/test_coles/test_coles_module.py b/ptls_tests/test_frames/test_coles/test_coles_module.py
index cb135613..9d312e5e 100644
--- a/ptls_tests/test_frames/test_coles/test_coles_module.py
+++ b/ptls_tests/test_frames/test_coles/test_coles_module.py
@@ -59,5 +59,5 @@ def test_train_loop():
         lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=1, gamma=1.0),
     )
     dl = RandomEventData(params['data_module'])
-    trainer = pl.Trainer(max_epochs=1, logger=None, checkpoint_callback=False)
+    trainer = pl.Trainer(max_epochs=1, logger=None, enable_checkpointing=False)
     trainer.fit(model, dl)
diff --git a/ptls_tests/test_frames/test_coles/test_multimodal_coles_module.py b/ptls_tests/test_frames/test_coles/test_multimodal_coles_module.py
new file mode 100644
index 00000000..07e07081
--- /dev/null
+++ b/ptls_tests/test_frames/test_coles/test_multimodal_coles_module.py
@@ -0,0 +1,150 @@
+import pytorch_lightning as pl
+import torch.optim
+import torch
+
+
+from pyhocon import ConfigFactory
+from ptls.nn import Head, TrxEncoder
+from functools import partial
+from collections import defaultdict
+from ptls.data_load.padded_batch import PaddedBatch
+
+from ptls.frames.coles import CoLESModule
+from ptls.frames.coles.multimodal_dataset import MultiModalIterableDataset
+from ptls.frames.coles.split_strategy import SampleSlices
+from ptls.frames import PtlsDataModule
+from ptls.frames.coles import MultiModalSortTimeSeqEncoderContainer
+from ptls.nn.seq_encoder.rnn_encoder import RnnEncoder
+
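+# Synthetic two-source fixture: every client record carries, per source
+# ('src1', 'src2'), tensors keyed '<src>_trans_type_<i>', '<src>_mcc_code_<i>'
+# and '<src>_amount_<i>', plus a sorted '<src>_event_time' tensor.
+def generate_multimodal_data(lengths, target_share=.5, target_type='bin_cls', use_feature_arrays_key=True):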
+    n = len(lengths)
+    if target_type == 'bin_cls':
+        targets = (torch.rand(n) >= target_share).long()
+    else:
+        raise AttributeError(f'Unknown target_type: {target_type}')
+    sources = ['src1', 'src2']
+    data_lst = []
+
+    for target, length in zip(targets, lengths):
+        data = {}
+        for idx, src in enumerate(sources):
+            data[f'src{idx+1}_event_time'] = (torch.rand(length)*100 + 1).long().sort().values
+            data[f'src{idx+1}_trans_type_{idx+1}'] = (torch.rand(length)*10 + 1).long()
+            data[f'src{idx+1}_mcc_code_{idx+1}'] = (torch.rand(length) * 20 + 1).long()
+            data[f'src{idx+1}_amount_{idx+1}'] = (torch.rand(length) * 1000 + 1).long()
+        data_lst.append(data)
+
+    return data_lst
+
+def create_train_mm_loader(data):
+
+    dataset = MultiModalIterableDataset(
+        data,
+        splitter = SampleSlices(split_count=5, cnt_min=20, cnt_max=200),
+        col_id='epk_id',
+        source_features={
+            'src1': ['trans_type_1',
+                     'mcc_code_1',
+                     'amount_1',
+                     'event_time',],
+            'src2': ['trans_type_2',
+                     'mcc_code_2',
+                     'amount_2',
+                     'event_time',],
+        },
+        source_names=['src1', 'src2']
+    )
+
+    dl = PtlsDataModule(
+        train_data=dataset, train_num_workers=0, train_batch_size=4,
+        valid_data=dataset, valid_num_workers=0, valid_batch_size=1
+    )
+    return dl.train_dataloader()
+
+class RandomMultimodalEventData(pl.LightningDataModule):
+    def __init__(self, params, target_type='bin_cls'):
+        super().__init__()
+        self.hparams.update(params)
+        self.target_type = target_type
+
+    def train_dataloader(self):
+        test_data = generate_multimodal_data((torch.rand(3) * 60 + 1).long(), target_type='bin_cls', use_feature_arrays_key=True)
+        train_loader = create_train_mm_loader(test_data)
+        return train_loader
+
+    def test_dataloader(self):
+        test_data = generate_multimodal_data((torch.rand(3) * 60 + 1).long(), target_type='bin_cls', use_feature_arrays_key=True)
+        train_loader = create_train_mm_loader(test_data)
+        return train_loader
+
+
+def tst_params():
+    params = {
+        "data_module": {
+            "train": {
+                "num_workers": 1,
+                "batch_size": 32,
+                "trx_dropout": 0.01,
+                "max_seq_len": 100,
+            },
+            "valid": {
+                "batch_size": 16,
+                "num_workers": 1,
+                "max_seq_len": 100
+            }
+        },
+        "rnn": {
+            "type": "gru",
+            "input_size": 64,
+            "seq_encoder_cls": RnnEncoder,
+            "hidden_size": 16,
+            "bidir": False,
+            "trainable_starter": "static"
+        },
+        "trx_encoder_1": {
+            "embeddings_noise": .003,
+            "norm_embeddings": False,
+            'embeddings': {
+                'mcc_code_1': {'in': 21, 'out': 3},
+                'trans_type_1': {'in': 11, 'out': 2},
+            },
+            'numeric_values': {'amount_1': 'log'},
+            "linear_projection_size": 64
+        },
+        "trx_encoder_2": {
+            "embeddings_noise": .003,
+            "norm_embeddings": False,
+            'embeddings': {
+                'mcc_code_2': {'in': 21, 'out': 3},
+                'trans_type_2': {'in': 11, 'out': 2},
+            },
+            'numeric_values': {'amount_2': 'log'},
+            "linear_projection_size": 64
+        },
+    }
+
+    params = ConfigFactory.from_dict(params)
+    return params
+
+
+def test_train_loop():
+    params = tst_params()
+
+    model = CoLESModule(
+        seq_encoder=MultiModalSortTimeSeqEncoderContainer(
+            trx_encoders={
+                "src1": TrxEncoder(**params['trx_encoder_1']),
+                "src2": TrxEncoder(**params['trx_encoder_2']),
+            },
+            **params['rnn'],
+        ),
+        head=Head(use_norm_encoder=True),
+        optimizer_partial=partial(torch.optim.Adam),
+        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=1, gamma=1.0),
+    )
+    dl = RandomMultimodalEventData(params['data_module'])
+    trainer = pl.Trainer(max_epochs=1, logger=None, enable_checkpointing=False)
+    trainer.fit(model, dl)
\ No newline at end of file
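The test updates below all apply the same PyTorch Lightning migration: gpus= becomes accelerator=/devices=, and checkpoint_callback= becomes enable_checkpointing=. A minimal before/after sketch, assuming Lightning >= 1.7 (argument values here are illustrative):

import pytorch_lightning as pl

# before: pl.Trainer(gpus=0, checkpoint_callback=False)
trainer = pl.Trainer(accelerator="cpu", devices="auto", enable_checkpointing=False)
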
diff --git a/ptls_tests/test_frames/test_cpc.py b/ptls_tests/test_frames/test_cpc.py
index 162e88d9..9791c725 100644
--- a/ptls_tests/test_frames/test_cpc.py
+++ b/ptls_tests/test_frames/test_cpc.py
@@ -71,8 +71,8 @@ def test_rnn_model():
     valid_loader = create_validation_loader(valid_ds, config['valid'])
 
     trainer = pl.Trainer(
-        gpus=None,
+        accelerator="cpu",
         max_steps=50,
-        checkpoint_callback=False,
+        enable_checkpointing=False,
     )
     trainer.fit(pl_module, train_loader, valid_loader)
diff --git a/ptls_tests/test_frames/test_inference_module.py b/ptls_tests/test_frames/test_inference_module.py
index ca9eb390..f12848c8 100644
--- a/ptls_tests/test_frames/test_inference_module.py
+++ b/ptls_tests/test_frames/test_inference_module.py
@@ -49,7 +49,7 @@ def test_inference_module_predict():
         model_out_name='pred',
     )
 
-    df_out = pd.concat(pl.Trainer(gpus=0, max_epochs=-1).predict(rnn_model, valid_loader))
+    df_out = pd.concat(pl.Trainer(accelerator="cpu", max_epochs=-1).predict(rnn_model, valid_loader))
 
     print(roc_auc_score(df_out['target'], df_out['pred']))
 
@@ -77,7 +77,7 @@ def test_score_model_mult2():
         model_out_name='pred',
         pandas_output=False,
     )
-    dict_out = pl.Trainer(gpus=0, max_epochs=-1).predict(model, iter(valid_loader))
+    dict_out = pl.Trainer(accelerator="cpu", max_epochs=-1).predict(model, iter(valid_loader))
 
     id1 = torch.cat([v['target_int'] for v in dict_out])
     id2 = np.concatenate([v['target_str'] for v in dict_out])
@@ -122,7 +122,7 @@ def test_inference_module_sequence():
         model_out_name='pred',
     )
 
-    df_out = pd.concat(pl.Trainer(gpus=0, max_epochs=-1).predict(rnn_model, valid_loader))
+    df_out = pd.concat(pl.Trainer(accelerator="cpu", max_epochs=-1).predict(rnn_model, valid_loader))
 
     assert df_out.shape == (trx_num, 20)
     assert list(df_out.mcc_code) == [mcc for usr in trx_data for mcc in usr['mcc_code']]
@@ -141,7 +141,7 @@ def test_inference_module_sequence_drop_seq():
         model_out_name='pred',
     )
 
-    df_out = pd.concat(pl.Trainer(gpus=0, max_epochs=-1).predict(rnn_model, valid_loader))
+    df_out = pd.concat(pl.Trainer(accelerator="cpu", max_epochs=-1).predict(rnn_model, valid_loader))
 
     assert df_out.shape == (trx_num, 17)
     assert list(df_out.target) == list(chain(*[[usr['target']]*usr['mcc_code'].shape[0] for usr in trx_data]))
@@ -160,7 +160,7 @@ def test_inference_module_record():
         model_out_name='pred',
     )
 
-    df_out = pd.concat(pl.Trainer(gpus=0, max_epochs=-1).predict(rnn_model, valid_loader))
+    df_out = pd.concat(pl.Trainer(accelerator="cpu", max_epochs=-1).predict(rnn_model, valid_loader))
 
     assert df_out.shape == (1000, 20)
     np.testing.assert_array_almost_equal(list(df_out.mcc_code)[0], [mcc for mcc in trx_data[0]['mcc_code']])
@@ -178,6 +178,6 @@ def test_inference_module_record_drop_seq():
         model_out_name='pred',
     )
 
-    df_out = pd.concat(pl.Trainer(gpus=0, max_epochs=-1).predict(rnn_model, valid_loader))
+    df_out = pd.concat(pl.Trainer(accelerator="cpu", max_epochs=-1).predict(rnn_model, valid_loader))
 
     assert df_out.shape == (1000, 17)
     assert list(df_out.target)[0] == trx_data[0]['target']
diff --git a/ptls_tests/test_frames/test_supervised/test_seq_to_target.py b/ptls_tests/test_frames/test_supervised/test_seq_to_target.py
index 476862d9..5370aa43 100644
--- a/ptls_tests/test_frames/test_supervised/test_seq_to_target.py
+++ b/ptls_tests/test_frames/test_supervised/test_seq_to_target.py
@@ -13,6 +13,27 @@
 from ptls_tests.test_data_load import RandomEventData
 
+
+def accuracy_metric(*args, **kwargs):
+    kwargs["task"] = kwargs.get("task", "multiclass")
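+    # torchmetrics >= 0.11 accepts the task= keyword; older releases raise
+    # TypeError on it, in which case the except branch retries with the pre-0.11 signature.
+    try:
+        return torchmetrics.classification.Accuracy(*args, **kwargs)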
+    except TypeError:
+        if kwargs.pop("task", None) == "binary":
+            kwargs["num_classes"] = 2
+        return torchmetrics.classification.Accuracy(*args, **kwargs)
+
+def auroc_metric(*args, **kwargs):
+    kwargs["task"] = kwargs.get("task", "multiclass")
+    try:
+        return torchmetrics.classification.AUROC(*args, **kwargs)
+    except TypeError:
+        if kwargs.pop("task", None) == "binary":
+            kwargs["num_classes"] = 2
+        return torchmetrics.classification.AUROC(*args, **kwargs)
+
 
 def get_rnn_params():
     return dict(
         seq_encoder=RnnSeqEncoder(
@@ -83,11 +102,11 @@ def test_train_loop_rnn_binary_classification():
             torch.nn.Flatten(start_dim=0),
         ),
         loss=BCELoss(),
-        metric_list=torchmetrics.AUROC(num_classes=2),
+        metric_list=auroc_metric(task="binary"),
         **get_rnn_params(),
     )
     dl = RandomEventData(tst_params_data(), target_type='bin_cls')
-    trainer = pl.Trainer(max_epochs=1, logger=None, checkpoint_callback=False)
+    trainer = pl.Trainer(max_epochs=1, logger=None, enable_checkpointing=False)
     trainer.fit(model, dl)
     print(trainer.logged_metrics)
@@ -100,13 +119,13 @@ def test_train_loop_rnn_milti_classification():
         ),
         loss=torch.nn.NLLLoss(),
         metric_list={
-            'auroc': torchmetrics.AUROC(num_classes=4),
-            'accuracy': torchmetrics.Accuracy(),
+            'auroc': auroc_metric(num_classes=4),
+            'accuracy': accuracy_metric(num_classes=4),
         },
         **get_rnn_params(),
     )
     dl = RandomEventData(tst_params_data(), target_type='multi_cls')
-    trainer = pl.Trainer(max_epochs=1, logger=None, checkpoint_callback=False)
+    trainer = pl.Trainer(max_epochs=1, logger=None, enable_checkpointing=False)
     trainer.fit(model, dl)
     print(trainer.logged_metrics)
@@ -118,11 +137,11 @@ def test_train_loop_rnn_regression():
             torch.nn.Flatten(start_dim=0),
         ),
         loss=torch.nn.MSELoss(),
-        metric_list=torchmetrics.MeanSquaredError(compute_on_step=False, squared=False),
+        metric_list=torchmetrics.MeanSquaredError(squared=False),
         **get_rnn_params(),
     )
     dl = RandomEventData(tst_params_data(), target_type='regression')
-    trainer = pl.Trainer(max_epochs=1, logger=None, checkpoint_callback=False)
+    trainer = pl.Trainer(max_epochs=1, logger=None, enable_checkpointing=False)
     trainer.fit(model, dl)
     print(trainer.logged_metrics)
@@ -151,33 +170,33 @@ def test_train_loop_transf():
             torch.nn.Flatten(start_dim=0),
         ),
         loss=BCELoss(),
-        metric_list=torchmetrics.AUROC(num_classes=2),
+        metric_list=auroc_metric(task="binary"),
         optimizer_partial=partial(torch.optim.Adam, lr=0.004),
         lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.8),
     )
     dl = RandomEventData(tst_params_data())
-    trainer = pl.Trainer(max_epochs=1, logger=None, checkpoint_callback=False)
+    trainer = pl.Trainer(max_epochs=1, logger=None, enable_checkpointing=False)
     trainer.fit(model, dl)
 
 
 # SequenceToTarget.metric_list
 def test_seq_to_target_metric_list_single_metric():
-    model = SequenceToTarget(metric_list=torchmetrics.Accuracy(), seq_encoder=None)
+    model = SequenceToTarget(metric_list=accuracy_metric(num_classes=2), seq_encoder=None)
     metric_name = next(iter(model.valid_metrics.keys()))
-    assert metric_name == 'Accuracy'
+    assert metric_name in {'Accuracy', 'MulticlassAccuracy'}
 
 
 def test_seq_to_target_metric_list_list_with_metric():
     model = SequenceToTarget(metric_list=[
-        torchmetrics.Accuracy(),
-        torchmetrics.AUROC(num_classes=2),
+        accuracy_metric(num_classes=2),
+        auroc_metric(num_classes=2),
     ], seq_encoder=None)
-    assert 'Accuracy' in model.valid_metrics
-    assert 'AUROC' in model.valid_metrics
+    assert 'Accuracy' in model.valid_metrics or 'MulticlassAccuracy' in model.valid_metrics
+    assert 'AUROC' in model.valid_metrics or 'MulticlassAUROC' in model.valid_metrics
 
 
 def test_seq_to_target_metric_list_dict_with_single_metric():
     model = SequenceToTarget(metric_list={
-        'acc': torchmetrics.Accuracy(),
+        'acc': accuracy_metric(num_classes=2),
     }, seq_encoder=None)
     metric_name = next(iter(model.valid_metrics.keys()))
     assert metric_name == 'acc'
@@ -185,20 +204,21 @@ def test_seq_to_target_metric_list_dict_with_metric():
     model = SequenceToTarget(metric_list={
-        'acc': torchmetrics.Accuracy(),
-        'auroc': torchmetrics.AUROC(num_classes=2),
+        'acc': accuracy_metric(num_classes=2),
+        'auroc': auroc_metric(num_classes=2),
     }, seq_encoder=None)
     assert 'acc' in model.valid_metrics
     assert 'auroc' in model.valid_metrics
 
 
 def test_seq_to_target_metric_list_dict_config_with_metric():
-    conf = omegaconf.OmegaConf.create("""
+    conf = omegaconf.OmegaConf.create(f"""
     auroc:
-        _target_: torchmetrics.AUROC
+        _target_: torchmetrics.classification.{'MulticlassAUROC' if hasattr(torchmetrics.classification, 'MulticlassAUROC') else 'AUROC'}
         num_classes: 2
     acc:
-        _target_: torchmetrics.Accuracy
+        _target_: torchmetrics.classification.{'MulticlassAccuracy' if hasattr(torchmetrics.classification, 'MulticlassAccuracy') else 'Accuracy'}
+        num_classes: 2
     """)
     model = SequenceToTarget(metric_list=hydra.utils.instantiate(conf), seq_encoder=None)
     assert 'acc' in model.valid_metrics
diff --git a/ptls_tests/test_pl_api.py b/ptls_tests/test_pl_api.py
index b2177140..3c3e5215 100644
--- a/ptls_tests/test_pl_api.py
+++ b/ptls_tests/test_pl_api.py
@@ -64,7 +64,8 @@ def test_train_inference():
 
     trainer = pl.Trainer(
         max_epochs=1,
-        gpus=0 if torch.cuda.is_available() else 0,
+        accelerator="cuda" if torch.cuda.is_available() else "cpu",
+        devices=1 if torch.cuda.is_available() else "auto",
         logger=False
     )
     trainer.fit(model, train_dl)
diff --git a/ptls_tests/test_pl_api_duckdb.py b/ptls_tests/test_pl_api_duckdb.py
index daa28610..c5523015 100644
--- a/ptls_tests/test_pl_api_duckdb.py
+++ b/ptls_tests/test_pl_api_duckdb.py
@@ -69,7 +69,8 @@ def test_train_inference():
 
     trainer = pl.Trainer(
         max_epochs=1,
-        gpus=0 if torch.cuda.is_available() else 0,
+        accelerator="cuda" if torch.cuda.is_available() else "cpu",
+        devices=1 if torch.cuda.is_available() else "auto",
         logger=False
     )
     trainer.fit(model, train_dl)
@@ -78,7 +79,7 @@
         (SELECT * FROM read_csv_auto('{Path(__file__).parent / 'age-transactions.csv'}') WHERE hash(client_id) % 5 == 0)
     """
-    
+
     test_ds = DuckDbDataset(
         data_read_func = test_data,
         col_id = 'client_id',
diff --git a/ptls_tests/test_preprocessing/test_pandas/test_user_group_transformer.py b/ptls_tests/test_preprocessing/test_pandas/test_user_group_transformer.py
index 32980615..c8ad69dd 100644
--- a/ptls_tests/test_preprocessing/test_pandas/test_user_group_transformer.py
+++ b/ptls_tests/test_preprocessing/test_pandas/test_user_group_transformer.py
@@ -12,7 +12,7 @@ def test_group():
         'amount': [10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0],
     })
     t = UserGroupTransformer(col_name_original='user_id')
-    records = t.fit_transform(df).to_dict(orient='record')
+    records = t.fit_transform(df).to_dict(orient='records')
     rec = records[1]
     assert rec['user_id'] == 1
     torch.testing.assert_close(rec['event_time'], torch.LongTensor([0, 1, 2, 3]))
@@ -28,7 +28,7 @@ def test_group_with_target():
         'target': [10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0],
     })
     t = UserGroupTransformer(col_name_original='user_id', cols_first_item=['target'])
-    records = t.fit_transform(df).to_dict(orient='record')
+    records = t.fit_transform(df).to_dict(orient='records')
     rec = records[1]
     assert rec['user_id'] == 1
     torch.testing.assert_close(rec['event_time'], torch.LongTensor([-1, 0, 1, 3]))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..651095a1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,192 @@
+absl-py==1.4.0
+alembic==1.11.0
+antlr4-python3-runtime==4.9.3
+anyio==3.6.2
+appnope==0.1.3
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+arrow==1.2.3
+astroid==2.15.5
+asttokens==2.2.1
+attrs==23.1.0
+autopage==0.5.1
+backcall==0.2.0
+beautifulsoup4==4.12.2
+bleach==6.0.0
+cachetools==5.3.0
+catboost==1.2
+certifi==2023.5.7
+cffi==1.15.1
+charset-normalizer==3.1.0
+click==8.1.3
+cliff==4.3.0
+cmaes==0.9.1
+cmd2==2.4.3
+colorlog==6.7.0
+comm==0.1.3
+contourpy==1.0.7
+coverage==7.2.5
+cycler==0.11.0
+debugpy==1.6.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.6
+docutils==0.20.1
+duckdb==0.8.0
+exceptiongroup==1.1.1
+executing==1.2.0
+fastjsonschema==2.16.3
+filelock==3.12.0
+fonttools==4.39.4
+fqdn==1.5.1
+fsspec[http]==2023.5.0
+ghp-import==2.1.0
+google-auth==2.18.0
+google-auth-oauthlib==1.0.0
+graphviz==0.20.1
+grpcio==1.54.2
+huggingface-hub==0.14.1
+hydra-core==1.3.2
+hydra-optuna-sweeper==1.2.0
+idna==3.4
+importlib-metadata==6.6.0
+importlib-resources==5.12.0
+iniconfig==2.0.0
+ipykernel==6.23.1
+ipython==8.12.2
+ipython-genutils==0.2.0
+ipywidgets==8.0.6
+isoduration==20.11.0
+isort==5.12.0
+jaraco.classes==3.2.3
+jedi==0.18.2
+jinja2==3.1.2
+joblib==1.2.0
+jsonpointer==2.3
+jsonschema==4.17.3
+jupyter==1.0.0
+jupyter-client==8.2.0
+jupyter-console==6.6.3
+jupyter-core==5.3.0
+jupyter-events==0.6.3
+jupyter-server==2.5.0
+jupyter-server-terminals==0.4.4
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==3.0.7
+keyring==23.13.1
+kiwisolver==1.4.4
+lazy-object-proxy==1.9.0
+mako==1.2.4
+markdown==3.4.3
+markdown-it-py==2.2.0
+markupsafe==2.1.2
+matplotlib==3.7.1
+matplotlib-inline==0.1.6
+mccabe==0.7.0
+mdurl==0.1.2
+mergedeep==1.3.4
+mistune==2.0.5
+mkdocs==1.6.1
+more-itertools==9.1.0
+nbclassic==1.0.0
+nbclient==0.7.4
+nbconvert==7.4.0
+nbformat==5.8.0
+nest-asyncio==1.5.6
+notebook==6.5.4
+notebook-shim==0.2.3
+numpy==1.23.5
+oauthlib==3.2.2
+omegaconf==2.3.0
+optuna==2.10.1
+packaging==23.1
+pandas==1.4.4
+pandocfilters==1.5.0
+parso==0.8.3
+pbr==5.11.1
+pexpect==4.8.0
+pickleshare==0.7.5
+pillow==9.5.0
+pkginfo==1.9.6
+pkgutil-resolve-name==1.3.10
+platformdirs==3.5.1
+plotly==5.14.1
+pluggy==1.0.0
+prettytable==3.7.0
+prometheus-client==0.16.0
+prompt-toolkit==3.0.38
+protobuf==3.20.1
+psutil==5.9.5
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py4j==0.10.9.7
+pyarrow==12.0.0
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pycparser==2.21
+pygments==2.15.1
+pyhocon==0.3.60
+pylint==2.17.4
+pyparsing==3.0.9
+pyperclip==1.8.2
+pyrsistent==0.19.3
+pyspark==3.4.0
+pytest==7.3.1
+python-dateutil==2.8.2
+python-json-logger==2.0.7
+pytz==2023.3
+pyyaml==6.0
+pyyaml-env-tag==0.1
+pyzmq==25.0.2
+qtconsole==5.4.3
+qtpy==2.3.1
+readme-renderer==37.3
+regex==2023.5.5
+requests==2.30.0
+requests-oauthlib==1.3.1
+requests-toolbelt==1.0.0
+rfc3339-validator==0.1.4
+rfc3986==2.0.0
+rfc3986-validator==0.1.1
+rich==13.3.5
+rsa==4.9
+scikit-learn==1.2.2
+scipy==1.10.1
+send2trash==1.8.2
+setuptools==67.7.2
+six==1.16.0
+sniffio==1.3.0
+soupsieve==2.4.1
+sqlalchemy==2.0.13
+stack-data==0.6.2
+stevedore==5.1.0
+tenacity==8.2.2
+tensorboard==2.13.0
+tensorboard-data-server==0.7.0
+terminado==0.17.1
+threadpoolctl==3.1.0
+tinycss2==1.2.1
+tokenizers==0.13.3
+tomli==2.0.1
+tomlkit==0.11.8
+tornado==6.3.2
+tqdm==4.65.0
+traitlets==5.9.0
+transformers==4.29.2
+twine==4.0.2
+typing-extensions==4.5.0
+uri-template==1.2.0
+urllib3==1.26.15
+watchdog==3.0.0
+wcwidth==0.2.6
+webcolors==1.13
+webencodings==0.5.1
+websocket-client==1.5.1
+werkzeug==2.3.4
+wheel==0.40.0
+widgetsnbextension==4.0.7
+wrapt==1.15.0
+zipp==3.15.0
+pytorch-lightning==2.4.0
+torch==2.1.2
+torchmetrics>=0.9.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index a7ed8ded..607ad3f2 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
 setuptools.setup(
     name='pytorch-lifestream',
-    version='0.5.3',
+    version='0.6.0',
     author='',
     author_email='',
     description='Lifestream data analysis with PyTorch',
@@ -21,10 +21,16 @@
     ],
     python_requires='>=3.7',
     install_requires=[
-        'pytorch-lightning==1.6.*',
-        'torch==1.12.*',
-        'numpy==1.23.*',
-        'torchmetrics==0.9.*',
+        'duckdb',
+        'hydra-core>=1.1.2',
+        'numpy>=1.21.5',
         'omegaconf',
+        'pandas>=1.3.5',
+        'pyarrow>=6.0.1',
+        'pytorch-lightning>=1.6.0',
+        'scikit-learn>=1.0.2',
+        'torch>=1.12.0',
+        'torchmetrics>=0.9.0',
+        'transformers',
     ],
 )
diff --git a/test.sh b/test.sh
new file mode 100644
index 00000000..603fff33
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,2 @@
+# sudo docker build -f Dockerfile -t pytorch-lifestream-tests .
+sudo docker run --name ptls_tests -it -v ${PWD}/ptls:/ptls -v ${PWD}/ptls_tests:/ptls_tests pytorch-lifestream-tests
\ No newline at end of file
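setup.py now allows any torchmetrics>=0.9.0, a range that spans the 0.11 task-API rewrite; that is why the tests above gate on the new class names with hasattr. A minimal sketch of the same version gate (make_accuracy is an illustrative helper, not a ptls API):

import torchmetrics

def make_accuracy(num_classes: int):
    # torchmetrics >= 0.11 ships explicit task classes; older releases
    # configure the task through num_classes on the generic Accuracy.
    if hasattr(torchmetrics.classification, "MulticlassAccuracy"):
        return torchmetrics.classification.MulticlassAccuracy(num_classes=num_classes)
    return torchmetrics.Accuracy(num_classes=num_classes)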