Add context storage benchmarking (#144)

Add utils for benchmarking pipeline-db interface performance.
deeppavlov · Sep 28, 2023 · a3ea816 · a3ea816
1 parent 58a0993
commit a3ea816
Show file tree

Hide file tree

Showing 23 changed files with 1,976 additions and 3 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,32 @@
+*.DS_Store*
+*.egg-info/
+dist/
+venv/
+build/
+docs/source/apiref
+docs/source/_misc
+docs/source/release_notes.rst
+docs/source/tutorials
+*__pycache__*
+*.idea/*
+.idea/*
+*.pyc
+.pytest_cache/*
+.mypy_cache
+modules/*
+dm_pickle*
+dialogue_manager*
+GlobalUserTableAccessor*
+memory_debugging*
+opening_database*
+_globals.py
+venv*
+.vscode
+.coverage
+.pytest_cache
+htmlcov
+tutorials/context_storages/dbs
+dbs
+benchmarks
+benchmark_results_files.json
+uploaded_benchmarks
diff --git a/.github/workflows/test_coverage.yml b/.github/workflows/test_coverage.yml
@@ -41,7 +41,7 @@ jobs:
 
       - name: clean environment
         run: |
-          export backup_files=( tests tutorials .env_file makefile .coveragerc pytest.ini docs )
+          export backup_files=( tests tutorials utils .env_file makefile .coveragerc pytest.ini docs )
           mkdir /tmp/backup
           for i in "${backup_files[@]}" ; do mv "$i"  /tmp/backup ; done
           rm -rf ..?* .[!.]* *

diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ dist/
 venv/
 build/
 docs/source/apiref
+docs/source/_misc
 docs/source/release_notes.rst
 docs/source/tutorials
 *__pycache__*
@@ -25,3 +26,7 @@ venv*
 .pytest_cache
 htmlcov
 tutorials/context_storages/dbs
+dbs
+benchmarks
+benchmark_results_files.json
+uploaded_benchmarks
diff --git a/README.md b/README.md
@@ -36,6 +36,7 @@ pip install dff[postgresql]  # dependencies for using PostgreSQL
 pip install dff[sqlite]  # dependencies for using SQLite
 pip install dff[ydb]  # dependencies for using Yandex Database
 pip install dff[telegram]  # dependencies for using Telegram
+pip install dff[benchmark]  # dependencies for benchmarking
 pip install dff[full]  # full dependencies including all options above
 pip install dff[tests]  # dependencies for running tests
 pip install dff[test_full]  # full dependencies for running all tests (all options above)

diff --git a/dff/utils/db_benchmark/__init__.py b/dff/utils/db_benchmark/__init__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa: F401
+from dff.utils.db_benchmark.benchmark import (
+    time_context_read_write,
+    DBFactory,
+    BenchmarkConfig,
+    BenchmarkCase,
+    save_results_to_file,
+    benchmark_all,
+)
+from dff.utils.db_benchmark.report import report
+from dff.utils.db_benchmark.basic_config import BasicBenchmarkConfig, basic_configurations
diff --git a/dff/utils/db_benchmark/basic_config.py b/dff/utils/db_benchmark/basic_config.py
@@ -0,0 +1,218 @@
+"""
+Basic Config
+------------
+This module contains basic benchmark configurations.
+
+It defines a simple configuration class (:py:class:`~.BasicBenchmarkConfig`)
+as well as a set of configurations that covers different dialogs a user might have and some edge-cases
+(:py:data:`~.basic_configurations`).
+"""
+from typing import Tuple, Optional
+import string
+import random
+
+from humanize import naturalsize
+from pympler import asizeof
+
+from dff.script import Message, Context
+from dff.utils.db_benchmark.benchmark import BenchmarkConfig
+
+
+def get_dict(dimensions: Tuple[int, ...]):
+    """
+    Return misc dictionary built in `dimensions` dimensions.
+
+    :param dimensions:
+        Dimensions of the dictionary.
+        Each element of the dimensions tuple is the number of keys on the corresponding level of the dictionary.
+        The last element of the dimensions tuple is the length of the string values of the dict.
+
+        e.g. dimensions=(1, 2) returns a dictionary with 1 key that points to a string of len 2.
+        whereas dimensions=(1, 2, 3) returns a dictionary with 1 key that points to a dictionary
+        with 2 keys each of which points to a string of len 3.
+
+        So, the len of dimensions is the depth of the dictionary, while its values are
+        the width of the dictionary at each level.
+
+        A single-element tuple is treated as ``(dimensions[0], 0)``, i.e. a dictionary with
+        ``dimensions[0]`` keys that point to empty strings.
+        An empty tuple is treated as ``(0, 0)``, i.e. an empty dictionary.
+    :return:
+        A dictionary whose keys on every level are ``"0"``, ``"1"``, ... and whose innermost values
+        are random strings of characters drawn from ``string.printable``.
+    """
+
+    def _get_dict(dimensions: Tuple[int, ...]):
+        # The first element is the number of keys on this level (or the string length on the last level),
+        # the remaining elements describe every nested value.
+        if len(dimensions) < 2:
+            # get a random string of length dimensions[0]
+            return "".join(random.choice(string.printable) for _ in range(dimensions[0]))
+        return {str(i): _get_dict(dimensions[1:]) for i in range(dimensions[0])}
+
+    # Tuples with fewer than two elements are padded, so the result is always a dictionary, never a bare string.
+    if len(dimensions) > 1:
+        return _get_dict(dimensions)
+    elif len(dimensions) == 1:
+        return _get_dict((dimensions[0], 0))
+    else:
+        return _get_dict((0, 0))
+
+
+def get_message(message_dimensions: Tuple[int, ...]):
+    """
+    Return message whose misc field is a dictionary generated by :py:func:`~.get_dict`.
+
+    :param message_dimensions: Dimensions of the misc field of the message. See :py:func:`~.get_dict`.
+    :return: A message constructed with only the `misc` argument set.
+    """
+    return Message(misc=get_dict(message_dimensions))
+
+
+def get_context(
+    dialog_len: int,
+    message_dimensions: Tuple[int, ...],
+    misc_dimensions: Tuple[int, ...],
+) -> Context:
+    """
+    Return context with generated misc, labels, requests and responses fields.
+
+    Labels, requests and responses are keyed by integers from 0 to `dialog_len` - 1.
+    Label number `i` is the tuple ``("flow_i", "node_i")``.
+    Every request and every response is generated by its own call to :py:func:`~.get_message`.
+
+    :param dialog_len: Number of labels, requests and responses.
+    :param message_dimensions:
+        A parameter used to generate messages for requests and responses. See :py:func:`~.get_message`.
+    :param misc_dimensions:
+        A parameter used to generate misc field. See :py:func:`~.get_dict`.
+    :return: A context constructed from the generated labels, requests, responses and misc.
+    """
+    return Context(
+        labels={i: (f"flow_{i}", f"node_{i}") for i in range(dialog_len)},
+        requests={i: get_message(message_dimensions) for i in range(dialog_len)},
+        responses={i: get_message(message_dimensions) for i in range(dialog_len)},
+        misc=get_dict(misc_dimensions),
+    )
+
+
+class BasicBenchmarkConfig(BenchmarkConfig, frozen=True):
+    """
+    A simple benchmark configuration that generates contexts using two parameters:
+
+    - `message_dimensions` -- to configure the way messages are generated.
+    - `misc_dimensions` -- to configure size of context's misc field.
+
+    Dialog length is configured using `from_dialog_len`, `to_dialog_len`, `step_dialog_len`.
+    """
+
+    context_num: int = 30
+    """
+    Number of times the contexts will be benchmarked.
+    Increasing this number decreases standard error of the mean for benchmarked data.
+    """
+    from_dialog_len: int = 300
+    """Starting dialog len of a context."""
+    to_dialog_len: int = 311
+    """
+    Final dialog len of a context.
+    :py:meth:`~.BasicBenchmarkConfig.context_updater` will return contexts
+    until their dialog len is less than `to_dialog_len`.
+    """
+    step_dialog_len: int = 1
+    """
+    Increment step for dialog len.
+    :py:meth:`~.BasicBenchmarkConfig.context_updater` will return contexts
+    increasing dialog len by `step_dialog_len`.
+    """
+    message_dimensions: Tuple[int, ...] = (10, 10)
+    """
+    Dimensions of misc dictionaries inside messages.
+    See :py:func:`~.get_message`.
+    """
+    misc_dimensions: Tuple[int, ...] = (10, 10)
+    """
+    Dimensions of misc dictionary.
+    See :py:func:`~.get_dict`.
+    """
+
+    def get_context(self) -> Context:
+        """
+        Return context with `from_dialog_len`, `message_dimensions`, `misc_dimensions`.
+
+        Wraps :py:func:`~.get_context`.
+        """
+        return get_context(self.from_dialog_len, self.message_dimensions, self.misc_dimensions)
+
+    def info(self):
+        """
+        Return fields of this instance and sizes of objects defined by this config.
+
+        Every size is measured with `asizeof.asizeof` on an object generated during this call
+        and formatted with `naturalsize(..., gnu=True)`.
+
+        :return:
+            A dictionary with two keys.
+            Key "params" stores fields of this configuration.
+            Key "sizes" stores string representation of following values:
+
+                - "starting_context_size" -- size of a context with `from_dialog_len`.
+                - "final_context_size" -- size of a context with `to_dialog_len`.
+                  A context of this size will never actually be benchmarked.
+                - "misc_size" -- size of a misc field of a context.
+                - "message_size" -- size of a misc field of a message.
+        """
+        return {
+            "params": self.model_dump(),
+            "sizes": {
+                "starting_context_size": naturalsize(asizeof.asizeof(self.get_context()), gnu=True),
+                # Uses the module-level `get_context` to build a context with `to_dialog_len` instead of the starting len.
+                "final_context_size": naturalsize(
+                    asizeof.asizeof(get_context(self.to_dialog_len, self.message_dimensions, self.misc_dimensions)),
+                    gnu=True,
+                ),
+                "misc_size": naturalsize(asizeof.asizeof(get_dict(self.misc_dimensions)), gnu=True),
+                "message_size": naturalsize(asizeof.asizeof(get_message(self.message_dimensions)), gnu=True),
+            },
+        }
+
+    def context_updater(self, context: Context) -> Optional[Context]:
+        """
+        Update context to have `step_dialog_len` more labels, requests and responses,
+        unless such dialog len would be equal to `to_dialog_len` or exceed it,
+        in which case None is returned.
+
+        :param context: Context to update. It is modified in place.
+        :return: The same context object after the update or None if the dialog len limit is reached.
+        """
+        # Current dialog len is taken from the number of labels.
+        start_len = len(context.labels)
+        # Strict comparison: a context with dialog len equal to `to_dialog_len` is never produced.
+        if start_len + self.step_dialog_len < self.to_dialog_len:
+            for i in range(start_len, start_len + self.step_dialog_len):
+                context.add_label((f"flow_{i}", f"node_{i}"))
+                context.add_request(get_message(self.message_dimensions))
+                context.add_response(get_message(self.message_dimensions))
+            return context
+        else:
+            return None
+
+
+basic_configurations = {
+    # Dialog len from 1 up to 50 (exclusive); five-level nested message and context misc dictionaries.
+    "large-misc": BasicBenchmarkConfig(
+        from_dialog_len=1,
+        to_dialog_len=50,
+        message_dimensions=(3, 5, 6, 5, 3),
+        misc_dimensions=(2, 4, 3, 8, 100),
+    ),
+    # Dialog len from 500 up to 550 (exclusive); small messages; context misc is an empty dictionary.
+    "short-messages": BasicBenchmarkConfig(
+        from_dialog_len=500,
+        to_dialog_len=550,
+        message_dimensions=(2, 30),
+        misc_dimensions=(0, 0),
+    ),
+    # All parameters are left at the defaults of BasicBenchmarkConfig.
+    "default": BasicBenchmarkConfig(),
+    # Same dimensions as "large-misc", but with dialog len from 500 up to 550 (exclusive).
+    "large-misc--long-dialog": BasicBenchmarkConfig(
+        from_dialog_len=500,
+        to_dialog_len=550,
+        message_dimensions=(3, 5, 6, 5, 3),
+        misc_dimensions=(2, 4, 3, 8, 100),
+    ),
+    # Dialog len from 10000 up to 10050 (exclusive) with default dimensions; context_num is lowered to 10.
+    "very-long-dialog-len": BasicBenchmarkConfig(
+        context_num=10,
+        from_dialog_len=10000,
+        to_dialog_len=10050,
+    ),
+    # Misc of every message has 10000 keys that point to 1-character strings; context_num is lowered to 10.
+    "very-long-message-len": BasicBenchmarkConfig(
+        context_num=10,
+        from_dialog_len=1,
+        to_dialog_len=3,
+        message_dimensions=(10000, 1),
+    ),
+    # Misc of the context has 10000 keys that point to 1-character strings; context_num is lowered to 10.
+    "very-long-misc-len": BasicBenchmarkConfig(
+        context_num=10,
+        from_dialog_len=1,
+        to_dialog_len=3,
+        misc_dimensions=(10000, 1),
+    ),
+}
+"""
+Configuration that covers many dialog cases (as well as some edge-cases).
+
+:meta hide-value:
+"""