PB-445 💎 Release v0.6.0

xaynetwork · Feb 26, 2020 · d3f1842 · d3f1842
2 parents 67eae40 + dfa6bf2
commit d3f1842
Show file tree

Hide file tree

Showing 27 changed files with 1,063 additions and 380 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,26 @@ For reference, the possible headings are:
 
 ## [Unreleased]
 
+## [0.6.0] - 2020-02-26
+
+- HOTFIX add disclaimer (#309) [janpetschexain]
+- PB-314: document the new weight exchange mechanism (#308) [Corentin Henry]
+- PB-407 add more debug level logging (#303) [janpetschexain]
+- PB-44 add heartbeat time and timeout to config (#305) [Robert Steiner]
+- PB-423 lock round access (#304) [kwok]
+- PB-439 Make thread pool workers configurable (#302) [Robert Steiner]
+- PB-159: update xain-{proto,sdk} dependencies to the right branch (#301) [Corentin Henry]
+- PB-159: remove weights from gRPC messages (#298) [Corentin Henry]
+- PB-431 send participant state to influxdb (#300) [Robert Steiner]
+- PB-434 separate metrics (#296) [Robert Steiner]
+- PB-406 :snowflake: Configure mypy (#297) [Anastasiia Tymoshchuk]
+- PB-428 send coordinator states (#292) [Robert Steiner]
+- PB-425 split weight init from training (#295) [janpetschexain]
+- PB-398 Round resumption in Coordinator (#285) [kwok]
+- Merge pull request #294 from xainag/master. [Daniel Kravetz]
+- Hotfix: PB-432 :pencil: :books: Update test badge and CI to reflect changes. [Daniel Kravetz]
+- PB-417 Start new development cycle (#291) [Anastasiia Tymoshchuk, kwok]
+
 ## [0.5.0] - 2020-02-12
 
 Fix minor issues, update documentation.

diff --git a/README.md b/README.md
@@ -4,6 +4,12 @@
 [![Documentation Status](https://readthedocs.org/projects/xain-fl/badge/?version=latest)](https://xain-fl.readthedocs.io/en/latest/?badge=latest)
 [![Gitter chat](https://badges.gitter.im/xainag.png)](https://gitter.im/xainag)
 
+---
+
+**Disclaimer: This is a work in progress and not production-ready; expect errors to occur! Use at your own risk! Do not use for any security-related issues!**
+
+---
+
 # XAIN
 
 The XAIN project is building a privacy layer for machine learning so that AI projects can meet compliance such as

diff --git a/configs/example-config.toml b/configs/example-config.toml
@@ -7,6 +7,12 @@
 host = "localhost"
 # (Optional) Port to listen on for incoming gRPC connections
 port = 50051
+# (Optional) The maximum number of gRPC thread pool workers
+thread_pool_workers = 10
+# (Optional) The heartbeat time in seconds
+heartbeat_time = 10
+# (Optional) The heartbeat timeout in seconds
+heartbeat_timeout = 5
 
 # (Optional)
 [server.grpc_options]
@@ -34,10 +40,8 @@ fraction_participants = 1.0
 [storage]
 # (Required) URL to the storage service to use
 endpoint = "http://minio-dev:9000"
-# (Required) Name of the bucket for storing the global model weights
-global_weights_bucket = "xain-fl-aggregated-weights"
-# (Required) Name of the bucket for retrieving the local model weights
-local_weights_bucket = "xain-fl-participants-weights"
+# (Required) Name of the bucket for storing the model weights
+bucket = "xain-fl"
 # (Required) AWS access key ID to use to authenticate to the storage service
 access_key_id = "minio"
 # (Required) AWS secret access key to use to authenticate to the storage service

diff --git a/configs/xain-fl.toml b/configs/xain-fl.toml
@@ -30,10 +30,8 @@ fraction_participants = 1.0
 [storage]
 # URL to the storage service to use
 endpoint = "http://minio-dev:9000"
-# Name of the bucket for storing the global model weights
-global_weights_bucket = "xain-fl-aggregated-weights"
-# Name of the bucket for retrieving the local model weights
-local_weights_bucket = "xain-fl-participants-weights"
+# Name of the bucket for storing the model weights
+bucket = "xain-fl"
 # AWS access key ID to use to authenticate to the storage service
 access_key_id = "minio"
 # AWS secret access key to use to authenticate to the storage service

diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
@@ -38,12 +38,9 @@ services:
         done;
         echo Connected!;
         mc config host add dev-minio http://minio-dev:9000 $${MINIO_ACCESS_KEY} $${MINIO_SECRET_KEY};
-        /usr/bin/mc mb -p dev-minio/xain-fl-temporary-weights;
-        /usr/bin/mc mb -p dev-minio/xain-fl-aggregated-weights;
-        /usr/bin/mc policy set upload dev-minio/xain-fl-temporary-weights;
-        /usr/bin/mc policy set download dev-minio/xain-fl-temporary-weights;
-        /usr/bin/mc policy set upload dev-minio/xain-fl-aggregated-weights;
-        /usr/bin/mc policy set download dev-minio/xain-fl-aggregated-weights;
+        /usr/bin/mc mb -p dev-minio/xain-fl;
+        /usr/bin/mc policy set upload dev-minio/xain-fl;
+        /usr/bin/mc policy set download dev-minio/xain-fl;
         /usr/bin/mc admin trace -v -e dev-minio;
       "
 
@@ -111,6 +108,8 @@ services:
     build:
       context: .
       dockerfile: Dockerfile.dev
+    depends_on:
+      - influxdb
     volumes:
       - /app/xain_fl.egg-info  #  don't use the local egg-info, if one exists
       - ${PWD}/xain_fl:/app/xain_fl

diff --git a/docs/network_architecture.rst b/docs/network_architecture.rst
@@ -53,6 +53,74 @@ Federated Machine Learning Flow
     b. Resume the task
 7. Once all rounds are completed the *Coordinator* can just exit
 
+Model weights distribution
+--------------------------
+
+Models can be massive (several dozen megabytes), and protobuf is
+not suited for exchanging such data. Instead, the participants and the
+coordinator use an S3 bucket to exchange their weights. The exact
+mechanism is represented by the sequence diagram below.
+
+At the beginning of a round (1), each selected participant sends a
+``StartTrainingRoundRequest``.
+
+Once it receives a response, the participant fetches the weights for
+the current global model from the S3 store (2). S3 buckets are
+key-value stores, and the key for global weights is
+``<round>/global``.
+
+Then, the participant trains. Once done, it uploads its local weights
+to the S3 bucket (3). The key is ``<round>/<participant_id>``.
+
+Finally (4), the participant sends its ``EndTrainingRoundRequest``. Before
+answering, the coordinator retrieves the local weights the participant
+has uploaded.
+
+.. code::
+
+        P                                C                      Store
+    1.  |   StartTrainingRoundRequest    |                        |
+        | -----------------------------> |                        |
+        |   StartTrainingRoundResponse   |                        |
+        | <----------------------------- |                        |
+        |                                |                        |
+        |                Get global weights (key="round/global")  |
+    2.  | ------------------------------------------------------> |
+        |                         Global weights                  |
+        | <------------------------------------------------------ |
+        |                                |                        |
+        | [train...]                     |                        |
+        |                                |                        |
+    3.  |       Set local weights (key="round/participant")       |
+        | ------------------------------------------------------> |
+        |                               Ok                        |
+        | <------------------------------------------------------ |
+        |                                |                        |
+    4.  |   EndTrainingRoundRequest      |                        |
+        | -----------------------------> | Get local weights (key="round/participant")
+        |                                | ---------------------> |
+        |                                | Local weights          |
+        |  EndTrainingRoundResponse      | <--------------------- |
+        | <----------------------------- |                        |
+
+At the end of the round, the coordinator writes the new global weights
+to the S3 bucket, using the number of the upcoming round in the key
+(see the sequence diagram below).
+
+.. code::
+
+    P                                C                      Store
+    |   EndTrainingRoundRequest      |                        |
+    | -----------------------------> | Get local weights (key="round/participant")
+    |                                | ---------------------> |
+    |                                | Local weights          |
+    |  EndTrainingRoundResponse      | <--------------------- |
+    | <----------------------------- |                        |
+    |                                |                        |
+    |                                | Set global weights (key="round+1/global")
+    |                                | ---------------------> |
+    |                                | Ok                     |
+    |                                | <--------------------- |
 
 Coordinator
 -----------
@@ -89,13 +157,13 @@ A **Rendezvous** method that allows *Participants* to register with a
 about the *Participant* in order to keep track of what the *Participant* is
 doing.
 
-A **StartTrainingRound** method that allows *Participants* to get the current global
-model as well as signaling their intent to participate in a given round.
+A **StartTrainingRound** method that allows *Participants* to retrieve
+the current global model as well as signaling their intent to
+participate in a given round.
 
 An **EndTrainingRound** method that allows *Participants* to submit their updated
 models after they finished their training task.
 
-
 In order to remain agnostic to the machine learning framework, *Participants*
 and *Coordinator* exchange models in the form of numpy arrays. How models are
 converted from a particular machine learning framework model into numpy arrays
@@ -347,27 +415,21 @@ where the request and response data are given as the following protobuf messages
 
    message StartTrainingRoundRequest {}
 
+
    message StartTrainingRoundResponse {
-       xain_proto.np.NDArray weights = 1;
-       int32 epochs = 2;
-       int32 epoch_base = 3;
+       int32 epochs = 1;
+       int32 epoch_base = 2;
    }
 
    message EndTrainingRoundRequest {
-       xain_proto.np.NDArray weights = 1;
+       string participant_id = 1;
        int32 number_samples = 2;
-       map<string, xain_proto.np.NDArray> metrics = 3;
+       string metrics = 3;
    }
 
    message EndTrainingRoundResponse {}
 
 
-Note that while most of the Python data types to be exchanged can be
-"protobuf-erized" (and back), :code:`ndarray` requires more work. Fortunately we
-have the 
-`xain_proto/np <https://github.com/xainag/xain-proto/tree/master/python/xain_proto/np>`_
-project to help with this conversion.
-
 Training Round Communication
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/mypy.ini b/mypy.ini
@@ -1,5 +1,16 @@
 [mypy]
+python_version = 3.6
 ignore_missing_imports = True
+follow_imports = skip
+warn_redundant_casts = True
+warn_unused_configs = True
 
-[mypy-xain_fl.coordinator.legacy_participant]
-ignore_errors = True
+[mypy-xain_fl.*]
+disallow_any_explicit = False
+disallow_incomplete_defs = True
+disallow_untyped_defs = True
+disallow_untyped_calls = False
+check_untyped_defs = True
+warn_return_any = True
+warn_unused_ignores = True
+no_implicit_optional = True
diff --git a/setup.py b/setup.py
@@ -27,7 +27,7 @@
     "numpy==1.15",  # BSD
     "grpcio==1.23",  # Apache License 2.0
     "structlog==19.2.0",  # Apache License 2.0
-    "xain-proto==0.5.0",  # Apache License 2.0
+    "xain-proto==0.6.0",  # Apache License 2.0
     "boto3==1.10.48",  # Apache License 2.0
     "toml==0.10.0",  # MIT
     "schema~=0.7",  # MIT
@@ -38,7 +38,7 @@
 
 dev_require = [
     "black==19.10b0",  # MIT
-    "mypy==0.760",  # MIT License
+    "mypy==0.761",  # MIT License
     "pylint==2.3.1",  # GPL
     "astroid==2.2.5",  # LGPL
     "isort==4.3.21",  # MIT
@@ -52,7 +52,8 @@
     "pytest==5.3.2",  # MIT license
     "pytest-cov==2.8.1",  # MIT
     "pytest-watch==4.2.0",  # MIT
-    "xain-sdk==0.5.0",  # Apache License 2.0
+    "pytest-mock==2.0.0",  # MIT
+    "xain-sdk==0.6.0",  # Apache License 2.0
 ]
 
 docs_require = [