Neuron SDK Release 2.19.0 (#919)

Neuron SDK Release 2.19.0 - Release Notes
aws-neuron · Jul 4, 2024 · 215b421 · 215b421
1 parent 78169c6
commit 215b421
Show file tree

Hide file tree

Showing 103 changed files with 2,682 additions and 596 deletions.
diff --git a/CODEOWNERS b/CODEOWNERS
@@ -8,18 +8,18 @@
 # review when someone opens a pull request.
 # *       @global-owner1 @global-owner2
 
-*       @aws-maens @aws-mesharma @rgrandhiamzn
+*       @aws-maens @micwade-aws @musunita @aws-sadaf @natemail-aws @rgrandhiamzn @eshalakhotia @jluntamazon @jeffhataws @aws-rhsoln @hannanjgaws @aws-trsharma @PrashantSaraf @shadis @aws-donkrets @aws-singhada @gsnaws @awsjoshir @sidjoshiaws @pinak-p @vikas-paliwal-aws
 
-src/examples/mxnet/ @aws-rhsoln  @aws-sadaf @aws-maens
-neuron-guide/neuron-frameworks/mxnet-neuron/  @aws-rhsoln @aws-maens
-neuron-guide/neuron-frameworks/mxnet-neuron/tutorials/ @kct22aws  @musunita @aws-rhsoln @aws-maens 
+src/examples/mxnet/ @aws-rhsoln  @aws-sadaf @aws-maens @vikas-paliwal-aws @rgrandhiamzn @eshalakhotia
+neuron-guide/neuron-frameworks/mxnet-neuron/  @aws-rhsoln @aws-maens @vikas-paliwal-aws @rgrandhiamzn @eshalakhotia
+neuron-guide/neuron-frameworks/mxnet-neuron/tutorials/ @musunita @aws-rhsoln @aws-maens @vikas-paliwal-aws @rgrandhiamzn @eshalakhotia
 
-src/examples/tensorflow/  @awshaichen  @aws-sadaf @aws-maens
-neuron-guide/neuron-frameworks/tensorflow-neuron/ @awshaichen @aws-maens
-neuron-guide/neuron-frameworks/tensorflow-neuron/tutorials/ @kct22aws  @musunita @awshaichen @aws-maens 
+src/examples/tensorflow/  @awshaichen  @aws-sadaf @aws-maens @vikas-paliwal-aws @rgrandhiamzn @eshalakhotia
+neuron-guide/neuron-frameworks/tensorflow-neuron/ @awshaichen @aws-maens @vikas-paliwal-aws @rgrandhiamzn @eshalakhotia
+neuron-guide/neuron-frameworks/tensorflow-neuron/tutorials/ @musunita @awshaichen @aws-maens @vikas-paliwal-aws @rgrandhiamzn @eshalakhotia
 
 
-src/examples/pytorch/ @jluntamazon @aws-sadaf @aws-maens
-neuron-guide/neuron-frameworks/pytorch-neuron/  @jluntamazon @aws-maens
-neuron-guide/neuron-frameworks/pytorch-neuron/tutorials/ @kct22aws  @musunita @jluntamazon @aws-maens 
+src/examples/pytorch/ @jluntamazon @aws-sadaf @aws-maens @vikas-paliwal-aws @rgrandhiamzn @eshalakhotia
+neuron-guide/neuron-frameworks/pytorch-neuron/  @jluntamazon @aws-maens @vikas-paliwal-aws @rgrandhiamzn @eshalakhotia
+neuron-guide/neuron-frameworks/pytorch-neuron/tutorials/ @musunita @jluntamazon @aws-maens @vikas-paliwal-aws @rgrandhiamzn @eshalakhotia
 
diff --git a/conf.py b/conf.py
@@ -81,6 +81,7 @@
     'sphinx.ext.autodoc',
     'local_documenter',
     'archive',
+    "sphinx_copybutton",
 ]
 
 
@@ -97,6 +98,10 @@
 exclude_patterns = ['_build','**.ipynb_checkpoints','.venv']
 html_extra_path = ['static']
 
+# remove bash/python/ipython/jupyter prompts and continuations
+copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: "
+copybutton_prompt_is_regexp = True
+
 # nbsphinx_allow_errors = True
 nbsphinx_execute = 'never'
 
@@ -141,9 +146,7 @@
 
 #top_banner_message="<span>&#9888;</span><a class='reference internal' style='color:white;' href='https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/setup-troubleshooting.html#gpg-key-update'>  Neuron repository GPG key for Ubuntu installation has expired, see instructions how to update! </a>"
 
-
-top_banner_message="Neuron 2.18.2 is released! check <a class='reference internal' style='color:white;' href='https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/index.html#latest-neuron-release'> What's New  </a> and <a class='reference internal' style='color:white;' href='https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/announcements/index.html'> Announcements  </a>"
-
+top_banner_message="Neuron 2.19.0 is released! check <a class='reference internal' style='color:white;' href='https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/index.html#latest-neuron-release'> What's New  </a> and <a class='reference internal' style='color:white;' href='https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/announcements/index.html'> Announcements  </a>"
 
 html_theme = "sphinx_book_theme"
 html_theme_options = {
@@ -234,6 +237,6 @@
 ,r'https://github.com/awslabs/multi-model-server/blob/master/docs/management_api.md',r'https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/dp_bert_hf_pretrain/run_dp_bert_large_hf_pretrain_bf16_s128.sh',r' https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist.py',r'https://github.com/pytorch/xla/blob/v1.10.0/TROUBLESHOOTING.md'
 ,r'https://github.com/tensorflow/docs/blob/master/site/en/r1/guide/saved_model.md',r'https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/g3doc/index.md',r'https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist.py',r'https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/transformers-neuronx/inference/meta-llama-2-13b-sampling.ipynb'
 ,r'https://github.com/aws-neuron/aws-neuron-sdk/blob/master/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.ipynb',r'https://github.com/aws-neuron/aws-neuron-parallelcluster-samples/blob/master/examples/jobs/neuronx-nemo-megatron-llamav2-job.md',r'https://github.com/pytorch/PiPPy/blob/main/pippy/IR.py#L697', r'https://github.com/pytorch/pytorch/blob/main/torch/fx/_symbolic_trace.py#L241', r'https://github.com/pytorch/xla/blob/master/torch_xla/utils/checkpoint.py#L129', r'https://github.com/aws-neuron/neuronx-distributed/blob/main/src/neuronx_distributed/parallel_layers/layer_norm.py#L32', r'https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain.py#L273C1-L289C55'
-,r'https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/setup/pytorch-install.html#pytorch-neuronx-install',r'https://github.com/google-research/bert#user-content-pre-trained-models',r'https://github.com/google-research/bert#user-content-sentence-and-sentence-pair-classification-tasks', r'https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-retirement.html', r'https://repost.aws/knowledge-center/eventbridge-notification-scheduled-events', r'https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/modeling_gpt_neox_nxd.py',r'https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain.py']
+,r'https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/setup/pytorch-install.html#pytorch-neuronx-install',r'https://github.com/google-research/bert#user-content-pre-trained-models',r'https://github.com/google-research/bert#user-content-sentence-and-sentence-pair-classification-tasks', r'https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-retirement.html', r'https://repost.aws/knowledge-center/eventbridge-notification-scheduled-events', r'https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/modeling_gpt_neox_nxd.py',r'https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain.py',r'https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/transformers-neuronx/inference/llama-3-8b-32k-sampling.ipynb']
 linkcheck_exclude_documents = [r'src/examples/.*', 'general/announcements/neuron1.x/announcements', r'release-notes/.*',r'containers/.*',r'general/.*']
 nitpicky = True
diff --git a/containers/getting-started.txt b/containers/getting-started.txt
@@ -60,7 +60,7 @@
                sudo yum install -y docker.io
                sudo usermod -aG docker $USER
 
-               Logout and log back in to refresh membership.
+            Log out and log back in to refresh membership.
 
       .. dropdown:: Verify Docker
             :class-title: sphinx-design-class-title-small
@@ -97,32 +97,32 @@
                https://docs.docker.com/get-started/
 
       .. dropdown:: Verify Neuron Component
-            :class-title: sphinx-design-class-title-small
-            :class-body: sphinx-design-class-body-small
-            :animate: fade-in
+           :class-title: sphinx-design-class-title-small
+           :class-body: sphinx-design-class-body-small
+           :animate: fade-in
 
-            Once the environment is setup, a container can be started with
-            --device=/dev/neuron# to specify desired set of Inferentia/Trainium devices to be
-            exposed to the container. To find out the available neuron devices on
-            your instance, use the command ``ls /dev/neuron*``.
+           Once the environment is set up, a container can be started with
+           ``--device=/dev/neuron#`` to specify the desired set of Inferentia/Trainium devices to be
+           exposed to the container. To find out the available neuron devices on
+           your instance, use the command ``ls /dev/neuron*``.
 
-            When running neuron-ls inside a container, you will only see the set of
-            exposed Trainiums. For example:
+           When running neuron-ls inside a container, you will only see the set of
+           exposed Trainiums. For example:
 
-            .. code:: bash
+           .. code:: bash
 
-               docker run --device=/dev/neuron0 neuron-test neuron-ls
+             docker run --device=/dev/neuron0 neuron-test neuron-ls
 
-               Would produce the following output in trn1.32xlarge:
+           Would produce the following output in trn1.32xlarge:
 
-               ::
+           ::
 
-                  +--------+--------+--------+---------+
-                  | NEURON | NEURON | NEURON |   PCI   |
-                  | DEVICE | CORES  | MEMORY |   BDF   |
-                  +--------+--------+--------+---------+
-                  | 0      | 2      | 32 GB  | 10:1c.0 |
-                  +--------+--------+--------+---------+
+             +--------+--------+--------+---------+
+             | NEURON | NEURON | NEURON |   PCI   |
+             | DEVICE | CORES  | MEMORY |   BDF   |
+             +--------+--------+--------+---------+
+             | 0      | 2      | 32 GB  | 10:1c.0 |
+             +--------+--------+--------+---------+
 
       .. dropdown:: Build and Run Docker Image
             :class-title: sphinx-design-class-title-small
@@ -146,8 +146,7 @@
             :class-title: sphinx-design-class-title-small
             :class-body: sphinx-design-class-body-small
             :animate: fade-in
-
-            .. include:: /general/setup/install-templates/launch-inf1.txt
+
+            .. include:: /general/setup/install-templates/launch-inf1.txt
 
       .. dropdown:: Install Drivers
             :class-title: sphinx-design-class-title-small
@@ -195,7 +194,7 @@
                sudo yum install -y docker.io
                sudo usermod -aG docker $USER
 
-               Logout and log back in to refresh membership.
+            Log out and log back in to refresh membership.
 
       .. dropdown:: Verify Docker
             :class-title: sphinx-design-class-title-small
@@ -233,32 +232,32 @@
 
 
       .. dropdown:: Verify Neuron Component
-            :class-title: sphinx-design-class-title-small
-            :class-body: sphinx-design-class-body-small
-            :animate: fade-in
+           :class-title: sphinx-design-class-title-small
+           :class-body: sphinx-design-class-body-small
+           :animate: fade-in
 
-            Once the environment is setup, a container can be started with
-            --device=/dev/neuron# to specify desired set of Inferentia/Trainium devices to be
-            exposed to the container. To find out the available neuron devices on
-            your instance, use the command ``ls /dev/neuron*``.
+           Once the environment is set up, a container can be started with
+           ``--device=/dev/neuron#`` to specify the desired set of Inferentia/Trainium devices to be
+           exposed to the container. To find out the available neuron devices on
+           your instance, use the command ``ls /dev/neuron*``.
 
-            When running neuron-ls inside a container, you will only see the set of
-            exposed Inferentias. For example:
+           When running neuron-ls inside a container, you will only see the set of
+           exposed Inferentias. For example:
 
-            .. code:: bash
+           .. code:: bash
 
-               docker run --device=/dev/neuron0 neuron-test neuron-ls
+             docker run --device=/dev/neuron0 neuron-test neuron-ls
 
-               Would produce the following output in inf1.xlarge:
+           Would produce the following output in inf1.xlarge:
 
-               ::
+           ::
 
-                  +--------------+---------+--------+-----------+-----------+------+------+
-                  |   PCI BDF    | LOGICAL | NEURON |  MEMORY   |  MEMORY   | EAST | WEST |
-                  |              |   ID    | CORES  | CHANNEL 0 | CHANNEL 1 |      |      |
-                  +--------------+---------+--------+-----------+-----------+------+------+
-                  | 0000:00:1f.0 |       0 |      4 | 4096 MB   | 4096 MB   |    0 |    0 |
-                  +--------------+---------+--------+-----------+-----------+------+------+
+             +--------------+---------+--------+-----------+-----------+------+------+
+             |   PCI BDF    | LOGICAL | NEURON |  MEMORY   |  MEMORY   | EAST | WEST |
+             |              |   ID    | CORES  | CHANNEL 0 | CHANNEL 1 |      |      |
+             +--------------+---------+--------+-----------+-----------+------+------+
+             | 0000:00:1f.0 |       0 |      4 | 4096 MB   | 4096 MB   |    0 |    0 |
+             +--------------+---------+--------+-----------+-----------+------+------+
 
       .. dropdown::  Run Tutorial
             :class-title: sphinx-design-class-title-small

diff --git a/containers/kubernetes-getting-started.rst b/containers/kubernetes-getting-started.rst
@@ -1,5 +1,10 @@
 Containers - Kubernetes - Getting Started
 =========================================
 
+The Neuron device plugin is a DaemonSet run on all Inferentia and Trainium nodes that enables the containers in your Kubernetes cluster to request and use Neuron cores or devices.
+The Neuron scheduler extension is required for containers in your Kubernetes cluster that request multiple Neuron resources. 
+It helps find optimal sets of Neuron resources to minimize inter-resource communication costs. 
+Below are directions for installing and using the Neuron device plugin and scheduler extension.
+
 
 .. include:: /containers/kubernetes-getting-started.txt
diff --git a/containers/kubernetes-getting-started.txt b/containers/kubernetes-getting-started.txt
@@ -5,6 +5,13 @@
 
         .. include:: /containers/tutorials/k8s-prerequisite.rst
 
+.. dropdown:: Prerequisite for Neuron Problem Detector Plugin
+        :class-title: sphinx-design-class-title-small
+        :class-body: sphinx-design-class-body-small
+        :animate: fade-in
+
+        .. include:: /containers/tutorials/k8s-neuron-problem-detector-and-recovery-irsa.rst
+
 .. dropdown:: Deploy Neuron Device Plugin
         :class-title: sphinx-design-class-title-small
         :class-body: sphinx-design-class-body-small
@@ -17,4 +24,18 @@
         :class-body: sphinx-design-class-body-small
         :animate: fade-in
 
-        .. include:: /containers/tutorials/k8s-neuron-scheduler.rst
+        .. include:: /containers/tutorials/k8s-neuron-scheduler.rst
+
+.. dropdown:: Deploy Neuron Problem Detector And Recovery
+        :class-title: sphinx-design-class-title-small
+        :class-body: sphinx-design-class-body-small
+        :animate: fade-in
+
+        .. include:: /containers/tutorials/k8s-neuron-problem-detector-and-recovery.rst
+
+.. dropdown:: Deploy Neuron Monitor Daemonset
+        :class-title: sphinx-design-class-title-small
+        :class-body: sphinx-design-class-body-small
+        :animate: fade-in
+
+        .. include:: /containers/tutorials/k8s-neuron-monitor.rst
diff --git a/containers/tutorials/inference/tutorial-infer.rst b/containers/tutorials/inference/tutorial-infer.rst
@@ -23,7 +23,6 @@ Setup Environment
 -----------------
 
 1. Launch an Inf1 Instance
-	.. include:: /general/setup/install-templates/launch-inf1.txt
 
 2. Set up docker environment according to :ref:`tutorial-docker-env-setup`
 

diff --git a/containers/tutorials/k8s-neuron-device-plugin.rst b/containers/tutorials/k8s-neuron-device-plugin.rst
@@ -1,6 +1,6 @@
 .. _k8s-neuron-device-plugin:
 
-Neuron device plugin exposes Neuron cores & devices to kubernetes as a resource. aws.amazon.com/neuroncore, aws.amazon.com/neurondevice, aws.amazon.com/neuron are the resources that the neuron device plugin registers with the kubernetes. aws.amazon.com/neuroncore is used for allocating neuron cores to the container. aws.amazon.com/neurondevice is used for allocating neuron devices to the container. When neurondevice is used all the cores belonging to the device will be allocated to container. aws.amazon.com/neuron also allocates neurondevices and this exists just to be backward compatible with already existing installations. aws.amazon.com/neurondevice is the recommended resource for allocating devices to the container.
+Neuron device plugin exposes Neuron cores & devices to kubernetes as a resource. aws.amazon.com/neuroncore, aws.amazon.com/neurondevice, and aws.amazon.com/neuron are the resources that the neuron device plugin registers with Kubernetes. aws.amazon.com/neuroncore is used for allocating neuron cores to the container. aws.amazon.com/neurondevice is used for allocating neuron devices to the container. When neurondevice is used, all the cores belonging to the device will be allocated to the container. aws.amazon.com/neuron also allocates neuron devices. The resource name 'neuron' is recommended for allocating devices to the container. Neuron will be ending support for the resource name 'neurondevice' in a future release. Please check announcements for updates.
 
 * Make sure the :ref:`prerequisites <k8s-prerequisite>` are satisfied
 * Download the neuron device plugin yaml file. :download:`k8s-neuron-device-plugin.yml </src/k8/k8s-neuron-device-plugin.yml>`
@@ -49,4 +49,4 @@ Neuron device plugin exposes Neuron cores & devices to kubernetes as a resource.
 
         NAME                                          NeuronDevice
         ip-192-168-65-41.us-west-2.compute.internal   16
-        ip-192-168-87-81.us-west-2.compute.internal   16
+        ip-192-168-87-81.us-west-2.compute.internal   16
diff --git a/containers/tutorials/k8s-neuron-monitor.rst b/containers/tutorials/k8s-neuron-monitor.rst
@@ -0,0 +1,59 @@
+.. _k8s-neuron-monitor:
+
+ Neuron monitor Container
+ ========================
+
+ Neuron monitor is the primary observability tool for Neuron devices. For details on Neuron monitor, please refer to the `neuron monitor guide <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-monitor-user-guide.html>`_. This tutorial describes deploying Neuron monitor as a daemonset on a Kubernetes cluster.
+
+
+* Download the neuron monitor YAML file. :download:`k8s-neuron-monitor-daemonset.yml </src/k8/k8s-neuron-monitor-daemonset.yml>`
+* Apply the Neuron monitor yaml to create a daemonset on the cluster with the following command
+
+    .. code:: bash
+
+        kubectl apply -f k8s-neuron-monitor-daemonset.yml
+    
+* Verify that neuron monitor daemonset is running
+
+    .. code:: bash
+
+        kubectl get ds neuron-monitor --namespace neuron-monitor
+
+    Expected result (with 2 nodes in cluster):
+
+    .. code:: bash
+
+        NAME                             DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR   AGE
+        neuron-monitor                     2         2         2       2            2           <none>          27h
+
+
+* Get the neuron-monitor pod names
+
+    .. code:: bash
+
+        kubectl get pods --namespace neuron-monitor
+
+    Expected result
+
+    .. code:: bash 
+
+        NAME                   READY   STATUS    RESTARTS   AGE
+        neuron-monitor-slsxf   1/1     Running   0          17m
+        neuron-monitor-wc4f5   1/1     Running   0          17m
+    
+
+* Verify the prometheus endpoint is available
+
+    .. code:: bash
+
+        kubectl exec --namespace neuron-monitor neuron-monitor-wc4f5 -- wget -q --output-document - http://127.0.0.1:8000
+
+    Expected result
+
+    .. code:: bash
+
+        # HELP python_gc_objects_collected_total Objects collected during gc
+        # TYPE python_gc_objects_collected_total counter
+        python_gc_objects_collected_total{generation="0"} 362.0
+        python_gc_objects_collected_total{generation="1"} 0.0
+        python_gc_objects_collected_total{generation="2"} 0.0
+        # HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC
+        # TYPE python_gc_objects_uncollectable_total counter
-Original file line number
+Diff line change
@@ Expand Up / @@ -23,7 +23,6 @@ Setup Environment @@
     -----------------
 . Launch an Inf1 Instance
-    	.. include:: /general/setup/install-templates/launch-inf1.txt
 . Set up docker environment according to :ref:`tutorial-docker-env-setup`
@@ Expand Down @@