From 400efb1f653bba6ff5e32cc06c2cb7daf3eb2fd8 Mon Sep 17 00:00:00 2001 From: Saiteja Samudrala Date: Wed, 30 Oct 2024 16:32:33 -0700 Subject: [PATCH 1/4] redirect duplicate save_load tutorials to the main save_load tutorial --- .../recipes/save_load_across_devices.py | 181 ------------------ .../recipes/save_load_across_devices.rst | 0 ...saving_and_loading_a_general_checkpoint.py | 155 --------------- ...aving_and_loading_a_general_checkpoint.rst | 10 + ...saving_and_loading_models_for_inference.py | 168 ---------------- ...aving_and_loading_models_for_inference.rst | 0 .../saving_multiple_models_in_one_file.py | 154 --------------- .../saving_multiple_models_in_one_file.rst | 10 + 8 files changed, 20 insertions(+), 658 deletions(-) delete mode 100644 recipes_source/recipes/save_load_across_devices.py create mode 100644 recipes_source/recipes/save_load_across_devices.rst delete mode 100644 recipes_source/recipes/saving_and_loading_a_general_checkpoint.py create mode 100644 recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst delete mode 100644 recipes_source/recipes/saving_and_loading_models_for_inference.py create mode 100644 recipes_source/recipes/saving_and_loading_models_for_inference.rst delete mode 100644 recipes_source/recipes/saving_multiple_models_in_one_file.py create mode 100644 recipes_source/recipes/saving_multiple_models_in_one_file.rst diff --git a/recipes_source/recipes/save_load_across_devices.py b/recipes_source/recipes/save_load_across_devices.py deleted file mode 100644 index c59af8821e..0000000000 --- a/recipes_source/recipes/save_load_across_devices.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -Saving and loading models across devices in PyTorch -=================================================== - -There may be instances where you want to save and load your neural -networks across different devices. - -Introduction ------------- - -Saving and loading models across devices is relatively straightforward -using PyTorch. 
In this recipe, we will experiment with saving and -loading models across CPUs and GPUs. - -Setup ------ - -In order for every code block to run properly in this recipe, you must -first change the runtime to “GPU” or higher. Once you do, we need to -install ``torch`` if it isn’t already available. - -.. code-block:: sh - - pip install torch - -""" - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Save on a GPU, load on a CPU -# 4. Save on a GPU, load on a GPU -# 5. Save on a CPU, load on a GPU -# 6. Saving and loading ``DataParallel`` models -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. 
Save on GPU, Load on CPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a CPU that was trained with a GPU, pass -# ``torch.device('cpu')`` to the ``map_location`` argument in the -# ``torch.load()`` function. -# - -# Specify a path to save to -PATH = "model.pt" - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device('cpu') -model = Net() -model.load_state_dict(torch.load(PATH, map_location=device, weights_only=True)) - - -###################################################################### -# In this case, the storages underlying the tensors are dynamically -# remapped to the CPU device using the ``map_location`` argument. -# -# 4. Save on GPU, Load on GPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a GPU that was trained and saved on GPU, simply -# convert the initialized model to a CUDA optimized model using -# ``model.to(torch.device('cuda'))``. -# -# Be sure to use the ``.to(torch.device('cuda'))`` function on all model -# inputs to prepare the data for the model. -# - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device("cuda") -model = Net() -model.load_state_dict(torch.load(PATH)) -model.to(device) - - -###################################################################### -# Note that calling ``my_tensor.to(device)`` returns a new copy of -# ``my_tensor`` on GPU. It does NOT overwrite ``my_tensor``. Therefore, -# remember to manually overwrite tensors: -# ``my_tensor = my_tensor.to(torch.device('cuda'))``. -# -# 5. Save on CPU, Load on GPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a GPU that was trained and saved on CPU, set the -# ``map_location`` argument in the ``torch.load()`` function to -# ``cuda:device_id``. This loads the model to a given GPU device. -# -# Be sure to call ``model.to(torch.device('cuda'))`` to convert the -# model’s parameter tensors to CUDA tensors. 
-# -# Finally, also be sure to use the ``.to(torch.device('cuda'))`` function -# on all model inputs to prepare the data for the CUDA optimized model. -# - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device("cuda") -model = Net() -# Choose whatever GPU device number you want -model.load_state_dict(torch.load(PATH, map_location="cuda:0")) -# Make sure to call input = input.to(device) on any input tensors that you feed to the model -model.to(device) - - -###################################################################### -# 6. Saving ``torch.nn.DataParallel`` Models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# ``torch.nn.DataParallel`` is a model wrapper that enables parallel GPU -# utilization. -# -# To save a ``DataParallel`` model generically, save the -# ``model.module.state_dict()``. This way, you have the flexibility to -# load the model any way you want to any device you want. -# - -# Save -torch.save(net.module.state_dict(), PATH) - -# Load to whatever device you want - - -###################################################################### -# Congratulations! You have successfully saved and loaded models across -# devices in PyTorch. -# diff --git a/recipes_source/recipes/save_load_across_devices.rst b/recipes_source/recipes/save_load_across_devices.rst new file mode 100644 index 0000000000..e69de29bb2 diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py deleted file mode 100644 index 8c773a1490..0000000000 --- a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Saving and loading a general checkpoint in PyTorch -================================================== -Saving and loading a general checkpoint model for inference or -resuming training can be helpful for picking up where you last left off. 
-When saving a general checkpoint, you must save more than just the -model’s state_dict. It is important to also save the optimizer’s -state_dict, as this contains buffers and parameters that are updated as -the model trains. Other items that you may want to save are the epoch -you left off on, the latest recorded training loss, external -``torch.nn.Embedding`` layers, and more, based on your own algorithm. - -Introduction ------------- -To save multiple checkpoints, you must organize them in a dictionary and -use ``torch.save()`` to serialize the dictionary. A common PyTorch -convention is to save these checkpoints using the ``.tar`` file -extension. To load the items, first initialize the model and optimizer, -then load the dictionary locally using torch.load(). From here, you can -easily access the saved items by simply querying the dictionary as you -would expect. - -In this recipe, we will explore how to save and load multiple -checkpoints. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - -:: - - pip install torch - - -""" - - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save the general checkpoint -# 5. Load the general checkpoint -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. 
-# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum. -# - -optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save the general checkpoint -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Collect all relevant information and build your dictionary. -# - -# Additional information -EPOCH = 5 -PATH = "model.pt" -LOSS = 0.4 - -torch.save({ - 'epoch': EPOCH, - 'model_state_dict': net.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), - 'loss': LOSS, - }, PATH) - - -###################################################################### -# 5. Load the general checkpoint -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Remember to first initialize the model and optimizer, then load the -# dictionary locally. 
-# - -model = Net() -optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) - -checkpoint = torch.load(PATH, weights_only=True) -model.load_state_dict(checkpoint['model_state_dict']) -optimizer.load_state_dict(checkpoint['optimizer_state_dict']) -epoch = checkpoint['epoch'] -loss = checkpoint['loss'] - -model.eval() -# - or - -model.train() - - -###################################################################### -# You must call ``model.eval()`` to set dropout and batch normalization -# layers to evaluation mode before running inference. Failing to do this -# will yield inconsistent inference results. -# -# If you wish to resuming training, call ``model.train()`` to ensure these -# layers are in training mode. -# -# Congratulations! You have successfully saved and loaded a general -# checkpoint for inference and/or resuming training in PyTorch. -# diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst new file mode 100644 index 0000000000..b868c26a6c --- /dev/null +++ b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst @@ -0,0 +1,10 @@ +Saving And Loading A General Checkpoint +======================================= + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + <meta http-equiv="Refresh" content="3; url='https://pytorch.org/tutorials/beginner/saving_loading_models.html'" /> diff --git a/recipes_source/recipes/saving_and_loading_models_for_inference.py b/recipes_source/recipes/saving_and_loading_models_for_inference.py deleted file mode 100644 index 7adce2a90b..0000000000 --- a/recipes_source/recipes/saving_and_loading_models_for_inference.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Saving and loading models for inference in PyTorch -================================================== -There are two approaches for saving and loading models for inference in -PyTorch. 
The first is saving and loading the ``state_dict``, and the -second is saving and loading the entire model. - -Introduction ------------- -Saving the model’s ``state_dict`` with the ``torch.save()`` function -will give you the most flexibility for restoring the model later. This -is the recommended method for saving models, because it is only really -necessary to save the trained model’s learned parameters. -When saving and loading an entire model, you save the entire module -using Python’s -`pickle `__ module. Using -this approach yields the most intuitive syntax and involves the least -amount of code. The disadvantage of this approach is that the serialized -data is bound to the specific classes and the exact directory structure -used when the model is saved. The reason for this is because pickle does -not save the model class itself. Rather, it saves a path to the file -containing the class, which is used during load time. Because of this, -your code can break in various ways when used in other projects or after -refactors. -In this recipe, we will explore both ways on how to save and load models -for inference. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - - -:: - - pip install torch - - -""" - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save and load the model via ``state_dict`` -# 5. Save and load the entire model -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. 
Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum. -# - -optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save and load the model via ``state_dict`` -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Let’s save and load our model using just ``state_dict``. -# - -# Specify a path -PATH = "state_dict_model.pt" - -# Save -torch.save(net.state_dict(), PATH) - -# Load -model = Net() -model.load_state_dict(torch.load(PATH, weights_only=True)) -model.eval() - - -###################################################################### -# A common PyTorch convention is to save models using either a ``.pt`` or -# ``.pth`` file extension. -# -# Notice that the ``load_state_dict()`` function takes a dictionary -# object, NOT a path to a saved object. This means that you must -# deserialize the saved state_dict before you pass it to the -# ``load_state_dict()`` function. For example, you CANNOT load using -# ``model.load_state_dict(PATH)``. 
-# -# Remember too, that you must call ``model.eval()`` to set dropout and -# batch normalization layers to evaluation mode before running inference. -# Failing to do this will yield inconsistent inference results. -# -# 5. Save and load entire model -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now let’s try the same thing with the entire model. -# - -# Specify a path -PATH = "entire_model.pt" - -# Save -torch.save(net, PATH) - -# Load -model = torch.load(PATH) -model.eval() - - -###################################################################### -# Again here, remember that you must call ``model.eval()`` to set dropout and -# batch normalization layers to evaluation mode before running inference. -# -# Congratulations! You have successfully saved and load models for -# inference in PyTorch. -# -# Learn More -# ---------- -# -# Take a look at these other recipes to continue your learning: -# -# - `Saving and loading a general checkpoint in PyTorch `__ -# - `Saving and loading multiple models in one file using PyTorch `__ diff --git a/recipes_source/recipes/saving_and_loading_models_for_inference.rst b/recipes_source/recipes/saving_and_loading_models_for_inference.rst new file mode 100644 index 0000000000..e69de29bb2 diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.py b/recipes_source/recipes/saving_multiple_models_in_one_file.py deleted file mode 100644 index e938be03b4..0000000000 --- a/recipes_source/recipes/saving_multiple_models_in_one_file.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Saving and loading multiple models in one file using PyTorch -============================================================ -Saving and loading multiple models can be helpful for reusing models -that you have previously trained. 
- -Introduction ------------- -When saving a model comprised of multiple ``torch.nn.Modules``, such as -a GAN, a sequence-to-sequence model, or an ensemble of models, you must -save a dictionary of each model’s state_dict and corresponding -optimizer. You can also save any other items that may aid you in -resuming training by simply appending them to the dictionary. -To load the models, first initialize the models and optimizers, then -load the dictionary locally using ``torch.load()``. From here, you can -easily access the saved items by simply querying the dictionary as you -would expect. -In this recipe, we will demonstrate how to save multiple models to one -file using PyTorch. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - -.. code-block:: sh - - pip install torch - -""" - - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save multiple models -# 5. Load multiple models -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. Build -# two variables for the models to eventually save. 
-# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -netA = Net() -netB = Net() - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum to build an optimizer for each model we -# created. -# - -optimizerA = optim.SGD(netA.parameters(), lr=0.001, momentum=0.9) -optimizerB = optim.SGD(netB.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save multiple models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Collect all relevant information and build your dictionary. -# - -# Specify a path to save to -PATH = "model.pt" - -torch.save({ - 'modelA_state_dict': netA.state_dict(), - 'modelB_state_dict': netB.state_dict(), - 'optimizerA_state_dict': optimizerA.state_dict(), - 'optimizerB_state_dict': optimizerB.state_dict(), - }, PATH) - - -###################################################################### -# 4. Load multiple models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Remember to first initialize the models and optimizers, then load the -# dictionary locally. 
-# - -modelA = Net() -modelB = Net() -optimModelA = optim.SGD(modelA.parameters(), lr=0.001, momentum=0.9) -optimModelB = optim.SGD(modelB.parameters(), lr=0.001, momentum=0.9) - -checkpoint = torch.load(PATH, weights_only=True) -modelA.load_state_dict(checkpoint['modelA_state_dict']) -modelB.load_state_dict(checkpoint['modelB_state_dict']) -optimizerA.load_state_dict(checkpoint['optimizerA_state_dict']) -optimizerB.load_state_dict(checkpoint['optimizerB_state_dict']) - -modelA.eval() -modelB.eval() -# - or - -modelA.train() -modelB.train() - - -###################################################################### -# You must call ``model.eval()`` to set dropout and batch normalization -# layers to evaluation mode before running inference. Failing to do this -# will yield inconsistent inference results. -# -# If you wish to resuming training, call ``model.train()`` to ensure these -# layers are in training mode. -# -# Congratulations! You have successfully saved and loaded multiple models -# in PyTorch. -# diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.rst b/recipes_source/recipes/saving_multiple_models_in_one_file.rst new file mode 100644 index 0000000000..33040e6c87 --- /dev/null +++ b/recipes_source/recipes/saving_multiple_models_in_one_file.rst @@ -0,0 +1,10 @@ +Saving Multiple Models In One File +================================== + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. 
raw:: html + + <meta http-equiv="Refresh" content="3; url='https://pytorch.org/tutorials/beginner/saving_loading_models.html'" /> From c5c074bcd7dd41cb99f2af6670dd3696c43d7c4d Mon Sep 17 00:00:00 2001 From: Saiteja Samudrala Date: Wed, 30 Oct 2024 16:32:33 -0700 Subject: [PATCH 2/4] redirect duplicate save_load tutorials to the main save_load tutorial --- .../recipes/save_load_across_devices.py | 181 ------------------ .../recipes/save_load_across_devices.rst | 10 + ...saving_and_loading_a_general_checkpoint.py | 155 --------------- ...aving_and_loading_a_general_checkpoint.rst | 10 + ...saving_and_loading_models_for_inference.py | 168 ---------------- ...aving_and_loading_models_for_inference.rst | 10 + .../saving_multiple_models_in_one_file.py | 154 --------------- .../saving_multiple_models_in_one_file.rst | 10 + 8 files changed, 40 insertions(+), 658 deletions(-) delete mode 100644 recipes_source/recipes/save_load_across_devices.py create mode 100644 recipes_source/recipes/save_load_across_devices.rst delete mode 100644 recipes_source/recipes/saving_and_loading_a_general_checkpoint.py create mode 100644 recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst delete mode 100644 recipes_source/recipes/saving_and_loading_models_for_inference.py create mode 100644 recipes_source/recipes/saving_and_loading_models_for_inference.rst delete mode 100644 recipes_source/recipes/saving_multiple_models_in_one_file.py create mode 100644 recipes_source/recipes/saving_multiple_models_in_one_file.rst diff --git a/recipes_source/recipes/save_load_across_devices.py b/recipes_source/recipes/save_load_across_devices.py deleted file mode 100644 index c59af8821e..0000000000 --- a/recipes_source/recipes/save_load_across_devices.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -Saving and loading models across devices in PyTorch -=================================================== - -There may be instances where you want to save and load your neural -networks across different devices. 
- -Introduction ------------- - -Saving and loading models across devices is relatively straightforward -using PyTorch. In this recipe, we will experiment with saving and -loading models across CPUs and GPUs. - -Setup ------ - -In order for every code block to run properly in this recipe, you must -first change the runtime to “GPU” or higher. Once you do, we need to -install ``torch`` if it isn’t already available. - -.. code-block:: sh - - pip install torch - -""" - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Save on a GPU, load on a CPU -# 4. Save on a GPU, load on a GPU -# 5. Save on a CPU, load on a GPU -# 6. Saving and loading ``DataParallel`` models -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. 
-# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Save on GPU, Load on CPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a CPU that was trained with a GPU, pass -# ``torch.device('cpu')`` to the ``map_location`` argument in the -# ``torch.load()`` function. -# - -# Specify a path to save to -PATH = "model.pt" - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device('cpu') -model = Net() -model.load_state_dict(torch.load(PATH, map_location=device, weights_only=True)) - - -###################################################################### -# In this case, the storages underlying the tensors are dynamically -# remapped to the CPU device using the ``map_location`` argument. -# -# 4. Save on GPU, Load on GPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a GPU that was trained and saved on GPU, simply -# convert the initialized model to a CUDA optimized model using -# ``model.to(torch.device('cuda'))``. -# -# Be sure to use the ``.to(torch.device('cuda'))`` function on all model -# inputs to prepare the data for the model. -# - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device("cuda") -model = Net() -model.load_state_dict(torch.load(PATH)) -model.to(device) - - -###################################################################### -# Note that calling ``my_tensor.to(device)`` returns a new copy of -# ``my_tensor`` on GPU. 
It does NOT overwrite ``my_tensor``. Therefore, -# remember to manually overwrite tensors: -# ``my_tensor = my_tensor.to(torch.device('cuda'))``. -# -# 5. Save on CPU, Load on GPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a GPU that was trained and saved on CPU, set the -# ``map_location`` argument in the ``torch.load()`` function to -# ``cuda:device_id``. This loads the model to a given GPU device. -# -# Be sure to call ``model.to(torch.device('cuda'))`` to convert the -# model’s parameter tensors to CUDA tensors. -# -# Finally, also be sure to use the ``.to(torch.device('cuda'))`` function -# on all model inputs to prepare the data for the CUDA optimized model. -# - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device("cuda") -model = Net() -# Choose whatever GPU device number you want -model.load_state_dict(torch.load(PATH, map_location="cuda:0")) -# Make sure to call input = input.to(device) on any input tensors that you feed to the model -model.to(device) - - -###################################################################### -# 6. Saving ``torch.nn.DataParallel`` Models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# ``torch.nn.DataParallel`` is a model wrapper that enables parallel GPU -# utilization. -# -# To save a ``DataParallel`` model generically, save the -# ``model.module.state_dict()``. This way, you have the flexibility to -# load the model any way you want to any device you want. -# - -# Save -torch.save(net.module.state_dict(), PATH) - -# Load to whatever device you want - - -###################################################################### -# Congratulations! You have successfully saved and loaded models across -# devices in PyTorch. 
-# diff --git a/recipes_source/recipes/save_load_across_devices.rst b/recipes_source/recipes/save_load_across_devices.rst new file mode 100644 index 0000000000..fbda156220 --- /dev/null +++ b/recipes_source/recipes/save_load_across_devices.rst @@ -0,0 +1,10 @@ +Save Load Across Devices +======================== + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + <meta http-equiv="Refresh" content="3; url='https://pytorch.org/tutorials/beginner/saving_loading_models.html'" /> diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py deleted file mode 100644 index 8c773a1490..0000000000 --- a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Saving and loading a general checkpoint in PyTorch -================================================== -Saving and loading a general checkpoint model for inference or -resuming training can be helpful for picking up where you last left off. -When saving a general checkpoint, you must save more than just the -model’s state_dict. It is important to also save the optimizer’s -state_dict, as this contains buffers and parameters that are updated as -the model trains. Other items that you may want to save are the epoch -you left off on, the latest recorded training loss, external -``torch.nn.Embedding`` layers, and more, based on your own algorithm. - -Introduction ------------- -To save multiple checkpoints, you must organize them in a dictionary and -use ``torch.save()`` to serialize the dictionary. A common PyTorch -convention is to save these checkpoints using the ``.tar`` file -extension. To load the items, first initialize the model and optimizer, -then load the dictionary locally using torch.load(). From here, you can -easily access the saved items by simply querying the dictionary as you -would expect. 
- -In this recipe, we will explore how to save and load multiple -checkpoints. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - -:: - - pip install torch - - -""" - - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save the general checkpoint -# 5. Load the general checkpoint -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum. -# - -optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. 
Save the general checkpoint -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Collect all relevant information and build your dictionary. -# - -# Additional information -EPOCH = 5 -PATH = "model.pt" -LOSS = 0.4 - -torch.save({ - 'epoch': EPOCH, - 'model_state_dict': net.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), - 'loss': LOSS, - }, PATH) - - -###################################################################### -# 5. Load the general checkpoint -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Remember to first initialize the model and optimizer, then load the -# dictionary locally. -# - -model = Net() -optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) - -checkpoint = torch.load(PATH, weights_only=True) -model.load_state_dict(checkpoint['model_state_dict']) -optimizer.load_state_dict(checkpoint['optimizer_state_dict']) -epoch = checkpoint['epoch'] -loss = checkpoint['loss'] - -model.eval() -# - or - -model.train() - - -###################################################################### -# You must call ``model.eval()`` to set dropout and batch normalization -# layers to evaluation mode before running inference. Failing to do this -# will yield inconsistent inference results. -# -# If you wish to resuming training, call ``model.train()`` to ensure these -# layers are in training mode. -# -# Congratulations! You have successfully saved and loaded a general -# checkpoint for inference and/or resuming training in PyTorch. -# diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst new file mode 100644 index 0000000000..b868c26a6c --- /dev/null +++ b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst @@ -0,0 +1,10 @@ +Saving And Loading A General Checkpoint +======================================= + +This tutorial was deprecated. 
There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + +   <meta http-equiv="Refresh" content="3; url='https://pytorch.org/tutorials/beginner/saving_loading_models.html'" /> diff --git a/recipes_source/recipes/saving_and_loading_models_for_inference.py b/recipes_source/recipes/saving_and_loading_models_for_inference.py deleted file mode 100644 index 7adce2a90b..0000000000 --- a/recipes_source/recipes/saving_and_loading_models_for_inference.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Saving and loading models for inference in PyTorch -================================================== -There are two approaches for saving and loading models for inference in -PyTorch. The first is saving and loading the ``state_dict``, and the -second is saving and loading the entire model. - -Introduction ------------- -Saving the model’s ``state_dict`` with the ``torch.save()`` function -will give you the most flexibility for restoring the model later. This -is the recommended method for saving models, because it is only really -necessary to save the trained model’s learned parameters. -When saving and loading an entire model, you save the entire module -using Python’s -`pickle `__ module. Using -this approach yields the most intuitive syntax and involves the least -amount of code. The disadvantage of this approach is that the serialized -data is bound to the specific classes and the exact directory structure -used when the model is saved. The reason for this is because pickle does -not save the model class itself. Rather, it saves a path to the file -containing the class, which is used during load time. Because of this, -your code can break in various ways when used in other projects or after -refactors. -In this recipe, we will explore both ways on how to save and load models -for inference. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available.
- - -:: - - pip install torch - - -""" - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save and load the model via ``state_dict`` -# 5. Save and load the entire model -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum. -# - -optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save and load the model via ``state_dict`` -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Let’s save and load our model using just ``state_dict``. 
-# - -# Specify a path -PATH = "state_dict_model.pt" - -# Save -torch.save(net.state_dict(), PATH) - -# Load -model = Net() -model.load_state_dict(torch.load(PATH, weights_only=True)) -model.eval() - - -###################################################################### -# A common PyTorch convention is to save models using either a ``.pt`` or -# ``.pth`` file extension. -# -# Notice that the ``load_state_dict()`` function takes a dictionary -# object, NOT a path to a saved object. This means that you must -# deserialize the saved state_dict before you pass it to the -# ``load_state_dict()`` function. For example, you CANNOT load using -# ``model.load_state_dict(PATH)``. -# -# Remember too, that you must call ``model.eval()`` to set dropout and -# batch normalization layers to evaluation mode before running inference. -# Failing to do this will yield inconsistent inference results. -# -# 5. Save and load entire model -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now let’s try the same thing with the entire model. -# - -# Specify a path -PATH = "entire_model.pt" - -# Save -torch.save(net, PATH) - -# Load -model = torch.load(PATH) -model.eval() - - -###################################################################### -# Again here, remember that you must call ``model.eval()`` to set dropout and -# batch normalization layers to evaluation mode before running inference. -# -# Congratulations! You have successfully saved and load models for -# inference in PyTorch. 
-# -# Learn More -# ---------- -# -# Take a look at these other recipes to continue your learning: -# -# - `Saving and loading a general checkpoint in PyTorch `__ -# - `Saving and loading multiple models in one file using PyTorch `__ diff --git a/recipes_source/recipes/saving_and_loading_models_for_inference.rst b/recipes_source/recipes/saving_and_loading_models_for_inference.rst new file mode 100644 index 0000000000..19e1405dd8 --- /dev/null +++ b/recipes_source/recipes/saving_and_loading_models_for_inference.rst @@ -0,0 +1,10 @@ +Saving And Loading Models For Inference +======================================= + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + +   <meta http-equiv="Refresh" content="3; url='https://pytorch.org/tutorials/beginner/saving_loading_models.html'" /> diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.py b/recipes_source/recipes/saving_multiple_models_in_one_file.py deleted file mode 100644 index e938be03b4..0000000000 --- a/recipes_source/recipes/saving_multiple_models_in_one_file.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Saving and loading multiple models in one file using PyTorch -============================================================ -Saving and loading multiple models can be helpful for reusing models -that you have previously trained. - -Introduction ------------- -When saving a model comprised of multiple ``torch.nn.Modules``, such as -a GAN, a sequence-to-sequence model, or an ensemble of models, you must -save a dictionary of each model’s state_dict and corresponding -optimizer. You can also save any other items that may aid you in -resuming training by simply appending them to the dictionary. -To load the models, first initialize the models and optimizers, then -load the dictionary locally using ``torch.load()``. From here, you can -easily access the saved items by simply querying the dictionary as you -would expect.
-In this recipe, we will demonstrate how to save multiple models to one -file using PyTorch. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - -.. code-block:: sh - - pip install torch - -""" - - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save multiple models -# 5. Load multiple models -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. Build -# two variables for the models to eventually save. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -netA = Net() -netB = Net() - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum to build an optimizer for each model we -# created. 
-# - -optimizerA = optim.SGD(netA.parameters(), lr=0.001, momentum=0.9) -optimizerB = optim.SGD(netB.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save multiple models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Collect all relevant information and build your dictionary. -# - -# Specify a path to save to -PATH = "model.pt" - -torch.save({ - 'modelA_state_dict': netA.state_dict(), - 'modelB_state_dict': netB.state_dict(), - 'optimizerA_state_dict': optimizerA.state_dict(), - 'optimizerB_state_dict': optimizerB.state_dict(), - }, PATH) - - -###################################################################### -# 4. Load multiple models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Remember to first initialize the models and optimizers, then load the -# dictionary locally. -# - -modelA = Net() -modelB = Net() -optimModelA = optim.SGD(modelA.parameters(), lr=0.001, momentum=0.9) -optimModelB = optim.SGD(modelB.parameters(), lr=0.001, momentum=0.9) - -checkpoint = torch.load(PATH, weights_only=True) -modelA.load_state_dict(checkpoint['modelA_state_dict']) -modelB.load_state_dict(checkpoint['modelB_state_dict']) -optimizerA.load_state_dict(checkpoint['optimizerA_state_dict']) -optimizerB.load_state_dict(checkpoint['optimizerB_state_dict']) - -modelA.eval() -modelB.eval() -# - or - -modelA.train() -modelB.train() - - -###################################################################### -# You must call ``model.eval()`` to set dropout and batch normalization -# layers to evaluation mode before running inference. Failing to do this -# will yield inconsistent inference results. -# -# If you wish to resuming training, call ``model.train()`` to ensure these -# layers are in training mode. -# -# Congratulations! You have successfully saved and loaded multiple models -# in PyTorch. 
-# diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.rst b/recipes_source/recipes/saving_multiple_models_in_one_file.rst new file mode 100644 index 0000000000..33040e6c87 --- /dev/null +++ b/recipes_source/recipes/saving_multiple_models_in_one_file.rst @@ -0,0 +1,10 @@ +Saving Multiple Models In One File +================================== + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + +   <meta http-equiv="Refresh" content="3; url='https://pytorch.org/tutorials/beginner/saving_loading_models.html'" /> From 76bd6d3e2467740542fcb906a9695e55fedc773e Mon Sep 17 00:00:00 2001 From: Chirag Pandya Date: Wed, 30 Oct 2024 18:19:46 -0700 Subject: [PATCH 3/4] [doc][c10d] fix up distributed tutorial (#3132) * [doc][c10d] fix up distributed tutorial Summary: Minor fixups to the distributed tutorial. 1. Fix broken links. Test Plan: Ran the referenced code locally to make sure that it works. Reviewers: Subscribers: Tasks: Tags: * Update intermediate_source/dist_tuto.rst Co-authored-by: Svetlana Karslioglu --------- Co-authored-by: Svetlana Karslioglu --- intermediate_source/dist_tuto.rst | 58 +++++++++++++++++++------------ 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/intermediate_source/dist_tuto.rst b/intermediate_source/dist_tuto.rst index 4a45254998..c5ffc317c4 100644 --- a/intermediate_source/dist_tuto.rst +++ b/intermediate_source/dist_tuto.rst @@ -38,7 +38,7 @@ simultaneously. If you have access to compute cluster you should check with your local sysadmin or use your favorite coordination tool (e.g., `pdsh `__, `clustershell `__, or -`others `__). For the purpose of this +`slurm `__). For the purpose of this tutorial, we will use a single machine and spawn multiple processes using the following template. @@ -64,11 +64,11 @@ the following template.
if __name__ == "__main__": - size = 2 + world_size = 2 processes = [] mp.set_start_method("spawn") - for rank in range(size): - p = mp.Process(target=init_process, args=(rank, size, run)) + for rank in range(world_size): + p = mp.Process(target=init_process, args=(rank, world_size, run)) p.start() processes.append(p) @@ -125,7 +125,7 @@ process 0 increments the tensor and sends it to process 1 so that they both end up with 1.0. Notice that process 1 needs to allocate memory in order to store the data it will receive. -Also notice that ``send``/``recv`` are **blocking**: both processes stop +Also notice that ``send/recv`` are **blocking**: both processes block until the communication is completed. On the other hand immediates are **non-blocking**; the script continues its execution and the methods return a ``Work`` object upon which we can choose to @@ -219,16 +219,23 @@ to obtain the sum of all tensors on all processes, we can use the Since we want the sum of all tensors in the group, we use ``dist.ReduceOp.SUM`` as the reduce operator. Generally speaking, any commutative mathematical operation can be used as an operator. -Out-of-the-box, PyTorch comes with 4 such operators, all working at the +Out-of-the-box, PyTorch comes with many such operators, all working at the element-wise level: - ``dist.ReduceOp.SUM``, - ``dist.ReduceOp.PRODUCT``, - ``dist.ReduceOp.MAX``, -- ``dist.ReduceOp.MIN``. +- ``dist.ReduceOp.MIN``, +- ``dist.ReduceOp.BAND``, +- ``dist.ReduceOp.BOR``, +- ``dist.ReduceOp.BXOR``, +- ``dist.ReduceOp.PREMUL_SUM``. -In addition to ``dist.all_reduce(tensor, op, group)``, there are a total -of 6 collectives currently implemented in PyTorch. +The full list of supported operators is +`here `__. + +In addition to ``dist.all_reduce(tensor, op, group)``, there are many additional collectives currently implemented in +PyTorch. Here are a few supported collectives. - ``dist.broadcast(tensor, src, group)``: Copies ``tensor`` from ``src`` to all other processes. 
@@ -244,6 +251,12 @@ of 6 collectives currently implemented in PyTorch. - ``dist.all_gather(tensor_list, tensor, group)``: Copies ``tensor`` from all processes to ``tensor_list``, on all processes. - ``dist.barrier(group)``: Blocks all processes in `group` until each one has entered this function. +- ``dist.all_to_all(output_tensor_list, input_tensor_list, group)``: Scatters a list of input tensors to all processes in +  a group and returns the gathered list of tensors in the output list. + +The full list of supported collectives can be found by looking at the latest documentation for PyTorch Distributed +`(link) `__. + Distributed Training -------------------- @@ -275,7 +288,7 @@ gradients of their model on their batch of data and then average their gradients. In order to ensure similar convergence results when changing the number of processes, we will first have to partition our dataset. (You could also use -`tnt.dataset.SplitDataset `__, +`torch.utils.data.random_split `__, instead of the snippet below.) .. code:: python @@ -389,7 +402,7 @@ could train any model on a large computer cluster. lot more tricks `__ required to implement a production-level implementation of synchronous SGD. Again, use what `has been tested and -optimized `__. +optimized `__. Our Own Ring-Allreduce ~~~~~~~~~~~~~~~~~~~~~~ @@ -451,8 +464,9 @@ Communication Backends One of the most elegant aspects of ``torch.distributed`` is its ability to abstract and build on top of different backends. As mentioned before, -there are currently three backends implemented in PyTorch: Gloo, NCCL, and -MPI. They each have different specifications and tradeoffs, depending +there are multiple backends implemented in PyTorch. +Some of the most popular ones are Gloo, NCCL, and MPI. +They each have different specifications and tradeoffs, depending on the desired use case. A comparative table of supported functions can be found `here `__.
@@ -544,15 +558,15 @@ NCCL backend is included in the pre-built binaries with CUDA support. Initialization Methods ~~~~~~~~~~~~~~~~~~~~~~ -To finish this tutorial, let's talk about the very first function we -called: ``dist.init_process_group(backend, init_method)``. In -particular, we will go over the different initialization methods which -are responsible for the initial coordination step between each process. -Those methods allow you to define how this coordination is done. -Depending on your hardware setup, one of these methods should be -naturally more suitable than the others. In addition to the following -sections, you should also have a look at the `official -documentation `__. +To conclude this tutorial, let's examine the initial function we invoked: +``dist.init_process_group(backend, init_method)``. Specifically, we will discuss the various +initialization methods responsible for the preliminary coordination step between each process. +These methods enable you to define how this coordination is accomplished. + +The choice of initialization method depends on your hardware setup, and one method may be more +suitable than others. In addition to the following sections, please refer to the `official +documentation `__ for further information. + **Environment Variable** From 5feb4a501d27f7d213a49fb56f2f3cfadaab45a3 Mon Sep 17 00:00:00 2001 From: Saiteja Samudrala Date: Wed, 30 Oct 2024 20:41:35 -0700 Subject: [PATCH 4/4] update conf.py to copy rst files from src recursive directories as well --- conf.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/conf.py b/conf.py index 4ab380a7b3..6e9c22908d 100644 --- a/conf.py +++ b/conf.py @@ -45,6 +45,7 @@ import pandocfilters import pypandoc import plotly.io as pio +from pathlib import Path pio.renderers.default = 'sphinx_gallery' @@ -140,22 +141,17 @@ def reset_seeds(gallery_conf, fname): sphinx_gallery_conf['ignore_pattern'] = r'/(?!' 
+ re.escape(os.getenv('GALLERY_PATTERN')) + r')[^/]+$' for i in range(len(sphinx_gallery_conf['examples_dirs'])): - gallery_dir = sphinx_gallery_conf['gallery_dirs'][i] - source_dir = sphinx_gallery_conf['examples_dirs'][i] - # Create gallery dirs if it doesn't exist - try: - os.mkdir(gallery_dir) - except OSError: - pass + gallery_dir = Path(sphinx_gallery_conf["gallery_dirs"][i]) + source_dir = Path(sphinx_gallery_conf["examples_dirs"][i]) # Copy rst files from source dir to gallery dir - for f in glob.glob(os.path.join(source_dir, '*.rst')): - distutils.file_util.copy_file(f, gallery_dir, update=True) - + for f in source_dir.rglob("*.rst"): + f_dir = Path(f).parent + gallery_subdir_path = gallery_dir / f_dir.relative_to(source_dir) + gallery_subdir_path.mkdir(parents=True, exist_ok=True) + distutils.file_util.copy_file(f, gallery_subdir_path, update=True) # Add any paths that contain templates here, relative to this directory. - - templates_path = ['_templates'] # The suffix(es) of source filenames.