From 793da934082ec2a6fc738bce751eb43998806403 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Tue, 5 Oct 2021 14:05:54 -0700 Subject: [PATCH 01/69] Update links in README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 50fcf66..be0801b 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,8 @@ The repository has implementations for the following Bayesian layers: Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian layers for details. Other features include: -- [x] AvUC: Accuracy versus Uncertainty Calibration loss [[Krishnan et al. 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -- [x] MOPED: specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] +- [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan et al. 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] +- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/738df2ddfef8a1c9eaa0463053d926723c9bb9ec/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] - [ ] dnn_to_bnn: An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition by replacing neural network layers with corresponding Bayesian layers (`updating soon...`) From b9245101b53fbda0d6a968e35f8ee9fdc50bccd5 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Tue, 5 Oct 2021 07:25:33 -0700 Subject: [PATCH 02/69] update MOPED layer example utility function Signed-off-by: Ranganath Krishnan --- .../main_bayesian_flipout_imagenet.py | 2 +- bayesian_torch/utils/util.py | 62 +++++++++++-------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/bayesian_torch/examples/main_bayesian_flipout_imagenet.py b/bayesian_torch/examples/main_bayesian_flipout_imagenet.py index 067c896..212a1e9 100644 --- a/bayesian_torch/examples/main_bayesian_flipout_imagenet.py +++ b/bayesian_torch/examples/main_bayesian_flipout_imagenet.py @@ -212,7 +212,7 @@ def MOPED_layer(layer, det_layer, delta): print(str(layer)) layer.weight.data = det_layer.weight.data if layer.bias is not None: - layer.bias.data = det_layer.bias.data2 + layer.bias.data = det_layer.bias.data elif (str(layer) == 'LinearFlipout()' or str(layer) == 'LinearReparameterization()'): diff --git a/bayesian_torch/utils/util.py b/bayesian_torch/utils/util.py index 24f5fde..df51f41 100644 --- a/bayesian_torch/utils/util.py +++ b/bayesian_torch/utils/util.py @@ -55,9 +55,9 @@ def mutual_information(mc_preds): Compute the difference between the entropy of the mean of the predictive distribution and the mean of the entropy. 
""" - MI = entropy(np.mean(mc_preds, axis=0)) - np.mean(entropy(mc_preds), - axis=0) - return MI + mutual_info = entropy(np.mean(mc_preds, axis=0)) - np.mean(entropy(mc_preds), + axis=0) + return mutual_info def get_rho(sigma, delta): @@ -86,39 +86,51 @@ def MOPED(model, det_model, det_checkpoint, delta): for (idx, layer), (det_idx, det_layer) in zip(enumerate(model.modules()), enumerate(det_model.modules())): - if (str(layer) == 'Conv1dVariational()' - or str(layer) == 'Conv2dVariational()' - or str(layer) == 'Conv3dVariational()' - or str(layer) == 'ConvTranspose1dVariational()' - or str(layer) == 'ConvTranspose2dVariational()' - or str(layer) == 'ConvTranspose3dVariational()'): + if (str(layer) == 'Conv1dReparametrization()' + or str(layer) == 'Conv2dReparameterization()' + or str(layer) == 'Conv3dReparameterization()' + or str(layer) == 'ConvTranspose1dReparameterization()' + or str(layer) == 'ConvTranspose2dReparameterization()' + or str(layer) == 'ConvTranspose3dReparameterization()' + or str(layer) == 'Conv1dFlipout()' + or str(layer) == 'Conv2dFlipout()' + or str(layer) == 'Conv3dFlipout()' + or str(layer) == 'ConvTranspose1dFlipout()' + or str(layer) == 'ConvTranspose2dFlipout()' + or str(layer) == 'ConvTranspose3dFlipout()'): #set the priors - layer.prior_weight_mu.data = det_layer.weight - layer.prior_bias_mu.data = det_layer.bias + layer.prior_weight_mu = det_layer.weight.data + if layer.prior_bias_mu is not None: + layer.prior_bias_mu = det_layer.bias.data #initialize surrogate posteriors - layer.mu_kernel.data = det_layer.weight + layer.mu_kernel.data = det_layer.weight.data layer.rho_kernel.data = get_rho(det_layer.weight.data, delta) - layer.mu_bias.data = det_layer.bias - layer.rho_bias.data = get_rho(det_layer.bias.data, delta) - elif (str(layer) == 'LinearVariational()'): + if layer.mu_bias is not None: + layer.mu_bias.data = det_layer.bias.data + layer.rho_bias.data = get_rho(det_layer.bias.data, delta) + elif (str(layer) == 'LinearReparameterization()' + or str(layer) == 'LinearFlipout()'): #set the priors - layer.prior_weight_mu.data = det_layer.weight - layer.prior_bias_mu.data = det_layer.bias + layer.prior_weight_mu = det_layer.weight.data + if layer.prior_bias_mu is not None: + layer.prior_bias_mu.data = det_layer.bias #initialize the surrogate posteriors - layer.mu_weight.data = det_layer.weight + layer.mu_weight.data = det_layer.weight.data layer.rho_weight.data = get_rho(det_layer.weight.data, delta) - layer.mu_bias.data = det_layer.bias - layer.rho_bias.data = get_rho(det_layer.bias.data, delta) + if layer.mu_bias is not None: + layer.mu_bias.data = det_layer.bias.data + layer.rho_bias.data = get_rho(det_layer.bias.data, delta) elif str(layer).startswith('Batch'): #initialize parameters - layer.weight.data = det_layer.weight - layer.bias.data = det_layer.bias - layer.running_mean.data = det_layer.running_mean - layer.running_var.data = det_layer.running_var - layer.num_batches_tracked.data = det_layer.num_batches_tracked + layer.weight.data = det_layer.weight.data + if layer.bias is not None: + layer.bias.data = det_layer.bias + layer.running_mean.data = det_layer.running_mean.data + layer.running_var.data = det_layer.running_var.data + layer.num_batches_tracked.data = det_layer.num_batches_tracked.data model.state_dict() return model From 81648f90589b9be74623434674ef4c083e7422a9 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Tue, 5 Oct 2021 07:29:39 -0700 Subject: [PATCH 03/69] fix minor typo. 
Signed-off-by: Ranganath Krishnan --- bayesian_torch/utils/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/utils/util.py b/bayesian_torch/utils/util.py index df51f41..7418679 100644 --- a/bayesian_torch/utils/util.py +++ b/bayesian_torch/utils/util.py @@ -86,7 +86,7 @@ def MOPED(model, det_model, det_checkpoint, delta): for (idx, layer), (det_idx, det_layer) in zip(enumerate(model.modules()), enumerate(det_model.modules())): - if (str(layer) == 'Conv1dReparametrization()' + if (str(layer) == 'Conv1dReparameterization()' or str(layer) == 'Conv2dReparameterization()' or str(layer) == 'Conv3dReparameterization()' or str(layer) == 'ConvTranspose1dReparameterization()' From a5065302947199152a453ed65e893625ba54d368 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Tue, 5 Oct 2021 14:32:18 -0700 Subject: [PATCH 04/69] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index be0801b..9d77441 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,8 @@ The repository has implementations for the following Bayesian layers: Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian layers for details. Other features include: -- [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan et al. 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/738df2ddfef8a1c9eaa0463053d926723c9bb9ec/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] +- [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] +- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 
2019](https://arxiv.org/abs/1906.05323)] - [ ] dnn_to_bnn: An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition by replacing neural network layers with corresponding Bayesian layers (`updating soon...`) From ade5f9bf8f64b934df20eab9163422f8dc958da5 Mon Sep 17 00:00:00 2001 From: Pi Date: Fri, 26 Nov 2021 10:50:15 -0300 Subject: [PATCH 05/69] feat: add possibility to return no kl, save it as attribute --- .../variational_layers/conv_variational.py | 60 +++++++++++++++---- .../variational_layers/linear_variational.py | 10 +++- .../variational_layers/rnn_variational.py | 10 +++- 3 files changed, 64 insertions(+), 16 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 4311400..96b1db5 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -112,6 +112,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size)) self.rho_kernel = Parameter( @@ -160,7 +162,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -182,7 +184,11 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out class Conv2dReparameterization(BaseVariationalLayer_): @@ -239,6 +245,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size)) @@ -292,7 +300,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -313,8 +321,12 @@ def forward(self, input): kl = kl_weight + kl_bias else: kl = kl_weight + + self.kl = kl - return out, kl + if return_kl: + return out, kl + return out class Conv3dReparameterization(BaseVariationalLayer_): @@ -371,6 +383,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -424,7 +438,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -446,7 +460,11 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out class ConvTranspose1dReparameterization(BaseVariationalLayer_): @@ -504,6 +522,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( 
torch.Tensor(in_channels, out_channels // groups, kernel_size)) self.rho_kernel = Parameter( @@ -552,7 +572,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -575,7 +595,11 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out class ConvTranspose2dReparameterization(BaseVariationalLayer_): @@ -633,6 +657,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size)) @@ -686,7 +712,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -709,7 +735,11 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out class ConvTranspose3dReparameterization(BaseVariationalLayer_): @@ -768,6 +798,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -821,7 +853,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -844,4 +876,8 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out diff --git a/bayesian_torch/layers/variational_layers/linear_variational.py b/bayesian_torch/layers/variational_layers/linear_variational.py index af113f5..bb3a296 100644 --- a/bayesian_torch/layers/variational_layers/linear_variational.py +++ b/bayesian_torch/layers/variational_layers/linear_variational.py @@ -83,6 +83,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_weight = Parameter(torch.Tensor(out_features, in_features)) self.rho_weight = Parameter(torch.Tensor(out_features, in_features)) self.register_buffer('eps_weight', @@ -124,7 +126,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_weight)) weight = self.mu_weight + \ (sigma_weight * self.eps_weight.data.normal_()) @@ -143,5 +145,9 @@ def forward(self, input): kl = kl_weight + kl_bias else: kl = kl_weight + + self.kl = kl - return out, kl + if return_kl: + return out, kl + return out \ No newline at end of file diff --git a/bayesian_torch/layers/variational_layers/rnn_variational.py b/bayesian_torch/layers/variational_layers/rnn_variational.py index ab126ad..c36378c 100644 --- a/bayesian_torch/layers/variational_layers/rnn_variational.py +++ 
b/bayesian_torch/layers/variational_layers/rnn_variational.py @@ -77,6 +77,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = kl + self.ih = LinearReparameterization( prior_mean=prior_mean, prior_variance=prior_variance, @@ -95,7 +97,7 @@ def __init__(self, out_features=out_features * 4, bias=bias) - def forward(self, X, hidden_states=None): + def forward(self, X, hidden_states=None, return_kl=True): batch_size, seq_size, _ = X.size() @@ -140,4 +142,8 @@ def forward(self, X, hidden_states=None): hidden_seq = hidden_seq.transpose(0, 1).contiguous() c_ts = c_ts.transpose(0, 1).contiguous() - return hidden_seq, (hidden_seq, c_ts), kl + self.kl = kl + + if return_kl: + return hidden_seq, (hidden_seq, c_ts), kl + return hidden_seq, (hidden_seq, c_ts) From 037006db1b0e7f35d13209517dd88fa711ec75d6 Mon Sep 17 00:00:00 2001 From: Pi Date: Fri, 26 Nov 2021 11:01:06 -0300 Subject: [PATCH 06/69] feat: add possibility to return no kl on flipout layers, save it as attribute --- .../layers/flipout_layers/conv_flipout.py | 54 ++++++++++++++----- .../layers/flipout_layers/linear_flipout.py | 10 +++- .../layers/flipout_layers/rnn_flipout.py | 9 +++- 3 files changed, 57 insertions(+), 16 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index cc3c26e..d1996f7 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -100,6 +100,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init self.bias = bias + self.kl = 0 + self.mu_kernel = nn.Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size)) self.rho_kernel = nn.Parameter( @@ -150,7 +152,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv1d(x, @@ -191,8 +193,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class Conv2dFlipout(BaseVariationalLayer_): @@ -244,6 +249,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init self.bias = bias + self.kl = 0 + self.mu_kernel = nn.Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size)) @@ -299,7 +306,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv2d(x, @@ -340,8 +347,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class Conv3dFlipout(BaseVariationalLayer_): @@ -388,6 +398,8 @@ def __init__(self, self.groups = groups self.bias = bias + self.kl = 0 + self.prior_mean = prior_mean self.prior_variance = prior_variance self.posterior_mu_init = posterior_mu_init @@ -448,7 +460,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, 
return_kl=True): # linear outputs outputs = F.conv3d(x, @@ -489,8 +501,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class ConvTranspose1dFlipout(BaseVariationalLayer_): @@ -537,6 +552,8 @@ def __init__(self, self.groups = groups self.bias = bias + self.kl = 0 + self.prior_mean = prior_mean self.prior_variance = prior_variance self.posterior_mu_init = posterior_mu_init @@ -593,7 +610,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv_transpose1d(x, @@ -635,8 +652,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class ConvTranspose2dFlipout(BaseVariationalLayer_): @@ -683,6 +703,8 @@ def __init__(self, self.groups = groups self.bias = bias + self.kl = 0 + self.prior_mean = prior_mean self.prior_variance = prior_variance self.posterior_mu_init = posterior_mu_init @@ -743,7 +765,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv_transpose2d(x, @@ -785,8 +807,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class ConvTranspose3dFlipout(BaseVariationalLayer_): @@ -838,6 +863,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init self.bias = bias + self.kl = 0 + self.mu_kernel = nn.Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -893,7 +920,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv_transpose3d(x, @@ -935,5 +962,8 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs diff --git a/bayesian_torch/layers/flipout_layers/linear_flipout.py b/bayesian_torch/layers/flipout_layers/linear_flipout.py index d7d577f..2538f1d 100644 --- a/bayesian_torch/layers/flipout_layers/linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/linear_flipout.py @@ -90,6 +90,8 @@ def __init__(self, torch.Tensor(out_features, in_features), persistent=False) + self.kl = 0 + if bias: self.mu_bias = nn.Parameter(torch.Tensor(out_features)) self.rho_bias = nn.Parameter(torch.Tensor(out_features)) @@ -123,7 +125,7 @@ def init_parameters(self): self.mu_bias.data.normal_(mean=self.posterior_mu_init, std=0.1) self.rho_bias.data.normal_(mean=self.posterior_rho_init, std=0.1) - def forward(self, x): + def forward(self, x, return_kl=True): # sampling 
delta_W sigma_weight = torch.log1p(torch.exp(self.rho_weight)) delta_weight = (sigma_weight * self.eps_weight.data.normal_()) @@ -148,5 +150,9 @@ def forward(self, x): perturbed_outputs = F.linear(x * sign_input, delta_weight, bias) * sign_output + self.kl = kl + # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs diff --git a/bayesian_torch/layers/flipout_layers/rnn_flipout.py b/bayesian_torch/layers/flipout_layers/rnn_flipout.py index 38c222a..317ebc4 100644 --- a/bayesian_torch/layers/flipout_layers/rnn_flipout.py +++ b/bayesian_torch/layers/flipout_layers/rnn_flipout.py @@ -76,6 +76,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, # variance of weight --> sigma = log (1 + exp(rho)) self.bias = bias + self.kl = 0 + self.ih = LinearFlipout(prior_mean=prior_mean, prior_variance=prior_variance, posterior_mu_init=posterior_mu_init, @@ -92,7 +94,7 @@ def __init__(self, out_features=out_features * 4, bias=bias) - def forward(self, X, hidden_states=None): + def forward(self, X, hidden_states=None, return_kl=True): batch_size, seq_size, _ = X.size() @@ -137,4 +139,7 @@ def forward(self, X, hidden_states=None): hidden_seq = hidden_seq.transpose(0, 1).contiguous() c_ts = c_ts.transpose(0, 1).contiguous() - return hidden_seq, (hidden_seq, c_ts), kl + self.kl = kl + if return_kl: + return hidden_seq, (hidden_seq, c_ts), kl + return hidden_seq, (hidden_seq, c_ts) From f892b955bffeb58d51064828b41f08a56d93f4c0 Mon Sep 17 00:00:00 2001 From: msubedar Date: Tue, 7 Dec 2021 23:00:49 -0800 Subject: [PATCH 07/69] updates to support dnn to bnn imodel auto conversion --- .../layers/base_variational_layer.py | 9 + .../layers/flipout_layers/conv_flipout.py | 74 ++++-- .../layers/flipout_layers/linear_flipout.py | 24 +- .../layers/flipout_layers/rnn_flipout.py | 8 + .../variational_layers/conv_variational.py | 230 ++++++++++++------ .../variational_layers/linear_variational.py | 54 ++-- .../variational_layers/rnn_variational.py | 14 +- 7 files changed, 281 insertions(+), 132 deletions(-) diff --git a/bayesian_torch/layers/base_variational_layer.py b/bayesian_torch/layers/base_variational_layer.py index 86b2505..4d63cc9 100644 --- a/bayesian_torch/layers/base_variational_layer.py +++ b/bayesian_torch/layers/base_variational_layer.py @@ -34,6 +34,15 @@ class BaseVariationalLayer_(nn.Module): def __init__(self): super().__init__() + self._dnn_to_bnn_flag = False + + @property + def dnn_to_bnn_flag(self): + return self._dnn_to_bnn_flag + + @dnn_to_bnn_flag.setter + def dnn_to_bnn_flag(self, value): + self._dnn_to_bnn_flag = value def kl_div(self, mu_q, sigma_q, mu_p, sigma_p): """ diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index d1996f7..5214a99 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -154,6 +154,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv1d(x, weight=self.mu_kernel, @@ -173,16 +176,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: 
sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv1d(x * sign_input, @@ -308,6 +313,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv2d(x, weight=self.mu_kernel, @@ -327,16 +335,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv2d(x * sign_input, @@ -347,7 +357,6 @@ def forward(self, x, return_kl=True): dilation=self.dilation, groups=self.groups) * sign_output - self.kl = kl # returning outputs + perturbations if return_kl: return outputs + perturbed_outputs, kl @@ -462,6 +471,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv3d(x, weight=self.mu_kernel, @@ -481,16 +493,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv3d(x * sign_input, @@ -612,6 +626,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv_transpose1d(x, weight=self.mu_kernel, @@ -631,16 +648,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv_transpose1d( @@ -767,6 +786,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = 
False + # linear outputs outputs = F.conv_transpose2d(x, bias=self.mu_bias, @@ -786,16 +808,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv_transpose2d( @@ -922,6 +946,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv_transpose3d(x, weight=self.mu_kernel, @@ -941,8 +968,9 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: diff --git a/bayesian_torch/layers/flipout_layers/linear_flipout.py b/bayesian_torch/layers/flipout_layers/linear_flipout.py index 2538f1d..af34d5d 100644 --- a/bayesian_torch/layers/flipout_layers/linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/linear_flipout.py @@ -90,8 +90,6 @@ def __init__(self, torch.Tensor(out_features, in_features), persistent=False) - self.kl = 0 - if bias: self.mu_bias = nn.Parameter(torch.Tensor(out_features)) self.rho_bias = nn.Parameter(torch.Tensor(out_features)) @@ -125,21 +123,33 @@ def init_parameters(self): self.mu_bias.data.normal_(mean=self.posterior_mu_init, std=0.1) self.rho_bias.data.normal_(mean=self.posterior_rho_init, std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_weight)) + kl = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.mu_bias is not None: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False # sampling delta_W sigma_weight = torch.log1p(torch.exp(self.rho_weight)) delta_weight = (sigma_weight * self.eps_weight.data.normal_()) # get kl divergence - kl = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.mu_bias is not None: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) bias = (sigma_bias * self.eps_bias.data.normal_()) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # linear outputs outputs = F.linear(x, self.mu_weight, self.mu_bias) @@ -150,8 +160,6 @@ def forward(self, x, return_kl=True): perturbed_outputs = F.linear(x * sign_input, delta_weight, bias) * sign_output - self.kl = kl - # returning outputs + perturbations if return_kl: return outputs + perturbed_outputs, kl diff --git 
a/bayesian_torch/layers/flipout_layers/rnn_flipout.py b/bayesian_torch/layers/flipout_layers/rnn_flipout.py index 317ebc4..5977740 100644 --- a/bayesian_torch/layers/flipout_layers/rnn_flipout.py +++ b/bayesian_torch/layers/flipout_layers/rnn_flipout.py @@ -94,8 +94,16 @@ def __init__(self, out_features=out_features * 4, bias=bias) + def kl_loss(self): + kl_i = self.ih.kl_loss() + kl_h = self.hh.kl_loss() + return kl_i + kl_h + def forward(self, X, hidden_states=None, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + batch_size, seq_size, _ = X.size() hidden_seq = [] diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 96b1db5..1d55363 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 Intel Labs +# Copyright (C) 2021 Intel Labs # # BSD-3-Clause License # @@ -112,8 +112,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size)) self.rho_kernel = Parameter( @@ -162,32 +160,53 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv1d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight return out, kl + return out @@ -245,8 +264,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size)) @@ -300,32 +317,45 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + 
sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight return out, kl + return out @@ -383,8 +413,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -438,32 +466,44 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv3d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight return out, kl + return out @@ -522,8 +562,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size)) self.rho_kernel = Parameter( @@ -572,33 +610,46 @@ def 
init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv_transpose1d(input, weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight + return out, kl + return out @@ -657,8 +708,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size)) @@ -712,33 +761,46 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv_transpose2d(input, weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight + return out, kl + return out @@ -798,8 +860,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - 
self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -853,31 +913,43 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv_transpose3d(input, weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight return out, kl + return out diff --git a/bayesian_torch/layers/variational_layers/linear_variational.py b/bayesian_torch/layers/variational_layers/linear_variational.py index bb3a296..7efb667 100644 --- a/bayesian_torch/layers/variational_layers/linear_variational.py +++ b/bayesian_torch/layers/variational_layers/linear_variational.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 Intel Labs +# Copyright (C) 2021 Intel Labs # # BSD-3-Clause License # @@ -83,8 +83,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_weight = Parameter(torch.Tensor(out_features, in_features)) self.rho_weight = Parameter(torch.Tensor(out_features, in_features)) self.register_buffer('eps_weight', @@ -99,8 +97,14 @@ def __init__(self, if bias: self.mu_bias = Parameter(torch.Tensor(out_features)) self.rho_bias = Parameter(torch.Tensor(out_features)) - self.register_buffer('eps_bias', torch.Tensor(out_features), persistent=False) - self.register_buffer('prior_bias_mu', torch.Tensor(out_features), persistent=False) + self.register_buffer( + 'eps_bias', + torch.Tensor(out_features), + persistent=False) + self.register_buffer( + 'prior_bias_mu', + torch.Tensor(out_features), + persistent=False) self.register_buffer('prior_bias_sigma', torch.Tensor(out_features), persistent=False) @@ -126,28 +130,44 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_weight)) + kl = self.kl_div( + self.mu_weight, + sigma_weight, + self.prior_weight_mu, + self.prior_weight_sigma) + if self.mu_bias is not None: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, + self.prior_bias_mu, self.prior_bias_sigma) + return kl + def 
forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False sigma_weight = torch.log1p(torch.exp(self.rho_weight)) weight = self.mu_weight + \ (sigma_weight * self.eps_weight.data.normal_()) - kl_weight = self.kl_div(self.mu_weight, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_weight, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.mu_bias is not None: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) bias = self.mu_bias + (sigma_bias * self.eps_bias.data.normal_()) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.linear(input, weight, bias) - if self.mu_bias is not None: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.mu_bias is not None: + kl = kl_weight + kl_bias + else: + kl = kl_weight + return out, kl - return out \ No newline at end of file + + return out diff --git a/bayesian_torch/layers/variational_layers/rnn_variational.py b/bayesian_torch/layers/variational_layers/rnn_variational.py index c36378c..39f4a2d 100644 --- a/bayesian_torch/layers/variational_layers/rnn_variational.py +++ b/bayesian_torch/layers/variational_layers/rnn_variational.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 Intel Labs +# Copyright (C) 2021 Intel Labs # # BSD-3-Clause License # @@ -77,8 +77,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = kl - self.ih = LinearReparameterization( prior_mean=prior_mean, prior_variance=prior_variance, @@ -97,8 +95,16 @@ def __init__(self, out_features=out_features * 4, bias=bias) + def kl_loss(self): + kl_i = self.ih.kl_loss() + kl_h = self.hh.kl_loss() + return kl_i + kl_h + def forward(self, X, hidden_states=None, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + batch_size, seq_size, _ = X.size() hidden_seq = [] @@ -142,8 +148,6 @@ def forward(self, X, hidden_states=None, return_kl=True): hidden_seq = hidden_seq.transpose(0, 1).contiguous() c_ts = c_ts.transpose(0, 1).contiguous() - self.kl = kl - if return_kl: return hidden_seq, (hidden_seq, c_ts), kl return hidden_seq, (hidden_seq, c_ts) From 161bfdfe16c97ee579c45c52ab7ceddc5fc5d0e3 Mon Sep 17 00:00:00 2001 From: msubedar Date: Tue, 7 Dec 2021 23:18:05 -0800 Subject: [PATCH 08/69] updates to support dnn to bnn imodel auto conversion --- .../examples/main_bayesian_cifar_dnn2bnn.py | 522 ++++++++++++++++++ bayesian_torch/models/dnn_to_bnn.py | 165 ++++++ 2 files changed, 687 insertions(+) create mode 100644 bayesian_torch/examples/main_bayesian_cifar_dnn2bnn.py create mode 100644 bayesian_torch/models/dnn_to_bnn.py diff --git a/bayesian_torch/examples/main_bayesian_cifar_dnn2bnn.py b/bayesian_torch/examples/main_bayesian_cifar_dnn2bnn.py new file mode 100644 index 0000000..8305844 --- /dev/null +++ b/bayesian_torch/examples/main_bayesian_cifar_dnn2bnn.py @@ -0,0 +1,522 @@ +import argparse +import os +import shutil +import time + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import bayesian_torch.models.deterministic.resnet as resnet +import numpy as np +from 
bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss + +model_names = sorted( + name + for name in resnet.__dict__ + if name.islower() and not name.startswith("__") and name.startswith("resnet") and callable(resnet.__dict__[name]) +) + +print(model_names) +len_trainset = 50000 +len_testset = 10000 + +parser = argparse.ArgumentParser(description="CIFAR10") +parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="resnet20", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet20)", +) +parser.add_argument( + "-j", "--workers", default=8, type=int, metavar="N", help="number of data loading workers (default: 8)" +) +parser.add_argument("--epochs", default=200, type=int, metavar="N", help="number of total epochs to run") +parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="manual epoch number (useful on restarts)") +parser.add_argument("-b", "--batch-size", default=128, type=int, metavar="N", help="mini-batch size (default: 512)") +parser.add_argument("--lr", "--learning-rate", default=0.001, type=float, metavar="LR", help="initial learning rate") +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--weight-decay", "--wd", default=1e-4, type=float, metavar="W", help="weight decay (default: 5e-4)" +) +parser.add_argument("--print-freq", "-p", default=50, type=int, metavar="N", help="print frequency (default: 20)") +parser.add_argument("--resume", default="", type=str, metavar="PATH", help="path to latest checkpoint (default: none)") +parser.add_argument("-e", "--evaluate", dest="evaluate", action="store_true", help="evaluate model on validation set") +parser.add_argument("--pretrained", dest="pretrained", action="store_true", help="use pre-trained model") +parser.add_argument("--half", dest="half", action="store_true", help="use half-precision(16-bit) ") +parser.add_argument( + "--save-dir", + dest="save_dir", + help="The directory used to save the trained models", + default="./checkpoint/bayesian", + type=str, +) +parser.add_argument( + "--moped-init-model", + dest="moped_init_model", + help="DNN model to intialize MOPED method", + default="", + type=str, +) +parser.add_argument( + "--moped-delta-factor", + dest="moped_delta_factor", + help="MOPED delta scale factor", + default=0.2, + type=float, +) + +parser.add_argument( + "--bnn-rho-init", + dest="bnn_rho_init", + help="rho init for bnn layers", + default=-3.0, + type=float, +) + +parser.add_argument( + "--use-flipout-layers", + type=bool, + default=False, + metavar="use_flipout_layers", + help="Use Flipout layers for BNNs, default is Reparameterization layers", +) + +parser.add_argument( + "--save-every", + dest="save_every", + help="Saves checkpoints at every specified number of epochs", + type=int, + default=10, +) +parser.add_argument("--mode", type=str, required=True, help="train | test") + +parser.add_argument( + "--num_monte_carlo", + type=int, + default=20, + metavar="N", + help="number of Monte Carlo samples to be drawn during inference", +) +parser.add_argument("--num_mc", type=int, default=1, metavar="N", help="number of Monte Carlo runs during training") +parser.add_argument( + "--tensorboard", + type=bool, + default=True, + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) +parser.add_argument( + "--log_dir", + type=str, + default="./logs/cifar/bayesian", + metavar="N", + help="use tensorboard for logging and visualization of 
training progress", +) + +best_prec1 = 0 + + +def main(): + global args, best_prec1 + args = parser.parse_args() + moped_enable = False + if len(args.moped_init_model) > 0: # use moped method if trained dnn model weights are provided + moped_enable = True + + const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": args.bnn_rho_init, + "type": "Flipout" if args.use_flipout_layers else "Reparameterization", # Flipout or Reparameterization + "moped_enable": moped_enable, # initialize mu/sigma from the dnn weights + "moped_delta": args.moped_delta_factor, + } + + # Check the save_dir exists or not + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + model = torch.nn.DataParallel(resnet.__dict__[args.arch]()) + model.cuda() if torch.cuda.is_available() else model.cpu() + if moped_enable: + checkpoint = torch.load(args.moped_init_model) + if "state_dict" in checkpoint.keys(): + model.load_state_dict(checkpoint["state_dict"]) + else: + model.load_state_dict(checkpoint) + + dnn_to_bnn(model, const_bnn_prior_parameters) # only replaces linear and conv layers + if torch.cuda.is_available(): + model.cuda() + else: + model.cpu() + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + args.start_epoch = checkpoint["epoch"] + best_prec1 = checkpoint["best_prec1"] + model.load_state_dict(checkpoint) + print("=> loaded checkpoint '{}' (epoch {})".format(args.evaluate, checkpoint["epoch"])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + tb_writer = None + if args.tensorboard: + logger_dir = os.path.join(args.log_dir, "tb_logger") + if not os.path.exists(logger_dir): + os.makedirs(logger_dir) + tb_writer = SummaryWriter(logger_dir) + + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + train_loader = torch.utils.data.DataLoader( + datasets.CIFAR10( + root="./data", + train=True, + transform=transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32, 4), + transforms.ToTensor(), + normalize, + ] + ), + download=True, + ), + batch_size=args.batch_size, + shuffle=True, + num_workers=args.workers, + pin_memory=True, + ) + + val_loader = torch.utils.data.DataLoader( + datasets.CIFAR10( + root="./data", + train=False, + transform=transforms.Compose( + [ + transforms.ToTensor(), + normalize, + ] + ), + ), + batch_size=args.batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + ) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + if torch.cuda.is_available(): + criterion = nn.CrossEntropyLoss().cuda() + else: + criterion = nn.CrossEntropyLoss().cpu() + + if args.half: + model.half() + criterion.half() + + if args.arch in ["resnet110"]: + for param_group in optimizer.param_groups: + param_group["lr"] = args.lr * 0.1 + + if args.evaluate: + validate(val_loader, model, criterion) + return + + if args.mode == "train": + + for epoch in range(args.start_epoch, args.epochs): + + lr = args.lr + if epoch >= 80 and epoch < 120: + lr = 0.1 * args.lr + elif epoch >= 120 and epoch < 160: + lr = 0.01 * args.lr + elif epoch >= 160 and epoch < 180: + lr = 0.001 * args.lr + elif epoch >= 180: + lr = 0.0005 * args.lr + + optimizer = torch.optim.Adam(model.parameters(), lr) + + # train for one epoch + print("current lr 
{:.5e}".format(optimizer.param_groups[0]["lr"])) + train(args, train_loader, model, criterion, optimizer, epoch, tb_writer) + + prec1 = validate(args, val_loader, model, criterion, epoch, tb_writer) + + is_best = prec1 > best_prec1 + best_prec1 = max(prec1, best_prec1) + + if is_best: + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model.state_dict(), + "best_prec1": best_prec1, + }, + is_best, + filename=os.path.join(args.save_dir, "bayesian_{}_cifar.pth".format(args.arch)), + ) + + elif args.mode == "test": + checkpoint_file = args.save_dir + "/bayesian_{}_cifar.pth".format(args.arch) + if torch.cuda.is_available(): + checkpoint = torch.load(checkpoint_file) + else: + checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + model.load_state_dict(checkpoint["state_dict"]) + evaluate(args, model, val_loader) + + +def train(args, train_loader, model, criterion, optimizer, epoch, tb_writer=None): + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to train mode + model.train() + + end = time.time() + for i, (input, target) in enumerate(train_loader): + + # measure data loading time + data_time.update(time.time() - end) + + if torch.cuda.is_available(): + target = target.cuda() + input_var = input.cuda() + target_var = target + else: + target = target.cpu() + input_var = input.cpu() + target_var = target + + if args.half: + input_var = input_var.half() + + # compute output + output_ = [] + kl_ = [] + for mc_run in range(args.num_mc): + output = model(input_var) + kl = get_kl_loss(model) + output_.append(output) + kl_.append(kl) + output = torch.mean(torch.stack(output_), dim=0) + kl = torch.mean(torch.stack(kl_), dim=0) + cross_entropy_loss = criterion(output, target_var) + scaled_kl = kl / args.batch_size + + # ELBO loss + loss = cross_entropy_loss + scaled_kl + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + output = output.float() + loss = loss.float() + # measure accuracy and record loss + prec1 = accuracy(output.data, target)[0] + losses.update(loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print( + "Epoch: [{0}][{1}/{2}]\t" + "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + "Data {data_time.val:.3f} ({data_time.avg:.3f})\t" + "Loss {loss.val:.4f} ({loss.avg:.4f})\t" + "Prec@1 {top1.val:.3f} ({top1.avg:.3f})".format( + epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1 + ) + ) + + if tb_writer is not None: + tb_writer.add_scalar("train/cross_entropy_loss", cross_entropy_loss.item(), epoch) + tb_writer.add_scalar("train/kl_div", scaled_kl.item(), epoch) + tb_writer.add_scalar("train/elbo_loss", loss.item(), epoch) + tb_writer.add_scalar("train/accuracy", prec1.item(), epoch) + tb_writer.flush() + + +def validate(args, val_loader, model, criterion, epoch, tb_writer=None): + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + with torch.no_grad(): + for i, (input, target) in enumerate(val_loader): + if torch.cuda.is_available(): + target = target.cuda() + input_var = input.cuda() + target_var = target.cuda() + else: + target = target.cpu() + input_var = input.cpu() + target_var = target.cpu() + + if args.half: + input_var = input_var.half() + + # 
compute output + output_ = [] + kl_ = [] + for mc_run in range(args.num_mc): + output = model(input_var) + kl = get_kl_loss(model) + output_.append(output) + kl_.append(kl) + output = torch.mean(torch.stack(output_), dim=0) + kl = torch.mean(torch.stack(kl_), dim=0) + cross_entropy_loss = criterion(output, target_var) + # scaled_kl = kl / len_trainset + scaled_kl = kl / args.batch_size + # scaled_kl = 0.2 * (kl / len_trainset) + + # ELBO loss + loss = cross_entropy_loss + scaled_kl + + output = output.float() + loss = loss.float() + + # measure accuracy and record loss + prec1 = accuracy(output.data, target)[0] + losses.update(loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print( + "Test: [{0}/{1}]\t" + "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + "Loss {loss.val:.4f} ({loss.avg:.4f})\t" + "Prec@1 {top1.val:.3f} ({top1.avg:.3f})".format( + i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1 + ) + ) + + if tb_writer is not None: + tb_writer.add_scalar("val/cross_entropy_loss", cross_entropy_loss.item(), epoch) + tb_writer.add_scalar("val/kl_div", scaled_kl.item(), epoch) + tb_writer.add_scalar("val/elbo_loss", loss.item(), epoch) + tb_writer.add_scalar("val/accuracy", prec1.item(), epoch) + tb_writer.flush() + + print(" * Prec@1 {top1.avg:.3f}".format(top1=top1)) + + return top1.avg + + +def evaluate(args, model, val_loader): + pred_probs_mc = [] + test_loss = 0 + correct = 0 + output_list = [] + labels_list = [] + model.eval() + with torch.no_grad(): + begin = time.time() + for data, target in val_loader: + if torch.cuda.is_available(): + data, target = data.cuda(), target.cuda() + else: + data, target = data.cpu(), target.cpu() + output_mc = [] + for mc_run in range(args.num_monte_carlo): + output = model.forward(data) + output_mc.append(output) + output_ = torch.stack(output_mc) + output_list.append(output_) + labels_list.append(target) + end = time.time() + print("inference throughput: ", len_testset / (end - begin), " images/s") + + output = torch.stack(output_list) + output = output.permute(1, 0, 2, 3) + output = output.contiguous().view(args.num_monte_carlo, len_testset, -1) + output = torch.nn.functional.softmax(output, dim=2) + labels = torch.cat(labels_list) + pred_mean = output.mean(dim=0) + Y_pred = torch.argmax(pred_mean, axis=1) + print("Test accuracy:", (Y_pred.data.cpu().numpy() == labels.data.cpu().numpy()).mean() * 100) + np.save("./probs_cifar_mc.npy", output.data.cpu().numpy()) + np.save("./cifar_test_labels_mc.npy", labels.data.cpu().numpy()) + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + """ + Save the training model + """ + torch.save(state, filename) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / 
batch_size)) + return res + + +if __name__ == "__main__": + main() diff --git a/bayesian_torch/models/dnn_to_bnn.py b/bayesian_torch/models/dnn_to_bnn.py new file mode 100644 index 0000000..18b9b51 --- /dev/null +++ b/bayesian_torch/models/dnn_to_bnn.py @@ -0,0 +1,165 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Functions related to DNN to BNN model conversion. +# +# @authors: Mahesh Subedar +# +# =============================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import bayesian_torch.layers as bayesian_layers +from bayesian_torch.utils.util import get_rho + + +# -------------------------------------------------------------------------------- +# Parameters used to define BNN layyers. 
+# bnn_prior_parameters = { +# "prior_mu": 0.0, +# "prior_sigma": 1.0, +# "posterior_mu_init": 0.0, +# "posterior_rho_init": -4.0, +# "type": "Reparameterization", # Flipout or Reparameterization +# } + + +def bnn_linear_layer(params, d): + layer_type = d.__class__.__name__ + params["type"] + layer_fn = getattr(bayesian_layers, layer_type) # Get BNN layer + bnn_layer = layer_fn( + in_features=d.in_features, + out_features=d.out_features, + prior_mean=params["prior_mu"], + prior_variance=params["prior_sigma"], + posterior_mu_init=params["posterior_mu_init"], + posterior_rho_init=params["posterior_rho_init"], + bias=d.bias is not None, + ) + # if MOPED is enabled initialize mu and sigma + if params["moped_enable"]: + delta = params["moped_delta"] + bnn_layer.mu_weight.data.copy_(d.weight.data) + bnn_layer.rho_weight.data.copy_(get_rho(d.weight.data, delta)) + if bnn_layer.mu_bias is not None: + bnn_layer.mu_bias.data.copy_(d.bias.data) + bnn_layer.rho_bias.data.copy_(get_rho(d.bias.data, delta)) + bnn_layer.dnn_to_bnn_flag = True + return bnn_layer + + +def bnn_conv_layer(params, d): + layer_type = d.__class__.__name__ + params["type"] + layer_fn = getattr(bayesian_layers, layer_type) # Get BNN layer + bnn_layer = layer_fn( + in_channels=d.in_channels, + out_channels=d.out_channels, + kernel_size=d.kernel_size[0], + stride=d.stride, + padding=d.padding, + dilation=d.dilation, + groups=d.groups, + prior_mean=params["prior_mu"], + prior_variance=params["prior_sigma"], + posterior_mu_init=params["posterior_mu_init"], + posterior_rho_init=params["posterior_rho_init"], + bias=d.bias is not None, + ) + + # if MOPED is enabled, initialize mu and sigma + if params["moped_enable"]: + delta = params["moped_delta"] + bnn_layer.mu_kernel.data.copy_(d.weight.data) + bnn_layer.rho_kernel.data.copy_(get_rho(d.weight.data, delta)) + if bnn_layer.mu_bias is not None: + bnn_layer.mu_bias.data.copy_(d.bias.data) + bnn_layer.rho_bias.data.copy_(get_rho(d.bias.data, delta)) + bnn_layer.dnn_to_bnn_flag = True + return bnn_layer + + +def bnn_lstm_layer(params, d): + layer_type = d.__class__.__name__ + params["type"] + layer_fn = getattr(bayesian_layers, layer_type) # Get BNN layer + bnn_layer = layer_fn( + in_features=d.input_size, + out_features=d.hidden_size, + prior_mean=params["prior_mu"], + prior_variance=params["prior_sigma"], + posterior_mu_init=params["posterior_mu_init"], + posterior_rho_init=params["posterior_rho_init"], + bias=d.bias is not None, + ) + # if MOPED is enabled initialize mu and sigma + if params["moped_enable"]: + print("WARNING: MOPED method is not supported for LSTM layers!!!") + bnn_layer.dnn_to_bnn_flag = True + return bnn_layer + + +# replaces linear and conv layers +# bnn_prior_parameters - check the template at the top. 
+def dnn_to_bnn(m, bnn_prior_parameters): + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + dnn_to_bnn(m._modules[name], bnn_prior_parameters) + elif "Conv" in m._modules[name].__class__.__name__: + setattr( + m, + name, + bnn_conv_layer( + bnn_prior_parameters, + m._modules[name])) + elif "Linear" in m._modules[name].__class__.__name__: + setattr( + m, + name, + bnn_linear_layer( + bnn_prior_parameters, + m._modules[name])) + elif "LSTM" in m._modules[name].__class__.__name__: + setattr( + m, + name, + bnn_lstm_layer( + bnn_prior_parameters, + m._modules[name])) + else: + pass + return + + +def get_kl_loss(m): + kl_loss = None + for layer in m.modules(): + if hasattr(layer, "kl_loss"): + if kl_loss is None: + kl_loss = layer.kl_loss() + else: + kl_loss += layer.kl_loss() + return kl_loss From b2f81a3fa8df54973144615bc61af1472e08cff7 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 13:06:16 -0800 Subject: [PATCH 09/69] Update README.md --- README.md | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9d77441..12c9bc4 100644 --- a/README.md +++ b/README.md @@ -27,21 +27,20 @@ The repository has implementations for the following Bayesian layers: LinearRadial Conv1dRadial, Conv2dRadial, Conv3dRadial, ConvTranspose1dRadial, ConvTranspose2dRadial, ConvTranspose3dRadial LSTMRadial ---> - [ ] **Variational layers with Gaussian mixture model (GMM) posteriors using reparameterized Monte Carlo estimators** (in `pre-alpha`) LinearMixture Conv1dMixture, Conv2dMixture, Conv3dMixture, ConvTranspose1dMixture, ConvTranspose2dMixture, ConvTranspose3dMixture LSTMMixture +--> Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian layers for details. Other features include: - [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] - [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] -- [ ] dnn_to_bnn: An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition by replacing neural network layers with corresponding Bayesian layers (`updating soon...`) - +- [x] [dnn_to_bnn](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. 
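As a rough illustration of what the conversion does, the sketch below converts a small deterministic model in place and collects the KL term with `get_kl_loss`; the toy `nn.Sequential` model, input shapes and parameter values are only illustrative. The replaced layers pick up names such as `Conv2dReparameterization` and `LinearReparameterization`.
```
import torch
import torch.nn as nn
from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss

# Illustrative deterministic model; any architecture built from Conv/Linear/LSTM layers works.
dnn = nn.Sequential(
    nn.Conv2d(3, 8, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(8 * 32 * 32, 10),
)

bnn_prior_parameters = {
    "prior_mu": 0.0,
    "prior_sigma": 1.0,
    "posterior_mu_init": 0.0,
    "posterior_rho_init": -3.0,
    "type": "Reparameterization",  # or "Flipout"
    "moped_enable": False,         # set True only when the dnn weights are pretrained (MOPED init)
    "moped_delta": 0.2,
}

dnn_to_bnn(dnn, bnn_prior_parameters)    # replaces layers in place, returns None
print([type(m).__name__ for m in dnn])   # Conv2dReparameterization, ReLU, Flatten, LinearReparameterization

logits = dnn(torch.randn(4, 3, 32, 32))  # converted layers return only the output
kl = get_kl_loss(dnn)                    # sum of per-layer KL terms for the ELBO objective
```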
## Installation @@ -119,19 +118,44 @@ sh scripts/test_deterministic_cifar.sh If you use this code, please cite as: ```sh @misc{krishnan2020bayesiantorch, - author = {Ranganath Krishnan and Piero Esposito}, + author = {Ranganath Krishnan and Piero Esposito and Mahesh Subedar}, title = {Bayesian-Torch: Bayesian neural network layers for uncertainty estimation}, year = {2020}, publisher = {GitHub}, howpublished = {\url{https://github.com/IntelLabs/bayesian-torch}} } ``` - -Cite the weight sampling methods as well: [Blundell et al. 2015](https://arxiv.org/abs/1505.05424); [Wen et al. 2018](https://arxiv.org/abs/1803.04386) +Accuracy versus Uncertainty Calibration (AvUC) loss +```sh +@inproceedings{NEURIPS2020_d3d94468, + title = {Improving model calibration with accuracy versus uncertainty optimization}, + author = {Krishnan, Ranganath and Tickoo, Omesh}, + booktitle = {Advances in Neural Information Processing Systems}, + volume = {33}, + pages = {18237--18248}, + year = {2020}, + url = {https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf} + +} +``` +MOdel Priors with Empirical Bayes using DNN (MOPED) +```sh +@inproceedings{krishnan2020specifying, + title={Specifying weight priors in bayesian deep neural networks with empirical bayes}, + author={Krishnan, Ranganath and Subedar, Mahesh and Tickoo, Omesh}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={34}, + number={04}, + pages={4477--4484}, + year={2020}, + url = {https://ojs.aaai.org/index.php/AAAI/article/view/5875} +} +``` **Contributors** - Ranganath Krishnan - Piero Esposito +- Mahesh Subedar This code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep neural network predictions using stochastic variational inference in Bayesian neural networks. Feedbacks, issues and contributions are welcome. Email to for any questions. From 42724f5ad76c50a9a749167d077b180bd5a18009 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 13:10:27 -0800 Subject: [PATCH 10/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 12c9bc4..1495f01 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian Other features include: - [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] +- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] - [x] [dnn_to_bnn](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. 
This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. ## Installation From d8b9940dd411e6c6651cc1dfd1d2daa21f64778d Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 06:21:34 -0800 Subject: [PATCH 11/69] update the posterior variational param init value Signed-off-by: Ranganath Krishnan --- bayesian_torch/models/bayesian/resnet_variational.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/models/bayesian/resnet_variational.py b/bayesian_torch/models/bayesian/resnet_variational.py index 49d3086..74e1d16 100644 --- a/bayesian_torch/models/bayesian/resnet_variational.py +++ b/bayesian_torch/models/bayesian/resnet_variational.py @@ -20,7 +20,7 @@ prior_mu = 0.0 prior_sigma = 1.0 posterior_mu_init = 0.0 -posterior_rho_init = -2.0 +posterior_rho_init = -3.0 def _weights_init(m): From 8d4e1366bb7d73dbd98c723c9162ccada26fd6b2 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 06:26:08 -0800 Subject: [PATCH 12/69] remove duplicate kl_loss definition in Conv1dReparameterization layer Signed-off-by: Ranganath Krishnan --- .../layers/variational_layers/conv_variational.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 1d55363..7855ad8 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -169,15 +169,6 @@ def kl_loss(self): return kl - def kl_loss(self): - sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) - if self.bias: - sigma_bias = torch.log1p(torch.exp(self.rho_bias)) - kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) - - return kl - def forward(self, input, return_kl=True): if self.dnn_to_bnn_flag: return_kl = False From bc6681bbed3f7d4a963536250abf505c77f2bcbe Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 06:50:57 -0800 Subject: [PATCH 13/69] include kl_loss() function in Convolutional flipout layers, to compute kl when 'return_kl' flag is set to False. Fix for issue#12. 
Signed-off-by: Ranganath Krishnan --- .../layers/flipout_layers/conv_flipout.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index 5214a99..ce13897 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -152,6 +152,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -311,6 +319,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -469,6 +485,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -624,6 +648,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -784,6 +816,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -944,6 +984,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = 
torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: From 06922f86c5672b178c84ede714e461cbc13ad439 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 13:53:28 -0800 Subject: [PATCH 14/69] Update release version with dnn_to_bnn() feature --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 533df3a..0ff1bc1 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name = "bayesian-torch", packages = find_packages(), - version = "0.1", + version = "0.2", description = "Bayesian layers and utilities to perform stochastic variational inference in PyTorch", author = "ranganath.krishnan@intel.com", url = "https://github.com/IntelLabs/bayesian-torch", From fa51c94f7de025ef2291f85fb4ba81a6fcbb5831 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 13 Jan 2022 00:16:17 -0800 Subject: [PATCH 15/69] Update README.md update usage instructions in README file --- README.md | 60 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1495f01..5f33aff 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,13 @@ Other features include: - [x] [dnn_to_bnn](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. ## Installation - - -**Install from source:** + +**To install latest development version from source:** ```sh git clone https://github.com/IntelLabs/bayesian-torch cd bayesian-torch @@ -61,15 +65,52 @@ Dependencies: - pip install tensorboard - pip install scikit-learn -## Example usage -We have provided [example model implementations](bayesian_torch/models/bayesian) using the Bayesian layers. +## Usage +There are two ways to build Bayesian deep neural networks using Bayesian-Torch: +1. Convert an existing deterministic deep neural network (dnn) model to Bayesian deep neural network (bnn) model with dnn_to_bnn() +2. Define your custom model using the Bayesian layers ([Flipout](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers) or [Reparameterization](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)) -We also provide [example usages](bayesian_torch/examples) and [scripts](bayesian_torch/scripts) to train/evaluate the models. The instructions for CIFAR10 examples is provided below, similar scripts for ImageNet and MNIST are available. 
+(1) For instance to build Bayesian-ResNet18 from torchvision deterministic ResNet18 model: +``` +import torchvision +from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn + +const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": -3.0, + "type": "Reparameterization", # Flipout or Reparameterization + "moped_enable": False, # True to initialize mu/sigma from the pretrained dnn weights + "moped_delta": 0.2, +} + +model = torchvision.models.resnet18() +dnn_to_bnn(model, const_bnn_prior_parameters) +``` +To use MOPED method, setting the prior and initializing variational parameters from a pretrained determined model (helps training convergence of larger models): +``` +const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": -3.0, + "type": "Reparameterization", # Flipout or Reparameterization + "moped_enable": True, # True to initialize mu/sigma from the pretrained dnn weights + "moped_delta": 0.2, +} + +model = torchvision.models.resnet18(pretrained=True) +dnn_to_bnn(model, const_bnn_prior_parameters) +``` +(2) For building custom models, we have provided [example model implementations](bayesian_torch/models/bayesian) using the Bayesian layers. +## Example usage (training and evaluation of models) + +We have provided [example usages](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/examples) and [scripts](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/scripts) to train/evaluate the models. The instructions for CIFAR10 examples is provided below, similar scripts for ImageNet and MNIST are available. ``` cd bayesian_torch ``` - ### Training To train Bayesian ResNet on CIFAR10, run this command: @@ -152,11 +193,6 @@ MOdel Priors with Empirical Bayes using DNN (MOPED) } ``` -**Contributors** -- Ranganath Krishnan -- Piero Esposito -- Mahesh Subedar - This code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep neural network predictions using stochastic variational inference in Bayesian neural networks. Feedbacks, issues and contributions are welcome. Email to for any questions. 
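As an aside on the `moped_delta` factor used in the snippets above: MOPED seeds the scale of the variational posterior from the pretrained weights. Below is a minimal sketch of that initialization, assuming (per the MOPED paper) a posterior scale of `delta * |w|` mapped into the rho parameterization `sigma = log(1 + exp(rho))` used by the layers; the helper name here is hypothetical, and in the repository this role is played by the `get_rho` utility.
```
import torch

def moped_rho_init(weight, delta):
    # Sketch only: sigma = delta * |w|, inverted through the softplus that the
    # Bayesian layers use to recover sigma from rho (sigma = log(1 + exp(rho))).
    sigma = delta * weight.abs()
    return torch.log(torch.expm1(sigma) + 1e-20)  # epsilon guards log(0) for zero weights

w = torch.randn(64, 128) * 0.05        # stand-in for pretrained weights
rho = moped_rho_init(w, delta=0.5)
# softplus(rho) recovers delta * |w| (up to the epsilon)
print(torch.allclose(torch.nn.functional.softplus(rho), 0.5 * w.abs(), atol=1e-4))
```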
From c3ca3f81af08833b7049ba6ffb82699d273bba7e Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 17 Jan 2022 15:20:35 -0800 Subject: [PATCH 16/69] Update requirements.txt --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index f67a034..2452240 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -torch -torchvision -tensorboard -scikit-learn \ No newline at end of file +torch>=1.7.0 +torchvision>=0.8.1 +tensorboard>=1.15.0 +scikit-learn>=0.20.3 From de85018aee4927d4324e10f0c1b53095f85dbec8 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 17 Jan 2022 16:56:43 -0800 Subject: [PATCH 17/69] Include training, testing and uncertainty quantification snippet in README.md --- README.md | 55 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 5f33aff..2965ee4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Bayesian-Torch: Bayesian neural network layers for uncertainty estimation -**[Get started](#Installation)** | **[Example usage](#example-usage)** | **[Documentation](doc/bayesian_torch.layers.md)** | **[License](LICENSE)** | **[Citing](#citing)** +**[Get started](#installation)** | **[Example usage](#example-usage-training-and-evaluation-of-models)** | **[Documentation](doc/bayesian_torch.layers.md)** | **[License](LICENSE)** | **[Citing](#citing)** ### Bayesian layers and utilities to perform stochastic variational inference in PyTorch @@ -38,9 +38,9 @@ The repository has implementations for the following Bayesian layers: Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian layers for details. Other features include: +- [x] [dnn_to_bnn()](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. +- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors in Bayesian neural networks with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] - [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] -- [x] [dnn_to_bnn](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. 
This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. ## Installation ## Usage There are two ways to build Bayesian deep neural networks using Bayesian-Torch: 1. Convert an existing deterministic deep neural network (dnn) model to Bayesian deep neural network (bnn) model with dnn_to_bnn() 2. Define your custom model using the Bayesian layers ([Flipout](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers) or [Reparameterization](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)) -(1) For instance to build Bayesian-ResNet18 from torchvision deterministic ResNet18 model: +(1) For instance, building Bayesian-ResNet18 from torchvision deterministic ResNet18 model is as simple as: ``` +import torch import torchvision -from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn +from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss const_bnn_prior_parameters = { "prior_mu": 0.0, @@ -82,13 +84,13 @@ const_bnn_prior_parameters = { "posterior_rho_init": -3.0, "type": "Reparameterization", # Flipout or Reparameterization "moped_enable": False, # True to initialize mu/sigma from the pretrained dnn weights - "moped_delta": 0.2, + "moped_delta": 0.5, } model = torchvision.models.resnet18() dnn_to_bnn(model, const_bnn_prior_parameters) ``` -To use MOPED method, setting the prior and initializing variational parameters from a pretrained determined model (helps training convergence of larger models): +To use MOPED method, setting the prior and initializing variational parameters from a pretrained deterministic model (helps training convergence of larger models): ``` const_bnn_prior_parameters = { "prior_mu": 0.0, @@ -97,12 +99,47 @@ const_bnn_prior_parameters = { "posterior_rho_init": -3.0, "type": "Reparameterization", # Flipout or Reparameterization "moped_enable": True, # True to initialize mu/sigma from the pretrained dnn weights - "moped_delta": 0.2, + "moped_delta": 0.5, } model = torchvision.models.resnet18(pretrained=True) dnn_to_bnn(model, const_bnn_prior_parameters) ``` +Training snippet: +``` +criterion = torch.nn.CrossEntropyLoss() +optimizer = torch.optim.Adam(model.parameters(), args.learning_rate) + +output = model(x_train) +kl = get_kl_loss(model) +ce_loss = criterion(output, y_train) +loss = ce_loss + kl / args.batch_size + +loss.backward() +optimizer.step() +``` +Testing snippet: +``` +model.eval() +with torch.no_grad(): + output_mc = [] + for mc_run in range(args.num_monte_carlo): + logits = model(x_test) + probs = torch.nn.functional.softmax(logits, dim=-1) + output_mc.append(probs) + output = torch.stack(output_mc) + pred_mean = output.mean(dim=0) + y_pred = torch.argmax(pred_mean, axis=-1) + test_acc = (y_pred.data.cpu().numpy() == y_test.data.cpu().numpy()).mean() +``` +Uncertainty Quantification: +``` +from utils.util import predictive_entropy, mutual_information + +predictive_uncertainty = predictive_entropy(output.data.cpu().numpy()) +model_uncertainty = mutual_information(output.data.cpu().numpy()) +``` + (2) For building custom models, we have provided [example model implementations](bayesian_torch/models/bayesian) using the Bayesian layers. 
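For a sense of what option (2) looks like, below is a minimal custom-model sketch built directly from the Bayesian layers; the layer sizes are arbitrary, and it assumes that each Bayesian layer's forward call returns `(output, kl)` when `return_kl` is left at its default.
```
import torch
import torch.nn as nn
import torch.nn.functional as F
from bayesian_torch.layers import LinearReparameterization

class BayesianMLP(nn.Module):
    """Illustrative two-layer Bayesian MLP assembled from Bayesian layers."""
    def __init__(self, in_features=784, hidden=128, num_classes=10):
        super().__init__()
        self.fc1 = LinearReparameterization(
            in_features, hidden,
            prior_mean=0.0, prior_variance=1.0,
            posterior_mu_init=0.0, posterior_rho_init=-3.0)
        self.fc2 = LinearReparameterization(
            hidden, num_classes,
            prior_mean=0.0, prior_variance=1.0,
            posterior_mu_init=0.0, posterior_rho_init=-3.0)

    def forward(self, x):
        x, kl1 = self.fc1(x)      # weights are sampled on every call
        x = F.relu(x)
        x, kl2 = self.fc2(x)
        return x, kl1 + kl2       # accumulate the per-layer KL terms

model = BayesianMLP()
logits, kl = model(torch.randn(32, 784))
loss = F.cross_entropy(logits, torch.randint(0, 10, (32,))) + kl / 32  # ELBO-style loss
```
Accumulating the per-layer KL terms by hand in `forward` is what `get_kl_loss()` automates for models converted with `dnn_to_bnn()`.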
## Example usage (training and evaluation of models) From 57ac5df8a93edef21e08c2a338e83f5d122faafc Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 17 Jan 2022 17:24:16 -0800 Subject: [PATCH 18/69] update version in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0ff1bc1..16cdc9f 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name = "bayesian-torch", packages = find_packages(), - version = "0.2", + version = "0.2.0", description = "Bayesian layers and utilities to perform stochastic variational inference in PyTorch", author = "ranganath.krishnan@intel.com", url = "https://github.com/IntelLabs/bayesian-torch", From e38d6961c5e4249eda0d5573cc43eca601973f72 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 26 Jan 2022 15:36:18 -0800 Subject: [PATCH 19/69] Update bayesian_torch.layers.md --- doc/bayesian_torch.layers.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/bayesian_torch.layers.md b/doc/bayesian_torch.layers.md index d03c3cf..80995a1 100644 --- a/doc/bayesian_torch.layers.md +++ b/doc/bayesian_torch.layers.md @@ -3,8 +3,10 @@ A set of Bayesian neural network layers to perform stochastic variational infere - Variational layers with reparameterized Monte Carlo estimators [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] - Variational layers with Flipout Monte Carlo estimators [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] + # Layers @@ -29,7 +31,7 @@ A set of Bayesian neural network layers to perform stochastic variational infere * [ConvTranspose3dFlipout](#class-convtranspose3dflipout) * [LSTMFlipout](#class-lstmflipout) - + @@ -66,6 +68,7 @@ Calculates the Kullback-Leibler divergence from distribution normal Q (parametri ##### Returns torch.Tensor of shape 0 + ## class LinearReparameterization ### bayesian_torch.layers.LinearReparameterization(in_features, out_features, prior_mean, prior_variance, posterior_mu_init, posterior_rho_init, bias=True) @@ -539,6 +543,7 @@ Samples the weights with Flipout and performs LSTM feedforward operation. 
--- + From 7d343e5b2071243e38ff894f9f2e5ce0d79cc629 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 26 Jan 2022 15:38:07 -0800 Subject: [PATCH 20/69] Update links in README.md --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2965ee4..0765468 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Bayesian-Torch: Bayesian neural network layers for uncertainty estimation -**[Get started](#installation)** | **[Example usage](#example-usage-training-and-evaluation-of-models)** | **[Documentation](doc/bayesian_torch.layers.md)** | **[License](LICENSE)** | **[Citing](#citing)** +**[Get started](https://github.com/IntelLabs/bayesian-torch#installation)** | **[Example usage](https://github.com/IntelLabs/bayesian-torch#usage)** | **[Documentation](https://github.com/IntelLabs/bayesian-torch/blob/main/doc/bayesian_torch.layers.md)** | **[License](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)** | **[Citing](https://github.com/IntelLabs/bayesian-torch#citing)** ### Bayesian layers and utilities to perform stochastic variational inference in PyTorch @@ -8,14 +8,14 @@ Bayesian-Torch is designed to be flexible and seamless in extending a determinis The repository has implementations for the following Bayesian layers: -- [x] **[Variational layers with reparameterized Monte Carlo estimators](bayesian_torch/layers/variational_layers)** [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] +- [x] **[Variational layers with reparameterized Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)** [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] LinearVariational Conv1dVariational, Conv2dVariational, Conv3dVariational, ConvTranspose1dVariational, ConvTranspose2dVariational, ConvTranspose3dVariational LSTMVariational -- [x] **[Variational layers with Flipout Monte Carlo estimators](bayesian_torch/layers/flipout_layers)** [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] +- [x] **[Variational layers with Flipout Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers)** [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] LinearFlipout Conv1dFlipout, Conv2dFlipout, Conv3dFlipout, ConvTranspose1dFlipout, ConvTranspose2dFlipout, ConvTranspose3dFlipout @@ -35,7 +35,9 @@ The repository has implementations for the following Bayesian layers: LSTMMixture --> + Other features include: - [x] [dnn_to_bnn()](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. @@ -140,7 +142,7 @@ predictive_uncertainty = predictive_entropy(output.data.cpu().numpy()) model_uncertainty = mutual_information(output.data.cpu().numpy()) ``` -(2) For building custom models, we have provided [example model implementations](bayesian_torch/models/bayesian) using the Bayesian layers. 
+(2) For building custom models, we have provided [example model implementations](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/models/bayesian) using the Bayesian layers. ## Example usage (training and evaluation of models) From 83fe7174eae3ae065af7ae847270364982b16dd9 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 26 Jan 2022 15:55:56 -0800 Subject: [PATCH 21/69] Update setup.py --- setup.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 16cdc9f..2ba22c1 100644 --- a/setup.py +++ b/setup.py @@ -12,8 +12,9 @@ name = "bayesian-torch", packages = find_packages(), version = "0.2.0", - description = "Bayesian layers and utilities to perform stochastic variational inference in PyTorch", - author = "ranganath.krishnan@intel.com", + description = "Bayesian-Torch: Bayesian neural network layers for uncertainty estimation", + author = "Intel Labs", + author_email = "ranganath.krishnan@intel.com", url = "https://github.com/IntelLabs/bayesian-torch", long_description = long_desc, long_description_content_type = "text/markdown", @@ -22,6 +23,11 @@ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "Intended Audience :: Science/Research", - "Programming Language :: Python :: 3.7" + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: " + "Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: " + "Python Modules", ] ) From 7c6df36ced650f7b1dbdd01ec7e28353704a7315 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 26 Jan 2022 16:34:48 -0800 Subject: [PATCH 22/69] Update README.md --- README.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0765468..28c6f3e 100644 --- a/README.md +++ b/README.md @@ -197,12 +197,14 @@ sh scripts/test_deterministic_cifar.sh If you use this code, please cite as: ```sh -@misc{krishnan2020bayesiantorch, - author = {Ranganath Krishnan and Piero Esposito and Mahesh Subedar}, - title = {Bayesian-Torch: Bayesian neural network layers for uncertainty estimation}, - year = {2020}, - publisher = {GitHub}, - howpublished = {\url{https://github.com/IntelLabs/bayesian-torch}} +@software{krishnan2022bayesiantorch, + author = {Ranganath Krishnan and Pi Esposito and Mahesh Subedar}, + title = {Bayesian-Torch: Bayesian neural network layers for uncertainty estimation}, + month = jan, + year = 2022, + doi = {10.5281/zenodo.5908307}, + url = {https://doi.org/10.5281/zenodo.5908307} + howpublished = {\url{https://github.com/IntelLabs/bayesian-torch}} } ``` Accuracy versus Uncertainty Calibration (AvUC) loss From 3acddc9c258ed68e67e9861d53b15b17d5a689d2 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 07:10:23 -0800 Subject: [PATCH 23/69] include assets folder Signed-off-by: Ranganath Krishnan --- assets/bayesian-torch.png | Bin 0 -> 31054 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 assets/bayesian-torch.png diff --git a/assets/bayesian-torch.png b/assets/bayesian-torch.png new file mode 100644 index 0000000000000000000000000000000000000000..95f640371fcf9c07db165b85ae2170af734a9752 GIT binary patch literal 31054 zcmeFZWmMbG);1gpg$hzAlwzewa4YWa?otA@MS@cjq&O8yp=b#14y8bg2Q8(=U5lnb zf&6Z;h*S3ly|DP+2$eKIflma`ttg{?p3_WdGJWc zoozNJJPBp;I3)!I#H>!;+zC@RZ#X@~Xs|9l_2jyxr_yk(w#hG{U%Qo2XFr&~&nH29 
zn5&?nXqliR7rI$xLN_jV4<44H6nv%>|Kszi2uK7D@GiJST^MTI)}4*%y$msr`GY0{ z4IFQpSa+CaU;jvQ5;fWu3@f+-T00+-LA+N-!8N!tSbfYLe4X zR)RExA=HGP#@D#86DDL-zN0Q?@1!82Ih}seD0Z|!RTFZ2g%F=^=QEW~EmrgRXe@Um zswms9LnS4($ZNLuL=qo+7sPDX-E7+|6R9f*horn2J!3^Ey88LnFUn5e9z?NYot{8v zSR{I$thp8TSC6n>Tm><2pFd#oL{hf>?PNbL3|$lMn6N7tdQ#>Vb{4z%(=5+2|2Gp@ zeao1d5_0iE5nQ?A;;x8_Nao3SMaPg7;>VnShYD0{r!Jz8%^K~`MkCA_JJdi$sIMDs zH@T1?n*~#)ZwnbyOibDcKjxoIM_-zzvcU+;>FShXh%2fw&NGJmc)u(vqG;GA}rfh3Zr#F)mD zmDK&th8w3fbxi@B6c)QGsg|T;b}>x>&M`vE?|J1hO}z3fjLV($?vSw7_Of4&e8S3( zZ6`WcpkqKLUstuk?3%zVN8e>Mj2`B?4iC%8OKOgPm#l--JoOKxcczSn9iOA_>Icuz zkDjF}~5`jm3teN}sofxudaT)ZxWK?^_7W}+Q%<~$yJc?!2Beqz2jS=$nUhL0mv=&%a zx9z0(NFHnp?z%*b@=*k`6C=|V7ocU)-Ym7RY$PW|T8~f6508AdZ5@SH`7uW+i;9U8 z@DY(|q;yK(1g)C$v~)M;7x0$NXL64!Q-1G$ed`2SKtv8Hgzp;oWN(ee@e+R48-2c_ zsb9~Y-1WA}6~x@xogvU{8r8Lg`NN-)vm#1N#B1dyu&ixM_$Z3rq6W68%%!13!cHb% zo2)v)>$1m(@8c$^&Sf>v40V`D63#26=y~_WX<6TU4^cVBaCptG!rL8x1TTMqt7Ei@ z$(L~UL`MC=8s?JbE}goRdsLIYp?FDP+omaf&!|99KO3{wrx1v}8waNU*|x96{xRJk zj-Cxo+vi?MGl*f%t6qNlg9A|ZZ}tyr5wX!$aQPlFQ#Q2gZyf#po#m1z)aEPz{=x~W zS@7rEQQ@Oo;C!~Xwk}Y-Q^#`C_2Oh9bHbOgZC?q2@p0HODt_9wf0q@LsUB$=1p<6_>n`tSO%9TmFYZ=Ekwl=zmScFTup(o02 zq;vpTH^R;t2WIrQ4_0T<*VX>)QFnZ2P%TrXt~PDptEA)CvA6$Xqqk3!iD%wT*wRO^qJj+stXsCLZ7;fluul$EN@{Y+ z@2>=Kd2z%7U!7i(a}9nRa6%&iJ$s>=mcadIKjFK`=9V-hN+u;^Ho9-f`pLu_KDSGv z+*ZALmQzdL1|udLM8@BU`Iz1>$&b~V0-{)Iol#Bd+bjZa^MOs;FYw~!J1*PAAM7JY zQ{Q;ox~C%iEsaXL4k^)Cu0XcBfcSY6Cun+dT-C>oz&nEjnI#+k*fgCS z1X0g<+|6KlLw(Vit8ZL)03TfpW@&wq)>kv1bl9VZE6xQAKAdYRXM4Y|S<##ru^l(1 zm&+1QR9@SX?-e+&G!dJ78ey#8oHF}Vf2j3dg*^Aya%}wp%f|W-rzO8y=cd@j(Wyty zWo8{7Q@lmI-+Aw4O|MA-pzdo7u!ucT}9~2k|v$ER`s0jbE;6*KyFv6BfVg zy?GPke*I}JtmGuB-Bo73Vp!><0_>`Qd(sPGi_Bz3WHQJ5WV@t-o@hv_nl1OW&vHG- zoYByV=hJq)}9*VPc=r8fo8H_fYOk!QsJHT_2LPH||6NGga24mE|AFL(Vxi zkrs8OdEn@bccFb@yVoERJw}c8pb=Pu4R=Ku_qr0SJEdB&=?0WWaTA%X$XTsuv6X9* zzQe^MW)vWh8V$O`t?i{$kjv>^Xjx>e%{rOD>DokEeZ*P%1l$&47clAGy;%RGD-x-` z(cU&5X^c92_vm?I0eD%m^9R!R%^Dao%}=GsmtxA6|19_} zw&&ox=XR&k{NJ4&7qn|gE7(N%L_S|B`%6EiTiYgSVX{McAkr*O`~{U%DcMTiZY>~A z#;J|9Usl=$Y-uE?@~JGhX!4)D())$f|J-<9ISQ5BOHx+O!YvTaXc*--%9c5ltG7&_ z*pcsFndJ<>_bUpFI1Q0o{^T%-99n%U5NGyV1Owmu673%dCiy-Bl-fZUjbiKl_Qpkn zG}N17hgEZ%u(h+z4B)XJ9m~%p#QkRg?ZGnv!mhv=5aU}&0`D~-(^|S(7QC~Ee^dq@ zT-Es2O#}K3343$_1VZW^cWZQLNDBaL%cWH79w*yRPgI`>1v3I81zHxt{GtU2^z0Gm z-g5wlH+?TQQ1P+Sf3AUKbpezVa1nk?9@N2TjnDov+NZ;^^8%>W^`Dm2D+B@lx`U;t zc*Gk(77Kje-xfd$C{=<$e43g}8+Q$$0KNFXt^XY80t)EsJTGCg_VgnCuD8gGcY^;` zsuMZaH3$8!5MkutmO_93^iRJ4(sPgu=uZ(4FXQM#8MD$m7a)BqbN}g>CHHqA(5G7{ zR-ogr|LJ)3>-iK4C38jI=)p4qOiTz`D=J##pBsz(t@$N8Le?7vAkgv4K<@9qjT}I; z0Uft@!(z*=R&J>L^MBbOz=#HcLd&sD+zQ12p>yLb{_o2B=K~_=f*POz<+TNXIfHn> z`_KQGAAf!=1d^@jn%PIm&prompJQ6O4=Q5*ts{4n1VFKY`q$m_bYY(2Zw&sK9Eb{_ zSAhboM}hiLuvGy0@_#(io9D&lK(^LCvzJs1viowlb&>y;4`gONe@*&_hs)S4LSRWW z{|x1?3ktH;tK0ND^aqj|^Pa1!XS##`US|1wOTEeD|Lb9f!p>{K_2lMM-jnED;yOLx z$~4vapKcHSj{iNwQ5`@vBi{R;%g$^2{iji#=j#IUb2MY$Fm|^7&(9hP5KKU_kdB0APJjhZ1W2(B9RIGq z(7#^?*;>($E&svN{cp9rP!b?b4-yFv!}4?IHM7SsvJKg&{d<-5+|C8{d>`_(xf~6R z%I7ls=Z2tAp}!r2&Wo9;Zg9sis#@s%GeTK)Ae#iry7wx`pPSprxuA4chU4Fgn^ZvA zIbir%_f@}B Date: Fri, 28 Jan 2022 14:52:39 -0800 Subject: [PATCH 24/69] Update README.md --- README.md | 78 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 28c6f3e..cb788ba 100644 --- a/README.md +++ b/README.md @@ -1,52 +1,55 @@ -# Bayesian-Torch: Bayesian neural network layers for uncertainty estimation -**[Get started](https://github.com/IntelLabs/bayesian-torch#installation)** | **[Example 
usage](https://github.com/IntelLabs/bayesian-torch#usage)** | **[Documentation](https://github.com/IntelLabs/bayesian-torch/blob/main/doc/bayesian_torch.layers.md)** | **[License](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)** | **[Citing](https://github.com/IntelLabs/bayesian-torch#citing)** +
-### Bayesian layers and utilities to perform stochastic variational inference in PyTorch + +

+A library for Bayesian neural network layers and uncertainty estimation in Deep Learning +

-Bayesian-Torch is a library of neural network layers and utilities extending the core of PyTorch to enable the user to perform stochastic variational inference in Bayesian deep neural networks. -Bayesian-Torch is designed to be flexible and seamless in extending a deterministic deep neural network architecture to corresponding Bayesian form by simply replacing the deterministic layers with Bayesian layers. +[![python](https://img.shields.io/badge/python-3.7%2B-blue)]() +[![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)]() +[![version](https://img.shields.io/badge/release-0.2.0-green)]() +[![license](https://img.shields.io/badge/license-BSD%203--Clause-darkblue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) +

+ Get Started | + Example usage | + Documentation | + Citing +

+
-The repository has implementations for the following Bayesian layers: -- [x] **[Variational layers with reparameterized Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)** [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] +___ + +Bayesian-Torch is a library of neural network layers and utilities extending the core of PyTorch to enable Bayesian inference in deep learning models to quantify principled uncertainty estimates in model predictions. + +## Overview +Bayesian-Torch is designed to be flexible and enables seamless extension of deterministic deep neural network model to corresponding Bayesian form by simply replacing the deterministic layers with Bayesian layers. It enables user to perform stochastic variational inference in deep neural networks. + +**Bayesian layers:** + +* **[Variational layers with reparameterized Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)** [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] - LinearVariational - Conv1dVariational, Conv2dVariational, Conv3dVariational, ConvTranspose1dVariational, ConvTranspose2dVariational, ConvTranspose3dVariational - LSTMVariational + LinearReparameterization + Conv1dReparameterization, Conv2dReparameterization, Conv3dReparameterization, ConvTranspose1dReparameterization, ConvTranspose2dReparameterization, ConvTranspose3dReparameterization + LSTMReparameterization -- [x] **[Variational layers with Flipout Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers)** [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] +* **[Variational layers with Flipout Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers)** [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] LinearFlipout Conv1dFlipout, Conv2dFlipout, Conv3dFlipout, ConvTranspose1dFlipout, ConvTranspose2dFlipout, ConvTranspose3dFlipout LSTMFlipout - - - - -Other features include: -- [x] [dnn_to_bnn()](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. -- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors in Bayesian neural networks with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] -- [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] +**Key features:** +* [dnn_to_bnn()](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. 
drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. +* [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors in Bayesian neural networks with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] +* [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -## Installation +## Installing Bayesian-Torch + ## Usage There are two ways to build Bayesian deep neural networks using Bayesian-Torch: -1. Convert an existing deterministic deep neural network (dnn) model to Bayesian deep neural network (bnn) model with dnn_to_bnn() -2. Define your custom model using the Bayesian layers ([Flipout](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers) or [Reparameterization](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)) +1. Convert an existing deterministic deep neural network (dnn) model to Bayesian deep neural network (bnn) model with dnn_to_bnn() API +2. Define your custom model using the Bayesian layers ([Reparameterization](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers) or [Flipout](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers)) (1) For instance, building Bayesian-ResNet18 from torchvision deterministic ResNet18 model is as simple as: ``` @@ -92,7 +96,7 @@ const_bnn_prior_parameters = { model = torchvision.models.resnet18() dnn_to_bnn(model, const_bnn_prior_parameters) ``` -To use MOPED method, setting the prior and initializing variational parameters from a pretrained deterministic model (helps training convergence of larger models): +To use MOPED method i.e. setting the prior and initializing variational parameters from a pretrained deterministic model (helps training convergence of larger models): ``` const_bnn_prior_parameters = { "prior_mu": 0.0, @@ -234,7 +238,7 @@ MOdel Priors with Empirical Bayes using DNN (MOPED) } ``` -This code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep neural network predictions using stochastic variational inference in Bayesian neural networks. +This library and code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep learning model predictions using stochastic variational inference in Bayesian neural networks. Feedbacks, issues and contributions are welcome. Email to for any questions. 
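For reference, a minimal sketch of the two usage modes described in the README hunks of this patch. Only `dnn_to_bnn(model, const_bnn_prior_parameters)`, the `"prior_mu"` key, `torchvision.models.resnet18()`, the module path `bayesian_torch/models/dnn_to_bnn.py`, and the layer names come from the patch text; the remaining dictionary keys, the `get_kl_loss()` helper, the import paths, and the `(output, kl)` return signature of the layers are assumptions about the library API rather than text from this patch series.

```python
# Hypothetical sketch of approach (1): converting a deterministic torchvision
# ResNet18 to a Bayesian model with dnn_to_bnn(). Keys other than "prior_mu"
# and the get_kl_loss() helper are assumed, not quoted from the patch.
import torch
import torchvision
from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss

const_bnn_prior_parameters = {
    "prior_mu": 0.0,               # mean of the Gaussian weight prior (shown in the patch)
    "prior_sigma": 1.0,            # std of the weight prior (assumed)
    "posterior_mu_init": 0.0,      # init of the variational posterior mean (assumed)
    "posterior_rho_init": -3.0,    # init of rho, with sigma = log(1 + exp(rho)) (assumed)
    "type": "Reparameterization",  # or "Flipout" (assumed key name and values)
    "moped_enable": False,         # True: MOPED init from a pretrained model (assumed)
    "moped_delta": 0.5,            # MOPED scaling of the pretrained weights (assumed)
}

model = torchvision.models.resnet18()           # deterministic DNN
dnn_to_bnn(model, const_bnn_prior_parameters)   # replace layers in place with Bayesian layers

x = torch.randn(2, 3, 224, 224)
logits = model(x)                               # one stochastic forward pass
kl = get_kl_loss(model)                         # accumulated KL divergence (assumed helper)
loss = torch.nn.functional.cross_entropy(logits, torch.tensor([0, 1])) + kl / 2.0  # KL scaled by batch size
loss.backward()
```

To use the MOPED option described above, the deterministic model would first be loaded with pretrained weights before calling `dnn_to_bnn()` with the MOPED flags enabled; the flag names shown here are assumptions.

A sketch of approach (2), defining a custom model directly from the Bayesian layers listed in this patch; the constructor arguments mirror `torch.nn.Linear`, and the import path and the `(output, kl)` return value are assumptions:

```python
import torch
import torch.nn as nn
from bayesian_torch.layers import LinearReparameterization  # assumed import path

class SmallBNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = LinearReparameterization(in_features=784, out_features=128)
        self.fc2 = LinearReparameterization(in_features=128, out_features=10)

    def forward(self, x):
        x, kl1 = self.fc1(x)        # each Bayesian layer returns (output, kl) -- assumed
        x = torch.relu(x)
        x, kl2 = self.fc2(x)
        return x, kl1 + kl2

out, kl = SmallBNN()(torch.randn(4, 784))
```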
From a081d57c5c30de738c9c2dd411690f1218adb5f7 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 14:58:11 -0800 Subject: [PATCH 25/69] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cb788ba..75e9363 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,9 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep Learning -[![python](https://img.shields.io/badge/python-3.7%2B-blue)]() -[![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)]() -[![version](https://img.shields.io/badge/release-0.2.0-green)]() +[![python](https://img.shields.io/badge/python-3.7%2B-blue)](https://github.com/IntelLabs/bayesian-torch) +[![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) +[![version](https://img.shields.io/badge/release-0.2.0-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-darkblue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)

Get Started | From f0207c0273cea2402def384c859a1cbbaf016525 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 15:02:13 -0800 Subject: [PATCH 26/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 75e9363..77f65bf 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![version](https://img.shields.io/badge/release-0.2.0-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-darkblue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)

- Get Started | + Get Started | Example usage | Documentation | Citing From 3793ce523b261740c635e7e0597854eb8ffe1fe5 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 15:13:58 -0800 Subject: [PATCH 27/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 77f65bf..075c8f9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
- +

A library for Bayesian neural network layers and uncertainty estimation in Deep Learning

From b71fc1790a3fe58080c73b2145dafce2748aa3d3 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 15:14:29 -0800 Subject: [PATCH 28/69] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2ba22c1..ae08428 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ name = "bayesian-torch", packages = find_packages(), version = "0.2.0", - description = "Bayesian-Torch: Bayesian neural network layers for uncertainty estimation", + description = "A library for Bayesian neural network layers and uncertainty estimation in Deep Learning", author = "Intel Labs", author_email = "ranganath.krishnan@intel.com", url = "https://github.com/IntelLabs/bayesian-torch", From 5019bf9e8f4f5f501ca8857e22b50a734487dce6 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 31 Jan 2022 10:20:59 -0800 Subject: [PATCH 29/69] release to PyPI, update install instruction through "pip" command --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 075c8f9..df8d4c7 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,12 @@ Bayesian-Torch is designed to be flexible and enables seamless extension of dete * [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] ## Installing Bayesian-Torch - + **To install latest development version from source:** ```sh git clone https://github.com/IntelLabs/bayesian-torch From 7e8d246c2f2c2fd719eb7753345708e9b7cb84d8 Mon Sep 17 00:00:00 2001 From: Michael Beale Date: Mon, 31 Jan 2022 10:28:35 -0800 Subject: [PATCH 30/69] Switched to permanent URL for the top image. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index df8d4c7..999023c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
- +

A library for Bayesian neural network layers and uncertainty estimation in Deep Learning

From bf8c3e37f273a705955d091653360fa6526553d4 Mon Sep 17 00:00:00 2001 From: Michael Beale Date: Mon, 31 Jan 2022 11:15:29 -0800 Subject: [PATCH 31/69] changing to raw.githubusercontent.com Url for top image. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 999023c..65bfe1b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
- +

A library for Bayesian neural network layers and uncertainty estimation in Deep Learning

From 3ca0190a6659a1760e30773bdec8ae3c32ec9df3 Mon Sep 17 00:00:00 2001 From: Michael Beale Date: Mon, 31 Jan 2022 11:15:54 -0800 Subject: [PATCH 32/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 65bfe1b..0ed9e67 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
- +

A library for Bayesian neural network layers and uncertainty estimation in Deep Learning

From a8cb7ebd13f55621e929180d484038c60a3a1738 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 31 Jan 2022 04:33:22 -0800 Subject: [PATCH 33/69] update links and release number for PyPI documentation Signed-off-by: Ranganath Krishnan --- README.md | 6 ++---- setup.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0ed9e67..833acc5 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![python](https://img.shields.io/badge/python-3.7%2B-blue)](https://github.com/IntelLabs/bayesian-torch) [![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) -[![version](https://img.shields.io/badge/release-0.2.0-green)](https://github.com/IntelLabs/bayesian-torch/releases) -[![license](https://img.shields.io/badge/license-BSD%203--Clause-darkblue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) +[![version](https://img.shields.io/badge/release-0.2.1-green)](https://github.com/IntelLabs/bayesian-torch/releases) +[![license](https://img.shields.io/badge/license-BSD%203--Clause-blue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)

Get Started | Example usage | @@ -240,5 +240,3 @@ MOdel Priors with Empirical Bayes using DNN (MOPED) This library and code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep learning model predictions using stochastic variational inference in Bayesian neural networks. Feedbacks, issues and contributions are welcome. Email to for any questions. - - diff --git a/setup.py b/setup.py index ae08428..5a02fb8 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name = "bayesian-torch", packages = find_packages(), - version = "0.2.0", + version = "0.2.1", description = "A library for Bayesian neural network layers and uncertainty estimation in Deep Learning", author = "Intel Labs", author_email = "ranganath.krishnan@intel.com", From a5750c7c5bba7e5dd1c11464d4e96ead38cae2ee Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 9 Feb 2022 12:56:15 -0800 Subject: [PATCH 34/69] Update README.md add downloads statistics badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 833acc5..6e428bd 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) [![version](https://img.shields.io/badge/release-0.2.1-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-blue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) +[![Downloads](https://pepy.tech/badge/bayesian-torch/month)](https://pepy.tech/project/bayesian-torch)

Get Started | Example usage | From 789657dd753657d707f23289609f2c523eff99ff Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 2 Mar 2022 18:58:48 -0800 Subject: [PATCH 35/69] update download count badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6e428bd..e56245a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) [![version](https://img.shields.io/badge/release-0.2.1-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-blue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) -[![Downloads](https://pepy.tech/badge/bayesian-torch/month)](https://pepy.tech/project/bayesian-torch) +[![Downloads](https://static.pepy.tech/personalized-badge/bayesian-torch?period=total&units=international_system&left_color=grey&right_color=darkblue&left_text=downloads)](https://pepy.tech/project/bayesian-torch)

Get Started | Example usage | From 5802ef9a2730d5b3d9f93081d7f1e2267dd8bab8 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 13 Nov 2022 21:22:51 -0500 Subject: [PATCH 36/69] implement quantized convolution variational layers --- .../quantize_conv_variational.py | 994 ++++++++++++++++++ 1 file changed, 994 insertions(+) create mode 100644 bayesian_torch/layers/variational_layers/quantize_conv_variational.py diff --git a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py new file mode 100644 index 0000000..c7eafc0 --- /dev/null +++ b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py @@ -0,0 +1,994 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# @authors: Jun-Liang Lin +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter +from ..base_variational_layer import BaseVariationalLayer_ +from .conv_variational import * +import math + +__all__ = [ + 'QuantizedConv1dReparameterization', + 'QuantizedConv2dReparameterization', + 'QuantizedConv3dReparameterization', + 'QuantizedConvTranspose1dReparameterization', + 'QuantizedConvTranspose2dReparameterization', + 'QuantizedConvTranspose3dReparameterization', +] + + +class QuantizedConv1dReparameterization(Conv1dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(QuantizedConv1dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + + return out, 0 # disable kl divergence computing + + +class QuantizedConv2dReparameterization(Conv2dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + + """ + + super(QuantizedConv2dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + + return out, 0 # disable kl divergence computing + + +class QuantizedConv3dReparameterization(Conv3dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(QuantizedConv3dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + + return out, 0 # disable kl divergence computing + +class QuantizedConvTranspose1dReparameterization(ConvTranspose1dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose1dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose1d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=0.1, zero_point=128) + + + return out, 0 # disable kl divergence computing + +class QuantizedConvTranspose2dReparameterization(ConvTranspose2dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose2dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose2d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
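+ # (The quantized mul/add above already formed the int8 weight sample mu + sigma * eps;
+ # only the bias handled below stays in fp32, as noted.)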
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=0.1, zero_point=128) + + + return out, 0 # disable kl divergence computing + +class QuantizedConvTranspose3dReparameterization(ConvTranspose3dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose3dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
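+ ## Note: the quantized parameters are stored as frozen CPU tensors (requires_grad=False);
+ ## PyTorch's eager-mode quantized kernels generally run on CPU backends (e.g. fbgemm/qnnpack).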
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose3d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
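+ # (On the eps quantization above: eps ~ N(0, 1), so about 99.7% of samples fall in [-3, 3];
+ # mapping that width-6 range onto 255 int8 levels is what gives the 6/255 scale.)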
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=0.1, zero_point=128) + + + return out, 0 # disable kl divergence computing \ No newline at end of file From d910ae8bf1b7a603c89991699f4c6909d86d1c5a Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Tue, 29 Nov 2022 21:12:28 -0500 Subject: [PATCH 37/69] replace hardcoded variables with function parameters and add comments --- .../quantize_conv_variational.py | 540 +++++++++++++++--- 1 file changed, 468 insertions(+), 72 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py index c7eafc0..59470df 100644 --- a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py @@ -94,19 +94,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -165,9 +199,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. 
return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. - if mode==1: # Deprecated. Use this method for reducing model size only. + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -182,7 +248,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. @@ -198,10 +264,10 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 return out, 0 # disable kl divergence computing @@ -250,19 +316,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -321,9 +421,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. - if mode==1: # Deprecated. Use this method for reducing model size only. + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -338,7 +470,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. 
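+ # Scale bookkeeping: the quantized multiply uses the product of the operand scales as its output scale,
+ # and the following add reuses the larger operand scale, so the sampled weight keeps the coarser of the
+ # two quantization grids.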
@@ -354,10 +486,10 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 return out, 0 # disable kl divergence computing @@ -405,19 +537,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -476,9 +642,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. - if mode==1: # Deprecated. Use this method for reducing model size only. + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -493,7 +691,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. @@ -509,10 +707,10 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 return out, 0 # disable kl divergence computing @@ -559,19 +757,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -630,9 +862,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. - if mode==1: # Deprecated. Use this method for reducing model size only. + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -648,7 +912,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. 
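+ # Note that eps_kernel.data.normal_() above re-samples the noise in place on every call, so each
+ # forward pass draws a fresh Monte Carlo weight even though mu and sigma are frozen after quantize().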
@@ -664,13 +928,13 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=0.1, zero_point=128) + out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) return out, 0 # disable kl divergence computing @@ -718,19 +982,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -789,9 +1087,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. - if mode==1: # Deprecated. Use this method for reducing model size only. + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -807,7 +1137,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. @@ -823,13 +1153,13 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=0.1, zero_point=128) + out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) return out, 0 # disable kl divergence computing @@ -877,19 +1207,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -948,9 +1312,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. - if mode==1: # Deprecated. Use this method for reducing model size only. + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -966,7 +1362,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. 
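+ # The default_scale=0.1 and default_zero_point=128 used for the activations below are the fixed
+ # defaults found by grid search (see the docstring); they are not calibrated per input.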
@@ -982,13 +1378,13 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=0.1, zero_point=128) + out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) return out, 0 # disable kl divergence computing \ No newline at end of file From 878c3f2200e1a51e2ee9cb1f404c12d02ad760b2 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 4 Dec 2022 22:54:37 -0500 Subject: [PATCH 38/69] implement quantized linear variational layer --- bayesian_torch/layers/variational_layers/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bayesian_torch/layers/variational_layers/__init__.py b/bayesian_torch/layers/variational_layers/__init__.py index 1c083e3..fa39917 100644 --- a/bayesian_torch/layers/variational_layers/__init__.py +++ b/bayesian_torch/layers/variational_layers/__init__.py @@ -1,3 +1,6 @@ from .linear_variational import * from .conv_variational import * from .rnn_variational import * +# from .quantize_linear_variational import * +from .quantize_conv_variational import * +# from .quantize_rnn_variational import * \ No newline at end of file From 9c1493a363a0c46db6c556c7c683f984a04ec12f Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 4 Dec 2022 22:56:13 -0500 Subject: [PATCH 39/69] update init file --- bayesian_torch/layers/variational_layers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/layers/variational_layers/__init__.py b/bayesian_torch/layers/variational_layers/__init__.py index fa39917..6fae454 100644 --- a/bayesian_torch/layers/variational_layers/__init__.py +++ b/bayesian_torch/layers/variational_layers/__init__.py @@ -1,6 +1,6 @@ from .linear_variational import * from .conv_variational import * from .rnn_variational import * -# from .quantize_linear_variational import * +from .quantize_linear_variational import * from .quantize_conv_variational import * # from .quantize_rnn_variational import * \ No newline at end of file From 7a89b6deb8987790145cf5cc7cdfe29914743344 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 4 Dec 2022 22:59:18 -0500 Subject: [PATCH 40/69] quantized linear variational layer --- .../quantize_linear_variational.py | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 bayesian_torch/layers/variational_layers/quantize_linear_variational.py diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py new file mode 100644 index 0000000..a1ce3fd --- /dev/null +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -0,0 +1,199 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Module, Parameter +from ..base_variational_layer import BaseVariationalLayer_ +import math +from .linear_variational import LinearReparameterization + + + +class QuantizedLinearReparameterization(LinearReparameterization): + def __init__(self, + in_features, + out_features): + """ + + """ + super(QuantizedLinearReparameterization, self).__init__( + in_features, + out_features) + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_weight), requires_grad=False) + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_weight))), requires_grad=False) + delattr(self, "mu_weight") + delattr(self, "rho_weight") + + self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + delattr(self, "mu_bias") + delattr(self, "rho_bias") + + def dequantize(self): # Deprecated + self.mu_weight = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
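+ # fp32 fallback: dequantize the stored int8 parameters once, then sample in fp32.
+ # This path only reduces the saved model size; it does not use the int8 kernels.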
+ if not self.is_dequant: + self.dequantize() + self.is_dequant = True + weight = self.mu_weight + (self.sigma_weight * self.eps_weight.data.normal_()) + bias = None + if self.sigma_bias is not None: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.linear(input, weight, bias) + + else: + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + if self.quantized_sigma_bias is not None: + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + if input.dtype!=torch.quint8: + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) + + out = torch.nn.quantized.functional.linear(input, weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + out = out.dequantize() + + return out, 0 # kl=0 From 3ea51a585d49b6e057f817a85629d141c51f52a1 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 4 Dec 2022 23:40:35 -0500 Subject: [PATCH 41/69] quantized conv flipout layers --- .../layers/flipout_layers/__init__.py | 3 + .../flipout_layers/quantized_conv_flipout.py | 661 ++++++++++++++++++ 2 files changed, 664 insertions(+) create mode 100644 bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py diff --git a/bayesian_torch/layers/flipout_layers/__init__.py b/bayesian_torch/layers/flipout_layers/__init__.py index 3aeb698..fda0925 100644 --- a/bayesian_torch/layers/flipout_layers/__init__.py +++ b/bayesian_torch/layers/flipout_layers/__init__.py @@ -1,3 +1,6 @@ from .conv_flipout import * from .linear_flipout import * from .rnn_flipout import * +# from .quantized_linear_flipout import * +from .quantized_conv_flipout import * +# from .quantize_rnn_flipout import * \ No newline at end of file diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py new file mode 100644 index 0000000..45eacfe --- /dev/null +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -0,0 +1,661 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Convolutional layers with flipout Monte Carlo weight estimator to perform +# variational inference in Bayesian neural networks. Variational layers +# enables Monte Carlo approximation of the distribution over the kernel +# +# +# ====================================================================================== +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter +from ..base_variational_layer import BaseVariationalLayer_ +from .conv_flipout import * + +from torch.distributions.normal import Normal +from torch.distributions.uniform import Uniform + +__all__ = [ + 'QuantizedConv1dFlipout', + 'QuantizedConv2dFlipout', + 'QuantizedConv3dFlipout', + # 'QuantizedConvTranspose1dFlipout', + # 'QuantizedConvTranspose2dFlipout', + # 'QuantizedConvTranspose3dFlipout', +] + + +class QuantizedConv1dFlipout(Conv1dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConv1dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
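+
+ Note
+ ----------
+ A descriptive note on the estimator: the output is conv(x, mu) plus a flipout
+ perturbation conv(x * sign_input, sigma * eps) * sign_output, where sign_input and
+ sign_output are random +/-1 tensors, so weight noise is decorrelated across the
+ examples in a batch.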
+ + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv1d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv1d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 + + +class QuantizedConv2dFlipout(Conv2dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): # be aware of bias + """ + + """ + super(QuantizedConv2dFlipout, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. 
+ + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv2d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 + + +class QuantizedConv3dFlipout(Conv3dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConv3dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. 
+ + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv3d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv3d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 From 5b696ae7c36ca51842f91a331baca63c870c8a4f Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 5 Dec 2022 14:01:19 -0500 Subject: [PATCH 42/69] quantized linear flipout layer --- .../quantized_linear_flipout.py | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py new file mode 100644 index 0000000..1449428 --- /dev/null +++ b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -0,0 +1,136 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Linear Flipout Layers with flipout weight estimator to perform +# variational inference in Bayesian neural networks. Variational layers +# enables Monte Carlo approximation of the distribution over the weights +# +# @authors: Ranganath Krishnan, Piero Esposito +# +# ====================================================================================== +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Module, Parameter +from torch.distributions.normal import Normal +from torch.distributions.uniform import Uniform + +from .linear_flipout import LinearFlipout + +__all__ = ["QuantizedLinearFlipout"] + +class QuantizedLinearFlipout(LinearFlipout): + def __init__(self, + in_features, + out_features): + + super(QuantizedLinearFlipout, self).__init__( + in_features, + out_features) + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + + # symmetry + scale = torch.zeros(1).to(x.device) + zero_point = torch.zeros(1).to(x.device) + xmax = torch.clamp(x.abs().max(), -100, 100) + scale = xmax*2/255 + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + # int8_x = dequantized_x*scale.to(torch.int8) + + return dequantized_x + + + def quantize(self): + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_weight), requires_grad=False) + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_weight))), requires_grad=False) + delattr(self, "mu_weight") + delattr(self, "rho_weight") + + self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + delattr(self, "mu_bias") + delattr(self, "rho_bias") + + def dequantize(self): + self.mu_weight = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + return + + def forward(self, x): + + bias = None + if self.quantized_mu_bias is not None: + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + bias = self.mu_bias + + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = 
torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, 1, 128, torch.quint8) # scale? + sign_output = torch.quantize_per_tensor(sign_output, 1, 128, torch.quint8) # scale? + + # getting perturbation weights + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), 6/255, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + + bias = None + if self.quantized_sigma_bias is not None: + eps_bias = self.eps_bias.data.normal_() + bias = (self.sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, 0.1, 128) + + perturbed_outputs = torch.nn.quantized.functional.linear(x, + weight=delta_weight, bias=bias, scale=0.1, zero_point=128) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, 0.1, 128) + out = torch.ops.quantized.add(outputs, perturbed_outputs, 0.1, 128) + out = out.dequantize() + + return out, 0 From 33986eb8c5b9c8303ac6f52e26040905ee421ab5 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 5 Dec 2022 14:02:34 -0500 Subject: [PATCH 43/69] template for quantized transposed conv1d flipout layer --- .../flipout_layers/quantized_conv_flipout.py | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 45eacfe..972dad7 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -659,3 +659,204 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) return out, 0 + +class QuantizedConvTranspose1dFlipout(ConvTranspose1dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose1dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. 
+ + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
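+
+        Notes
+        ----------
+        The quantized ops in the body below implement the flipout estimator;
+        ignoring quantization error and the bias terms, the computation
+        reduces to (sketch):
+
+            delta_kernel = sigma_kernel * eps_kernel        # eps ~ N(0, 1)
+            out = conv_transpose1d(x, mu_kernel) \
+                  + sign_out * conv_transpose1d(x * sign_in, delta_kernel)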
+ + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + + outputs = torch.nn.quantized.functional.conv1d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv1d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 \ No newline at end of file From 3e39d223972713d5f5b743afa4a32400b8ef55fd Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 7 Dec 2022 10:32:34 -0500 Subject: [PATCH 44/69] quantized flipout layers --- .../flipout_layers/quantized_conv_flipout.py | 417 +++++++++++++++++- .../quantized_linear_flipout.py | 102 ++++- 2 files changed, 494 insertions(+), 25 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 972dad7..2414cd6 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -691,6 +691,9 @@ def __init__(self, self.is_dequant = False + if not hasattr(self, "output_padding"): + self.output_padding = 0 + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -828,12 +831,210 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 self.padding, self.output_padding, self.dilation, self.groups) - out = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + outputs = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + 
sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) - outputs = torch.nn.quantized.functional.conv1d(x, self.quantized_mu_weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 + +class QuantizedConvTranspose2dFlipout(ConvTranspose2dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose2dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
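+
+        Notes
+        ----------
+        When this layer was created with conv+BN fusion, ``quantize()`` folds
+        the BatchNorm statistics into the variational parameters before they
+        are quantized; roughly (sketch of the folding above):
+
+            bn_coef     = bn_weight / sqrt(bn_running_var + bn_eps)
+            mu_fused    = mu_kernel * bn_coef                 # broadcast per output channel
+            sigma_fused = log1p(exp(rho_kernel)) * bn_coef    # softplus(rho) * bn_coef
+            bias_fused  = (mu_bias - bn_running_mean) * bn_coef + bn_bias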
+ + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose2d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + # sampling perturbation signs sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() @@ -853,9 +1054,213 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 # perturbed feedforward x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) - perturbed_outputs = torch.nn.quantized.functional.conv1d(x, - weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, - dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose2d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 + +class QuantizedConvTranspose3dFlipout(ConvTranspose3dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose3dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
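+
+        Notes
+        ----------
+        The weight scales come from the symmetric scheme in
+        ``get_scale_and_zero_point``. As a worked illustration (the value 2.0
+        is assumed, not taken from a real checkpoint):
+
+            xmax       = min(abs(W).max(), 100)   # e.g. 2.0
+            scale      = xmax * 2 / 255           # ~= 0.0157
+            zero_point = 0                        # symmetric quantization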
+ + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose3d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose3d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py index 1449428..9673242 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -30,7 +30,7 @@ # variational inference in Bayesian neural networks. Variational layers # enables Monte Carlo approximation of the distribution over the weights # -# @authors: Ranganath Krishnan, Piero Esposito +# @authors: Jun-Liang Lin # # ====================================================================================== import torch @@ -55,25 +55,59 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point - # symmetry - scale = torch.zeros(1).to(x.device) - zero_point = torch.zeros(1).to(x.device) - xmax = torch.clamp(x.abs().max(), -100, 100) - scale = xmax*2/255 + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. - return scale, zero_point - def get_quantized_tensor(self, x): + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() - # int8_x = dequantized_x*scale.to(torch.int8) return dequantized_x @@ -97,7 +131,37 @@ def dequantize(self): self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) return - def forward(self, x): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. Already dequantized. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ bias = None if self.quantized_mu_bias is not None: @@ -106,16 +170,16 @@ def forward(self, x): self.is_dequant = True bias = self.mu_bias - outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 # sampling perturbation signs sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() - sign_input = torch.quantize_per_tensor(sign_input, 1, 128, torch.quint8) # scale? - sign_output = torch.quantize_per_tensor(sign_output, 1, 128, torch.quint8) # scale? 
+ sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) # getting perturbation weights - eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), 6/255, 0, torch.qint8) + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) @@ -125,12 +189,12 @@ def forward(self, x): bias = (self.sigma_bias * eps_bias) # perturbed feedforward - x = torch.ops.quantized.mul(x, sign_input, 0.1, 128) + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) perturbed_outputs = torch.nn.quantized.functional.linear(x, - weight=delta_weight, bias=bias, scale=0.1, zero_point=128) - perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, 0.1, 128) - out = torch.ops.quantized.add(outputs, perturbed_outputs, 0.1, 128) + weight=delta_weight, bias=bias, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) out = out.dequantize() return out, 0 From 61c34079b9725542a6f85b274714b8aca76396ca Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 7 Dec 2022 10:35:50 -0500 Subject: [PATCH 45/69] update init file --- bayesian_torch/layers/flipout_layers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/layers/flipout_layers/__init__.py b/bayesian_torch/layers/flipout_layers/__init__.py index fda0925..b1b18c4 100644 --- a/bayesian_torch/layers/flipout_layers/__init__.py +++ b/bayesian_torch/layers/flipout_layers/__init__.py @@ -1,6 +1,6 @@ from .conv_flipout import * from .linear_flipout import * from .rnn_flipout import * -# from .quantized_linear_flipout import * +from .quantized_linear_flipout import * from .quantized_conv_flipout import * # from .quantize_rnn_flipout import * \ No newline at end of file From f647bcf8c7ceb52f3e8b9bd128df0edb8f520717 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 7 Dec 2022 10:44:54 -0500 Subject: [PATCH 46/69] update name list --- .../layers/flipout_layers/quantized_conv_flipout.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 2414cd6..8cde630 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -46,9 +46,9 @@ 'QuantizedConv1dFlipout', 'QuantizedConv2dFlipout', 'QuantizedConv3dFlipout', - # 'QuantizedConvTranspose1dFlipout', - # 'QuantizedConvTranspose2dFlipout', - # 'QuantizedConvTranspose3dFlipout', + 'QuantizedConvTranspose1dFlipout', + 'QuantizedConvTranspose2dFlipout', + 'QuantizedConvTranspose3dFlipout', ] From 9524bc0af37d8fa4e42c38b97c77288caaf917dd Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 7 Dec 2022 10:59:01 -0500 Subject: [PATCH 47/69] bnn to qbnn conversion --- bayesian_torch/models/bnn_to_qbnn.py | 228 +++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 bayesian_torch/models/bnn_to_qbnn.py diff --git 
a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py new file mode 100644 index 0000000..6732a65 --- /dev/null +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -0,0 +1,228 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Functions related to BNN to QBNN model conversion. +# +# @authors: Jun-Liang Lin +# +# =============================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import bayesian_torch.layers as bayesian_layers +import torch +import torch.nn as nn +from torch.nn import Identity +from torch.nn.quantized import BatchNorm2d as QBatchNorm2d +from torch.nn import Module, Parameter + + +def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + +def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + +def qbnn_linear_layer(d): + layer_type = "Quantized" + d.__class__.__name__ + layer_fn = getattr(bayesian_layers, layer_type) # Get QBNN layer + qbnn_layer = layer_fn( + in_features=d.in_features, + out_features=d.out_features, + ) + qbnn_layer.__dict__.update(d.__dict__) + qbnn_layer.quantize() + return qbnn_layer + +def qbnn_conv_layer(d): + layer_type = "Quantized" + d.__class__.__name__ + layer_fn = getattr(bayesian_layers, layer_type) # Get QBNN layer + qbnn_layer = layer_fn( + in_channels=d.in_channels, + out_channels=d.out_channels, + kernel_size=d.kernel_size, + stride=d.stride, + padding=d.padding, + dilation=d.dilation, + groups=d.groups, + ) + qbnn_layer.__dict__.update(d.__dict__) + qbnn_layer.quantize() + return qbnn_layer + +def qbnn_lstm_layer(d): + layer_type = "Quantized" + d.__class__.__name__ + layer_fn = getattr(bayesian_layers, layer_type) # Get QBNN layer + qbnn_layer = layer_fn( + in_features=d.input_size, + out_features=d.hidden_size, + ) + qbnn_layer.__dict__.update(d.__dict__) + qbnn_layer.quantize() + return qbnn_layer + +def qbnn_batchnorm2d_layer(d): + layer_fn = QBatchNorm2d # Get QBNN layer + qbnn_layer = layer_fn( + num_features=d.num_features + ) + qbnn_layer.__dict__.update(d.__dict__) + # qbnn_layer.weight = Parameter(get_quantized_tensor(d.weight), requires_grad=False) + # qbnn_layer.bias = Parameter(get_quantized_tensor(d.bias), requires_grad=False) + # qbnn_layer.running_mean = Parameter(get_quantized_tensor(d.running_mean), requires_grad=False) + # qbnn_layer.running_var = Parameter(get_quantized_tensor(d.running_var), requires_grad=False) + qbnn_layer.scale = Parameter(torch.tensor([0.1]), requires_grad=False) + qbnn_layer.zero_point = Parameter(torch.tensor([128]), requires_grad=False) + return qbnn_layer + + +# batch norm folding +def batch_norm_folding(conv, bn): + layer_type = "Quantized" + conv.__class__.__name__ + layer_fn = getattr(bayesian_layers, layer_type) # Get QBNN layer + qbnn_layer = layer_fn( + in_channels=conv.in_channels, + out_channels=conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + dilation=conv.dilation, + groups=conv.groups, + ) + qbnn_layer.__dict__.update(conv.__dict__) + qbnn_layer.bn_weight = bn.weight + qbnn_layer.bn_bias = bn.bias + qbnn_layer.bn_running_mean = bn.running_mean + qbnn_layer.bn_running_var = bn.running_var + qbnn_layer.bn_eps = bn.eps + qbnn_layer.quantize() + return qbnn_layer + +# replaces linear and conv layers +def bnn_to_qbnn(m, fuse_conv_bn=False): + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + bnn_to_qbnn(m._modules[name], fuse_conv_bn=fuse_conv_bn) + elif "Linear" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_linear_layer(m._modules[name])) + elif "LSTM" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_lstm_layer(m._modules[name])) + else: + if fuse_conv_bn: + if 'conv1' in m._modules.keys() and 'bn1' in m._modules.keys(): + if 'Identity' not in m._modules['bn1'].__class__.__name__: + setattr(m, 'conv1', batch_norm_folding(m._modules['conv1'], m._modules['bn1'])) + setattr(m, 'bn1', Identity()) + if 'conv2' in m._modules.keys() and 'bn2' in m._modules.keys(): + if 'Identity' not in 
m._modules['bn2'].__class__.__name__: + setattr(m, 'conv2', batch_norm_folding(m._modules['conv2'], m._modules['bn2'])) + setattr(m, 'bn2', Identity()) + if 'conv3' in m._modules.keys() and 'bn3' in m._modules.keys(): + if 'Identity' not in m._modules['bn3'].__class__.__name__: + setattr(m, 'conv3', batch_norm_folding(m._modules['conv3'], m._modules['bn3'])) + setattr(m, 'bn3', Identity()) + if 'downsample' in m._modules.keys(): + if m._modules['downsample'].__class__.__name__=='Sequential' and len(m._modules['downsample'])==2: + if 'Identity' not in m._modules['downsample'][1].__class__.__name__: + m._modules['downsample'][0]=batch_norm_folding(m._modules['downsample'][0], m._modules['downsample'][1]) + m._modules['downsample'][1]=Identity() + else: + if "Conv" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_conv_layer(m._modules[name])) + + elif "Batch" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_batchnorm2d_layer(m._modules[name])) + + return + +if __name__ == "__main__": + class FusionTest(nn.Module): + def __init__(self): + super(FusionTest, self).__init__() + self.conv1 = bayesian_layers.Conv2dReparameterization(1,3,2,bias=False) + self.bn1 = nn.BatchNorm2d(3) + def forward(self, x): + x = self.conv1(x)[0] + x = self.bn1(x) + return x + m = FusionTest() + m.conv1.rho_kernel = Parameter(torch.zeros(m.conv1.rho_kernel.shape)-100) + m.eval() + print(m) + input = torch.randn(1,1,3,3) + print(m(input)) + bnn_to_qbnn(m) + print(m) + if input.dtype!=torch.quint8: + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) + print(m(input)) \ No newline at end of file From 3930c61944c7d83cb2987ee5699e9b5439cb6a0f Mon Sep 17 00:00:00 2001 From: Jun-Liang Lin <82939287+junliang-lin@users.noreply.github.com> Date: Mon, 6 Feb 2023 12:52:39 -0500 Subject: [PATCH 48/69] Merge remote-tracking branch (#5) * fix minor typo. Signed-off-by: Ranganath Krishnan * Update links in README.md * update MOPED layer example utility function Signed-off-by: Ranganath Krishnan * Update README.md * feat: add possibility to return no kl, save it as attribute * feat: add possibility to return no kl on flipout layers, save it as attribute * updates to support dnn to bnn imodel auto conversion * updates to support dnn to bnn imodel auto conversion * remove duplicate kl_loss definition in Conv1dReparameterization layer Signed-off-by: Ranganath Krishnan * include kl_loss() function in Convolutional flipout layers, to compute kl when 'return_kl' flag is set to False. Fix for issue#12. Signed-off-by: Ranganath Krishnan * Update README.md * Update README.md * update the posterior variational param init value Signed-off-by: Ranganath Krishnan * Update release version with dnn_to_bnn() feature * Update README.md update usage instructions in README file * Update requirements.txt * Include training, testing and uncertainty quantification snippet in README.md * update version in setup.py * Update bayesian_torch.layers.md * Update links in README.md * Update setup.py * Update README.md * include assets folder Signed-off-by: Ranganath Krishnan * Update README.md * Update README.md * Update README.md * Update README.md * Update setup.py * release to PyPI, update install instruction through "pip" command * Switched to permanent URL for the top image. * changing to raw.githubusercontent.com Url for top image. 
* Update README.md * update links and release number for PyPI documentation Signed-off-by: Ranganath Krishnan * Update README.md add downloads statistics badge * update download count badge * Added support for arbitrary kernel sizes for Bayesian Conv layers * update version number Signed-off-by: Ranganath Krishnan * Update README.md * Add support for output padding in flipout layers --------- Signed-off-by: Ranganath Krishnan Co-authored-by: Ranganath Krishnan Co-authored-by: Pi Co-authored-by: msubedar Co-authored-by: Michael Beale --- README.md | 2 +- .../layers/base_variational_layer.py | 6 + .../layers/flipout_layers/conv_flipout.py | 104 ++++++++++-------- .../variational_layers/conv_variational.py | 90 +++++++-------- bayesian_torch/models/dnn_to_bnn.py | 2 +- setup.py | 2 +- 6 files changed, 116 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index e56245a..36ebf19 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![python](https://img.shields.io/badge/python-3.7%2B-blue)](https://github.com/IntelLabs/bayesian-torch) [![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) -[![version](https://img.shields.io/badge/release-0.2.1-green)](https://github.com/IntelLabs/bayesian-torch/releases) +[![version](https://img.shields.io/badge/release-0.3.0-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-blue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) [![Downloads](https://static.pepy.tech/personalized-badge/bayesian-torch?period=total&units=international_system&left_color=grey&right_color=darkblue&left_text=downloads)](https://pepy.tech/project/bayesian-torch)

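Note on the symmetric quantization used in bnn_to_qbnn.py above: get_scale_and_zero_point() clamps max|x| to an upper bound (100 by default), sets scale = 2 * max|x| / target_range, and keeps the zero point at 0, while get_quantized_tensor() falls back to a default scale of 0.1 whenever the computed scale is zero. A minimal standalone sketch of the same arithmetic follows; symmetric_quantize is a hypothetical name for illustration and is not part of the patch.

import torch

def symmetric_quantize(x, upper_bound=100, target_range=255, default_scale=0.1):
    # scale maps the clamped range [-xmax, xmax] onto target_range int8 steps
    xmax = torch.clamp(x.abs().max(), 0, upper_bound)
    scale = (xmax * 2 / target_range).item()
    if scale == 0:
        scale = default_scale  # avoid a zero scale, as in get_quantized_tensor()
    zero_point = 0  # symmetric quantization keeps the zero point at zero
    return torch.quantize_per_tensor(x, scale, zero_point, torch.qint8)

qx = symmetric_quantize(torch.randn(3, 3))
print(qx.int_repr())    # stored int8 values
print(qx.dequantize())  # approximate reconstruction of the original tensor

This qint8 / zero-point-0 scheme is applied to weights and weight perturbations, whereas activations in the quantized layers above are quantized to quint8 with a default scale of 0.1 and a default zero point of 128.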
diff --git a/bayesian_torch/layers/base_variational_layer.py b/bayesian_torch/layers/base_variational_layer.py index 4d63cc9..8263e82 100644 --- a/bayesian_torch/layers/base_variational_layer.py +++ b/bayesian_torch/layers/base_variational_layer.py @@ -29,7 +29,13 @@ import torch import torch.nn as nn import torch.distributions as distributions +from itertools import repeat +import collections +def get_kernel_size(x, n): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) class BaseVariationalLayer_(nn.Module): def __init__(self): diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index ce13897..3cd81d1 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -36,7 +36,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from ..base_variational_layer import BaseVariationalLayer_ +from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform @@ -263,28 +263,32 @@ def __init__(self, self.bias = bias self.kl = 0 +<<<<<<< HEAD +======= + kernel_size = get_kernel_size(kernel_size, 2) +>>>>>>> upstream/main self.mu_kernel = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) self.rho_kernel = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) self.register_buffer( 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) if self.bias: @@ -430,27 +434,29 @@ def __init__(self, self.posterior_mu_init = posterior_mu_init self.posterior_rho_init = posterior_rho_init + kernel_size = get_kernel_size(kernel_size, 3) + self.mu_kernel = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.rho_kernel = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.register_buffer( 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), 
persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) if self.bias: @@ -555,6 +561,7 @@ def __init__(self, padding=0, dilation=1, groups=1, + output_padding=0, prior_mean=0, prior_variance=1, posterior_mu_init=0, @@ -586,6 +593,7 @@ def __init__(self, self.kernel_size = kernel_size self.stride = stride self.padding = padding + self.output_padding = output_padding self.dilation = dilation self.groups = groups self.bias = bias @@ -667,6 +675,7 @@ def forward(self, x, return_kl=True): bias=self.mu_bias, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) @@ -700,6 +709,7 @@ def forward(self, x, return_kl=True): bias=bias, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) * sign_output @@ -717,6 +727,7 @@ def __init__(self, kernel_size, stride=1, padding=0, + output_padding=0, dilation=1, groups=1, prior_mean=0, @@ -750,6 +761,7 @@ def __init__(self, self.kernel_size = kernel_size self.stride = stride self.padding = padding + self.output_padding = output_padding self.dilation = dilation self.groups = groups self.bias = bias @@ -760,28 +772,28 @@ def __init__(self, self.prior_variance = prior_variance self.posterior_mu_init = posterior_mu_init self.posterior_rho_init = posterior_rho_init - + kernel_size = get_kernel_size(kernel_size, 2) self.mu_kernel = nn.Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1])) self.rho_kernel = nn.Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1])) self.register_buffer( 'eps_kernel', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) if self.bias: @@ -835,6 +847,7 @@ def forward(self, x, return_kl=True): weight=self.mu_kernel, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) @@ -868,6 +881,7 @@ def forward(self, x, return_kl=True): weight=delta_kernel, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) * sign_output @@ -885,6 +899,7 @@ def __init__(self, kernel_size, stride=1, padding=0, + output_padding=0, dilation=1, groups=1, prior_mean=0, @@ -918,6 +933,7 @@ def __init__(self, self.kernel_size = kernel_size self.stride = stride self.padding = padding + self.output_padding = output_padding self.dilation = dilation self.groups = groups @@ -928,28 +944,28 @@ def __init__(self, self.bias = bias self.kl = 0 - + kernel_size = 
get_kernel_size(kernel_size, 3) self.mu_kernel = nn.Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.rho_kernel = nn.Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.register_buffer( 'eps_kernel', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) if self.bias: @@ -1003,6 +1019,7 @@ def forward(self, x, return_kl=True): bias=self.mu_bias, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) @@ -1035,6 +1052,7 @@ def forward(self, x, return_kl=True): bias=bias, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) * sign_output diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 7855ad8..0d2ebfd 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -46,7 +46,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn import Parameter -from ..base_variational_layer import BaseVariationalLayer_ +from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size import math __all__ = [ @@ -255,26 +255,28 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + kernel_size = get_kernel_size(kernel_size, 2) + self.mu_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) self.rho_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) self.register_buffer( 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) if self.bias: @@ -403,27 +405,27 @@ def __init__(self, # variance of weight --> 
sigma = log (1 + exp(rho)) self.posterior_rho_init = posterior_rho_init, self.bias = bias - + kernel_size = get_kernel_size(kernel_size, 3) self.mu_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.rho_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.register_buffer( 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) if self.bias: @@ -698,27 +700,27 @@ def __init__(self, # variance of weight --> sigma = log (1 + exp(rho)) self.posterior_rho_init = posterior_rho_init, self.bias = bias - + kernel_size = get_kernel_size(kernel_size, 2) self.mu_kernel = Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1])) self.rho_kernel = Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1])) self.register_buffer( 'eps_kernel', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) if self.bias: @@ -850,27 +852,27 @@ def __init__(self, # variance of weight --> sigma = log (1 + exp(rho)) self.posterior_rho_init = posterior_rho_init, self.bias = bias - + kernel_size = get_kernel_size(kernel_size, 3) self.mu_kernel = Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.rho_kernel = Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.register_buffer( 'eps_kernel', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), 
persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) if self.bias: diff --git a/bayesian_torch/models/dnn_to_bnn.py b/bayesian_torch/models/dnn_to_bnn.py index 18b9b51..92e18b4 100644 --- a/bayesian_torch/models/dnn_to_bnn.py +++ b/bayesian_torch/models/dnn_to_bnn.py @@ -79,7 +79,7 @@ def bnn_conv_layer(params, d): bnn_layer = layer_fn( in_channels=d.in_channels, out_channels=d.out_channels, - kernel_size=d.kernel_size[0], + kernel_size=d.kernel_size, stride=d.stride, padding=d.padding, dilation=d.dilation, diff --git a/setup.py b/setup.py index 5a02fb8..6629022 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name = "bayesian-torch", packages = find_packages(), - version = "0.2.1", + version = "0.3.0", description = "A library for Bayesian neural network layers and uncertainty estimation in Deep Learning", author = "Intel Labs", author_email = "ranganath.krishnan@intel.com", From f3f32e88f3b0b1b162a6a9649cf573c5ee5a4546 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 6 Feb 2023 19:50:34 -0500 Subject: [PATCH 49/69] add kl flag for BNN to QBNN conversion --- .../flipout_layers/quantized_conv_flipout.py | 60 ++++++++++++++---- .../quantized_linear_flipout.py | 10 ++- .../quantize_conv_variational.py | 62 +++++++++++++++---- .../quantize_linear_variational.py | 9 ++- bayesian_torch/models/bnn_to_qbnn.py | 8 +++ 5 files changed, 121 insertions(+), 28 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 8cde630..cf771c7 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -177,7 +177,7 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -209,6 +209,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -244,7 +247,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class QuantizedConv2dFlipout(Conv2dFlipout): @@ -384,7 +390,7 @@ def dequantize(self): return - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -416,6 +422,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ 
+ if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -451,7 +460,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class QuantizedConv3dFlipout(Conv3dFlipout): @@ -591,7 +603,7 @@ def dequantize(self): return - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -623,6 +635,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -658,7 +673,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class QuantizedConvTranspose1dFlipout(ConvTranspose1dFlipout): def __init__(self, @@ -788,7 +806,7 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -820,6 +838,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -860,7 +881,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class QuantizedConvTranspose2dFlipout(ConvTranspose2dFlipout): def __init__(self, @@ -990,7 +1014,7 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -1022,6 +1046,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -1062,7 +1089,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class 
QuantizedConvTranspose3dFlipout(ConvTranspose3dFlipout): def __init__(self, @@ -1192,7 +1222,7 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -1224,6 +1254,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -1264,4 +1297,7 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 \ No newline at end of file + if return_kl: + return out, 0 + + return out \ No newline at end of file diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py index 9673242..289da98 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -131,7 +131,7 @@ def dequantize(self): self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) return - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -163,6 +163,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + bias = None if self.quantized_mu_bias is not None: if not self.is_dequant: @@ -197,4 +200,7 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) out = out.dequantize() - return out, 0 + if return_kl: + return out, 0 + + return out diff --git a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py index 59470df..a8b25dc 100644 --- a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py @@ -199,7 +199,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -233,6 +233,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() @@ -269,7 +273,11 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out + class QuantizedConv2dReparameterization(Conv2dReparameterization): @@ -421,7 +429,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -455,6 +463,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() @@ -491,7 +502,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out class QuantizedConv3dReparameterization(Conv3dReparameterization): @@ -642,7 +656,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -676,6 +690,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() @@ -712,7 +729,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out class QuantizedConvTranspose1dReparameterization(ConvTranspose1dReparameterization): def __init__(self, @@ -862,7 +882,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -896,6 +916,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() @@ -937,7 +960,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out class QuantizedConvTranspose2dReparameterization(ConvTranspose2dReparameterization): def __init__(self, @@ -1087,7 +1113,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -1121,6 +1147,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() @@ -1162,7 +1191,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out class QuantizedConvTranspose3dReparameterization(ConvTranspose3dReparameterization): def __init__(self, @@ -1312,7 +1344,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -1346,6 +1378,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() @@ -1387,4 +1422,7 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) - return out, 0 # disable kl divergence computing \ No newline at end of file + if return_kl: + return out, 0 # disable kl divergence computing + + return out \ No newline at end of file diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py index a1ce3fd..e666f9b 100644 --- a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -130,7 +130,7 @@ def dequantize(self): # Deprecated self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -165,6 +165,8 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: @@ -196,4 +198,7 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.nn.quantized.functional.linear(input, weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 out = out.dequantize() - return out, 0 # kl=0 + if return_kl: + return out, 0 # disable kl divergence computing + + return out diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index 6732a65..d689465 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -102,6 +102,8 @@ def qbnn_linear_layer(d): ) qbnn_layer.__dict__.update(d.__dict__) qbnn_layer.quantize() + if d.dnn_to_bnn_flag: + qbnn_layer.dnn_to_bnn_flag = True return qbnn_layer def qbnn_conv_layer(d): @@ -118,6 +120,8 @@ def qbnn_conv_layer(d): ) qbnn_layer.__dict__.update(d.__dict__) qbnn_layer.quantize() + if d.dnn_to_bnn_flag: + qbnn_layer.dnn_to_bnn_flag = True return qbnn_layer def qbnn_lstm_layer(d): @@ -129,6 +133,8 @@ def qbnn_lstm_layer(d): ) qbnn_layer.__dict__.update(d.__dict__) qbnn_layer.quantize() + if d.dnn_to_bnn_flag: + qbnn_layer.dnn_to_bnn_flag = True return qbnn_layer def qbnn_batchnorm2d_layer(d): @@ -166,6 +172,8 @@ def batch_norm_folding(conv, bn): qbnn_layer.bn_running_var = bn.running_var qbnn_layer.bn_eps = bn.eps qbnn_layer.quantize() + if conv.dnn_to_bnn_flag: + qbnn_layer.dnn_to_bnn_flag = True return qbnn_layer # replaces linear and conv layers From b0a99d218e72518ff7694fcf82d4524196fd5804 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Thu, 9 Feb 2023 19:25:35 -0500 Subject: [PATCH 50/69] resolve merge conflicts --- bayesian_torch/layers/flipout_layers/conv_flipout.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index 3cd81d1..c92d24b 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -263,11 +263,7 @@ def __init__(self, 
self.bias = bias self.kl = 0 -<<<<<<< HEAD - -======= kernel_size = get_kernel_size(kernel_size, 2) ->>>>>>> upstream/main self.mu_kernel = nn.Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size[0], kernel_size[1])) From 5c691d6e7c1e7acd0197a1541108799350428d3c Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Feb 2023 02:04:06 -0500 Subject: [PATCH 51/69] quantized flipout models --- .../main_bayesian_imagenet_bnn2qbnn.py | 293 ++++++++++ .../main_bayesian_imagenet_dnn2bnn.py | 551 ++++++++++++++++++ .../quantized_resnet_flipout_large.py | 282 +++++++++ .../quantized_resnet_variational_large.py | 282 +++++++++ 4 files changed, 1408 insertions(+) create mode 100644 bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py create mode 100644 bayesian_torch/examples/main_bayesian_imagenet_dnn2bnn.py create mode 100644 bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py create mode 100644 bayesian_torch/models/bayesian/quantized_resnet_variational_large.py diff --git a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py new file mode 100644 index 0000000..687c1d0 --- /dev/null +++ b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py @@ -0,0 +1,293 @@ +import argparse +import os +import shutil +import time + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import bayesian_torch.models.bayesian.resnet_variational_large as resnet +import numpy as np +from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn +# import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet +import bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet + +torch.cuda.is_available = lambda : False +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +torch.backends.quantized.engine='onednn' +model_names = sorted( + name + for name in resnet.__dict__ + if name.islower() and not name.startswith("__") and name.startswith("resnet") and callable(resnet.__dict__[name]) +) + +print(model_names) +best_acc1 = 0 +len_trainset = 1281167 +len_valset = 50000 + + +parser = argparse.ArgumentParser(description="ImageNet") +parser.add_argument('data', + metavar='DIR', + default='data/imagenet', + help='path to dataset') +parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", "--workers", default=8, type=int, metavar="N", help="number of data loading workers (default: 8)" +) +parser.add_argument("--epochs", default=200, type=int, metavar="N", help="number of total epochs to run") +parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="manual epoch number (useful on restarts)") +parser.add_argument("-b", "--batch-size", default=1000, type=int, metavar="N", help="mini-batch size (default: 512)") +parser.add_argument('--val_batch_size', default=1000, type=int) +parser.add_argument("--lr", "--learning-rate", default=0.001, type=float, metavar="LR", help="initial learning rate") +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--weight-decay", "--wd", default=1e-4, type=float, metavar="W", help="weight decay 
(default: 5e-4)" +) +parser.add_argument("--print-freq", "-p", default=50, type=int, metavar="N", help="print frequency (default: 20)") +parser.add_argument("--resume", default="", type=str, metavar="PATH", help="path to latest checkpoint (default: none)") +parser.add_argument("-e", "--evaluate", dest="evaluate", action="store_true", help="evaluate model on validation set") +parser.add_argument("--pretrained", dest="pretrained", action="store_true", help="use pre-trained model") +parser.add_argument("--half", dest="half", action="store_true", help="use half-precision(16-bit) ") +parser.add_argument( + "--save-dir", + dest="save_dir", + help="The directory used to save the trained models", + default="./checkpoint/bayesian", + type=str, +) +parser.add_argument( + "--moped-init-model", + dest="moped_init_model", + help="DNN model to intialize MOPED method", + default="", + type=str, +) +parser.add_argument( + "--moped-delta-factor", + dest="moped_delta_factor", + help="MOPED delta scale factor", + default=0.2, + type=float, +) + +parser.add_argument( + "--bnn-rho-init", + dest="bnn_rho_init", + help="rho init for bnn layers", + default=-3.0, + type=float, +) + +parser.add_argument( + "--use-flipout-layers", + type=bool, + default=False, + metavar="use_flipout_layers", + help="Use Flipout layers for BNNs, default is Reparameterization layers", +) + +parser.add_argument( + "--save-every", + dest="save_every", + help="Saves checkpoints at every specified number of epochs", + type=int, + default=10, +) +parser.add_argument("--mode", type=str, required=True, help="train | test") + +parser.add_argument( + "--num_monte_carlo", + type=int, + default=20, + metavar="N", + help="number of Monte Carlo samples to be drawn during inference", +) +parser.add_argument("--num_mc", type=int, default=1, metavar="N", help="number of Monte Carlo runs during training") +parser.add_argument( + "--tensorboard", + type=bool, + default=True, + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) +parser.add_argument( + "--log_dir", + type=str, + default="./logs/cifar/bayesian", + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) + +def evaluate(args, model, val_loader): + pred_probs_mc = [] + test_loss = 0 + correct = 0 + output_list = [] + labels_list = [] + model.eval() + with torch.no_grad(): + begin = time.time() + i=0 + for data, target in val_loader: + if torch.cuda.is_available(): + data, target = data.cuda(), target.cuda() + else: + data, target = data.cpu(), target.cpu() + output_mc = [] + for mc_run in range(args.num_monte_carlo): + output, _ = model.forward(data) + output_mc.append(output) + output_ = torch.stack(output_mc) + output_list.append(output_) + labels_list.append(target) + i+=1 + end = time.time() + print("inference throughput: ", i*args.val_batch_size / (end - begin), " images/s") + + output = torch.cat(output_list, 1) + output = torch.nn.functional.softmax(output, dim=2) + labels = torch.cat(labels_list) + pred_mean = output.mean(dim=0) + Y_pred = torch.argmax(pred_mean, axis=1) + print("Test accuracy:", (Y_pred.data.cpu().numpy() == labels.data.cpu().numpy()).mean() * 100) + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + """ + Save the training model + """ + torch.save(state, filename) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + 
self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +best_prec1 = 0 + +def main(): + global args, best_prec1 + args = parser.parse_args() + moped_enable = False + if len(args.moped_init_model) > 0: # use moped method if trained dnn model weights are provided + moped_enable = True + + # Check the save_dir exists or not + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + model = torch.nn.DataParallel(resnet.__dict__[args.arch]()) + if moped_enable: + checkpoint = torch.load(args.moped_init_model) + if "state_dict" in checkpoint.keys(): + model.load_state_dict(checkpoint["state_dict"]) + else: + model.load_state_dict(checkpoint) + + tb_writer = None + + valdir = os.path.join(args.data, 'Imagenet_2012Val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + val_loader = torch.utils.data.DataLoader(val_dataset, + batch_size=args.val_batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True) + + print('len valset: ', len(val_dataset)) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + if args.mode == "test": + checkpoint_file = args.save_dir + "/bayesian_{}_imagenet.pth".format(args.arch) + + checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + model.load_state_dict(checkpoint["state_dict"]) + model.module = model.module.cpu() + + bnn_to_qbnn(model, fuse_conv_bn=False) # only replaces linear and conv layers + + model = model.cpu() + + # save weights + # save_checkpoint( + # { + # 'epoch': None, + # 'state_dict': model.state_dict(), + # 'best_prec1': None, + # }, + # True, + # filename=os.path.join( + # args.save_dir, + # 'quantized_bayesian_q{}_imagenet.pth'.format(args.arch))) + + qmodel = torch.nn.DataParallel(qresnet.__dict__['q'+args.arch](bias=False)) # set bias=True to make qconv has bias + qmodel.module.quant_then_dequant(qmodel, fuse_conv_bn=False) + + # load weights + # checkpoint_file = args.save_dir + "/quantized_bayesian_q{}_imagenet.pth".format(args.arch) + # checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + # qmodel.load_state_dict(checkpoint["state_dict"]) + + qmodel.load_state_dict(model.state_dict()) + evaluate(args, qmodel, val_loader) + +if __name__ == "__main__": + main() diff --git a/bayesian_torch/examples/main_bayesian_imagenet_dnn2bnn.py b/bayesian_torch/examples/main_bayesian_imagenet_dnn2bnn.py new file mode 100644 index 0000000..28e03ae --- /dev/null +++ b/bayesian_torch/examples/main_bayesian_imagenet_dnn2bnn.py @@ -0,0 +1,551 @@ +import argparse +import os +import shutil +import time + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import 
torchvision.datasets as datasets + +import bayesian_torch.models.deterministic.resnet_large as resnet +import numpy as np +from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss + +model_names = sorted( + name + for name in resnet.__dict__ + if name.islower() and not name.startswith("__") and name.startswith("resnet") and callable(resnet.__dict__[name]) +) + +print(model_names) +best_acc1 = 0 +len_trainset = 1281167 +len_valset = 50000 + + +parser = argparse.ArgumentParser(description="ImageNet") +parser.add_argument('data', + metavar='DIR', + default='data/imagenet', + help='path to dataset') +parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", "--workers", default=8, type=int, metavar="N", help="number of data loading workers (default: 8)" +) +parser.add_argument("--epochs", default=90, type=int, metavar="N", help="number of total epochs to run") +parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="manual epoch number (useful on restarts)") +parser.add_argument("-b", "--batch-size", default=128, type=int, metavar="N", help="mini-batch size (default: 128)") +parser.add_argument('--val_batch_size', default=1000, type=int) +parser.add_argument("--lr", "--learning-rate", default=0.001, type=float, metavar="LR", help="initial learning rate") +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--weight-decay", "--wd", default=1e-4, type=float, metavar="W", help="weight decay (default: 5e-4)" +) +parser.add_argument("--print-freq", "-p", default=50, type=int, metavar="N", help="print frequency (default: 20)") +parser.add_argument("--resume", default="", type=str, metavar="PATH", help="path to latest checkpoint (default: none)") +parser.add_argument("-e", "--evaluate", dest="evaluate", action="store_true", help="evaluate model on validation set") +parser.add_argument("--pretrained", dest="pretrained", action="store_true", help="use pre-trained model") +parser.add_argument("--half", dest="half", action="store_true", help="use half-precision(16-bit) ") +parser.add_argument( + "--save-dir", + dest="save_dir", + help="The directory used to save the trained models", + default="./checkpoint/bayesian", + type=str, +) +parser.add_argument( + "--moped-init-model", + dest="moped_init_model", + help="DNN model to intialize MOPED method", + default="", + type=str, +) +parser.add_argument( + "--moped-delta-factor", + dest="moped_delta_factor", + help="MOPED delta scale factor", + default=0.001, + type=float, +) + +parser.add_argument( + "--bnn-rho-init", + dest="bnn_rho_init", + help="rho init for bnn layers", + default=-10.0, + type=float, +) + +parser.add_argument( + "--use-flipout-layers", + type=bool, + default=False, + metavar="use_flipout_layers", + help="Use Flipout layers for BNNs, default is Reparameterization layers", +) + +parser.add_argument( + "--save-every", + dest="save_every", + help="Saves checkpoints at every specified number of epochs", + type=int, + default=10, +) +parser.add_argument("--mode", type=str, required=True, help="train | test") + +parser.add_argument( + "--num_monte_carlo", + type=int, + default=20, + metavar="N", + help="number of Monte Carlo samples to be drawn during inference", +) +parser.add_argument("--num_mc", type=int, default=1, metavar="N", help="number of Monte Carlo runs during training") 
+parser.add_argument( + "--tensorboard", + type=bool, + default=True, + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) +parser.add_argument( + "--log_dir", + type=str, + default="./logs/imagenet/bayesian", + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) + +best_prec1 = 0 + + +def main(): + global args, best_prec1 + args = parser.parse_args() + moped_enable = False + if len(args.moped_init_model) > 0: # use moped method if trained dnn model weights are provided + moped_enable = True + + const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": args.bnn_rho_init, + "type": "Flipout" if args.use_flipout_layers else "Reparameterization", # Flipout or Reparameterization + "moped_enable": moped_enable, # initialize mu/sigma from the dnn weights + "moped_delta": args.moped_delta_factor, + } + + # Check the save_dir exists or not + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + model = torch.nn.DataParallel(resnet.__dict__[args.arch](pretrained=True)) + model.cuda() if torch.cuda.is_available() else model.cpu() + if moped_enable: + checkpoint = torch.load(args.moped_init_model) + if "state_dict" in checkpoint.keys(): + model.load_state_dict(checkpoint["state_dict"]) + else: + model.load_state_dict(checkpoint) + + const_bnn_prior_parameters["moped_enable"]=True + dnn_to_bnn(model, const_bnn_prior_parameters) # only replaces linear and conv layers + + save_checkpoint( + { + "epoch": 0, + "state_dict": model.state_dict(), + "best_prec1": best_prec1, + }, + False, + filename=os.path.join(args.save_dir, "bayesian_{}_imagenet.pth".format(args.arch)), + ) + + if torch.cuda.is_available(): + model.cuda() + else: + model.cpu() + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + args.start_epoch = checkpoint["epoch"] + best_prec1 = checkpoint["best_prec1"] + model.load_state_dict(checkpoint) + print("=> loaded checkpoint '{}' (epoch {})".format(args.evaluate, checkpoint["epoch"])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + tb_writer = None + if args.tensorboard: + logger_dir = os.path.join(args.log_dir, "tb_logger") + if not os.path.exists(logger_dir): + os.makedirs(logger_dir) + tb_writer = SummaryWriter(logger_dir) + + valdir = os.path.join(args.data, 'val') #Imagenet_2012Val + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + # train_loader = torch.utils.data.DataLoader( + # datasets.CIFAR10( + # root="./data", + # train=True, + # transform=transforms.Compose( + # [ + # transforms.RandomHorizontalFlip(), + # transforms.RandomCrop(32, 4), + # transforms.ToTensor(), + # normalize, + # ] + # ), + # download=True, + # ), + # batch_size=args.batch_size, + # shuffle=True, + # num_workers=args.workers, + # pin_memory=True, + # ) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + val_loader = torch.utils.data.DataLoader(val_dataset, + batch_size=args.val_batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True) + + print('len valset: ', len(val_dataset)) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + if 
torch.cuda.is_available(): + criterion = nn.CrossEntropyLoss().cuda() + else: + criterion = nn.CrossEntropyLoss().cpu() + + if args.half: + model.half() + criterion.half() + + if args.arch in ["resnet110"]: + for param_group in optimizer.param_groups: + param_group["lr"] = args.lr * 0.1 + + if args.evaluate: + validate(val_loader, model, criterion) + return + + if args.mode == "train": + pass + + for epoch in range(args.start_epoch, args.epochs): + + lr = args.lr + if epoch >= 80 and epoch < 120: + lr = 0.1 * args.lr + elif epoch >= 120 and epoch < 160: + lr = 0.01 * args.lr + elif epoch >= 160 and epoch < 180: + lr = 0.001 * args.lr + elif epoch >= 180: + lr = 0.0005 * args.lr + + optimizer = torch.optim.Adam(model.parameters(), lr) + + # train for one epoch + print("current lr {:.5e}".format(optimizer.param_groups[0]["lr"])) + train(args, train_loader, model, criterion, optimizer, epoch, tb_writer) + + prec1 = validate(args, val_loader, model, criterion, epoch, tb_writer) + + is_best = prec1 > best_prec1 + best_prec1 = max(prec1, best_prec1) + + if is_best: + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model.state_dict(), + "best_prec1": best_prec1, + }, + is_best, + filename=os.path.join(args.save_dir, "bayesian_{}_imagenet.pth".format(args.arch)), + ) + + elif args.mode == "test": + checkpoint_file = args.save_dir + "/bayesian_{}_imagenet.pth".format(args.arch) + if torch.cuda.is_available(): + checkpoint = torch.load(checkpoint_file) + else: + checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + model.load_state_dict(checkpoint["state_dict"]) + evaluate(args, model, val_loader) + + +def train(args, train_loader, model, criterion, optimizer, epoch, tb_writer=None): + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to train mode + model.train() + + end = time.time() + for i, (input, target) in enumerate(train_loader): + + # measure data loading time + data_time.update(time.time() - end) + + if torch.cuda.is_available(): + target = target.cuda() + input_var = input.cuda() + target_var = target + else: + target = target.cpu() + input_var = input.cpu() + target_var = target + + if args.half: + input_var = input_var.half() + + # compute output + output_ = [] + kl_ = [] + for mc_run in range(args.num_mc): + output = model(input_var) + kl = get_kl_loss(model) + output_.append(output) + kl_.append(kl) + output = torch.mean(torch.stack(output_), dim=0) + kl = torch.mean(torch.stack(kl_), dim=0) + cross_entropy_loss = criterion(output, target_var) + scaled_kl = kl / args.batch_size + + # ELBO loss + loss = cross_entropy_loss + scaled_kl + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + output = output.float() + loss = loss.float() + # measure accuracy and record loss + prec1 = accuracy(output.data, target)[0] + losses.update(loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print( + "Epoch: [{0}][{1}/{2}]\t" + "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + "Data {data_time.val:.3f} ({data_time.avg:.3f})\t" + "Loss {loss.val:.4f} ({loss.avg:.4f})\t" + "Prec@1 {top1.val:.3f} ({top1.avg:.3f})".format( + epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1 + ) + ) + + if tb_writer is not None: + tb_writer.add_scalar("train/cross_entropy_loss", 
cross_entropy_loss.item(), epoch) + tb_writer.add_scalar("train/kl_div", scaled_kl.item(), epoch) + tb_writer.add_scalar("train/elbo_loss", loss.item(), epoch) + tb_writer.add_scalar("train/accuracy", prec1.item(), epoch) + tb_writer.flush() + + +def validate(args, val_loader, model, criterion, epoch, tb_writer=None): + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + with torch.no_grad(): + for i, (input, target) in enumerate(val_loader): + if torch.cuda.is_available(): + target = target.cuda() + input_var = input.cuda() + target_var = target.cuda() + else: + target = target.cpu() + input_var = input.cpu() + target_var = target.cpu() + + if args.half: + input_var = input_var.half() + + # compute output + output_ = [] + kl_ = [] + for mc_run in range(args.num_mc): + output = model(input_var) + kl = get_kl_loss(model) + output_.append(output) + kl_.append(kl) + output = torch.mean(torch.stack(output_), dim=0) + kl = torch.mean(torch.stack(kl_), dim=0) + cross_entropy_loss = criterion(output, target_var) + # scaled_kl = kl / len_trainset + scaled_kl = kl / args.batch_size + # scaled_kl = 0.2 * (kl / len_trainset) + + # ELBO loss + loss = cross_entropy_loss + scaled_kl + + output = output.float() + loss = loss.float() + + # measure accuracy and record loss + prec1 = accuracy(output.data, target)[0] + losses.update(loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print( + "Test: [{0}/{1}]\t" + "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + "Loss {loss.val:.4f} ({loss.avg:.4f})\t" + "Prec@1 {top1.val:.3f} ({top1.avg:.3f})".format( + i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1 + ) + ) + + if tb_writer is not None: + tb_writer.add_scalar("val/cross_entropy_loss", cross_entropy_loss.item(), epoch) + tb_writer.add_scalar("val/kl_div", scaled_kl.item(), epoch) + tb_writer.add_scalar("val/elbo_loss", loss.item(), epoch) + tb_writer.add_scalar("val/accuracy", prec1.item(), epoch) + tb_writer.flush() + + print(" * Prec@1 {top1.avg:.3f}".format(top1=top1)) + + return top1.avg + + +def evaluate(args, model, val_loader): + pred_probs_mc = [] + test_loss = 0 + correct = 0 + output_list = [] + labels_list = [] + model.eval() + with torch.no_grad(): + begin = time.time() + i=0 + for data, target in val_loader: + if torch.cuda.is_available(): + data, target = data.cuda(), target.cuda() + else: + data, target = data.cpu(), target.cpu() + output_mc = [] + for mc_run in range(args.num_monte_carlo): + output = model.forward(data) + output_mc.append(output) + output_ = torch.stack(output_mc) + output_list.append(output_) + labels_list.append(target) + i+=1 + # if i==10: + # break + end = time.time() + print("inference throughput: ", 50000 / (end - begin), " images/s") + + # output = torch.stack(output_list) + # output = output.permute(1, 0, 2, 3) + # output = output.contiguous().view(args.num_monte_carlo, len_valset, -1) + output = torch.cat(output_list, 1) + output = torch.nn.functional.softmax(output, dim=2) + labels = torch.cat(labels_list) + pred_mean = output.mean(dim=0) + Y_pred = torch.argmax(pred_mean, axis=1) + + np.save("./probs_cifar_mc.npy", output.data.cpu().numpy()) + np.save("./cifar_test_labels_mc.npy", labels.data.cpu().numpy()) + print(Y_pred.shape, labels.shape) + print(Y_pred[:100], labels[:100]) + print("Test accuracy:", 
(Y_pred.data.cpu().numpy() == labels.data.cpu().numpy()).mean() * 100) + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + """ + Save the training model + """ + torch.save(state, filename) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == "__main__": + main() diff --git a/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py b/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py new file mode 100644 index 0000000..8b18234 --- /dev/null +++ b/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py @@ -0,0 +1,282 @@ +''' +Bayesian ResNet for CIFAR10. + +ResNet architecture ref: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 +''' + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from bayesian_torch.layers import QuantizedConv2dFlipout +from bayesian_torch.layers import QuantizedLinearFlipout +from torch.nn.quantized import BatchNorm2d as QuantizedBatchNorm2d +from torch.nn import Identity + +__all__ = [ + 'QResNet', 'qresnet18', 'qresnet34', 'qresnet50', 'qresnet101', 'qresnet152' +] + +def _weights_init(m): + classname = m.__class__.__name__ + if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight) + + +class LambdaLayer(nn.Module): + def __init__(self, lambd): + super(LambdaLayer, self).__init__() + self.lambd = lambd + + def forward(self, x): + return self.lambd(x) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1, option='A', bias=False): + super(BasicBlock, self).__init__() + self.conv1 = QuantizedConv2dFlipout( + in_channels=in_planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + bias=bias) + self.bn1 = QuantizedBatchNorm2d(planes) + self.conv2 = QuantizedConv2dFlipout( + in_channels=planes, + out_channels=planes, + kernel_size=3, + stride=1, + padding=1, + bias=bias) + self.bn2 = QuantizedBatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != planes: + if option == 'A': + """ + For CIFAR10 ResNet paper uses option A. 
+ """ + self.shortcut = LambdaLayer(lambda x: F.pad( + x[:, :, ::2, ::2], + (0, 0, 0, 0, planes // 4, planes // 4), "constant", 0)) + elif option == 'B': + self.shortcut = nn.Sequential( + QuantizedConv2dFlipout( + in_channels=in_planes, + out_channels=self.expansion * planes, + kernel_size=1, + stride=stride, + bias=bias), QuantizedBatchNorm2d(self.expansion * planes)) + + def forward(self, x): + out, _ = self.conv1(x) + out = self.bn1(out) + out = F.relu(out) + out, _ = self.conv2(out) + out = self.bn2(out) + sh = self.shortcut(x.contiguous()).contiguous() + new_scale = max(out.q_scale(), sh.q_scale()) + out = torch.ops.quantized.add(out, sh, new_scale, 0) + # out += self.shortcut(x) + out = F.relu(out) + return out, 0 # kl=0 + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, bias=False): + super(Bottleneck, self).__init__() + self.conv1 = QuantizedConv2dFlipout( + in_channels=inplanes, + out_channels=planes, + kernel_size=1, + bias=bias) + self.bn1 =QuantizedBatchNorm2d(planes) + self.conv2 = QuantizedConv2dFlipout( + in_channels=planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + bias=bias) + self.bn2 = QuantizedBatchNorm2d(planes) + self.conv3 = QuantizedConv2dFlipout( + in_channels=planes, + out_channels=planes * 4, + kernel_size=1, + bias=bias) + self.bn3 = QuantizedBatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + kl_sum = 0 + out, kl = self.conv1(x) + kl_sum += kl + out = self.bn1(out) + out = self.relu(out) + + out, kl = self.conv2(out) + kl_sum += kl + out = self.bn2(out) + out = self.relu(out) + + out, kl = self.conv3(out) + kl_sum += kl + out = self.bn3(out) + + if self.downsample is not None: + residual, kl = self.downsample(x) + kl_sum += kl + + # out += residual + new_scale = max(out.q_scale(), residual.q_scale()) + out = torch.ops.quantized.add(out, residual, new_scale, 0) + out = self.relu(out) + + return out, kl_sum + +class QResNet(nn.Module): + def __init__(self, block, layers, num_classes=1000, bias=False): + super(QResNet, self).__init__() + self.inplanes = 64 + self.conv1 = QuantizedConv2dFlipout( + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3, + bias=bias) + self.bn1 = QuantizedBatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0], bias=bias) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, bias=bias) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, bias=bias) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, bias=bias) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc = QuantizedLinearFlipout( + in_features=512 * block.expansion, + out_features=num_classes, + ) + + self.apply(_weights_init) + + def _make_layer(self, block, planes, blocks, stride=1, bias=False): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + QuantizedConv2dFlipout(in_channels=self.inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=stride, + bias=bias), + QuantizedBatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, bias=bias)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, 
bias=bias)) + + return nn.Sequential(*layers) + + def quant_then_dequant(self, m, fuse_conv_bn=False): ## quantize only; need to rename this function + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + self.quant_then_dequant(m._modules[name], fuse_conv_bn=fuse_conv_bn) + + if "QuantizedConv" in m._modules[name].__class__.__name__: + m._modules[name].quantize() + m._modules[name].quantized_sigma_bias = None ### work around + m._modules[name].dnn_to_bnn_flag = True ## since we don't compute kl in quantized models, this flag will be removed after refactoring + + if "QuantizedLinear" in m._modules[name].__class__.__name__: + m._modules[name].quantize() + m._modules[name].dnn_to_bnn_flag = True ## since we don't compute kl in quantized models, this flag will be removed after refactoring + + if fuse_conv_bn and "BatchNorm2d" in m._modules[name].__class__.__name__: # quite confusing, should be quantizedbatchnorm2d + setattr(m, name, Identity()) + + def forward(self, x): + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + for layer in self.layer1: + x = layer(x) + + for layer in self.layer2: + x = layer(x) + + for layer in self.layer3: + x = layer(x) + + for layer in self.layer4: + x = layer(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def qresnet18(pretrained=False, **kwargs): + model = QResNet(BasicBlock, [2, 2, 2, 2], **kwargs) + return model + + +def qresnet34(pretrained=False, **kwargs): + model = QResNet(BasicBlock, [3, 4, 6, 3], **kwargs) + return model + + +def qresnet50(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + return model + + +def qresnet101(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + return model + + +def qresnet152(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 8, 36, 3], **kwargs) + return model + + + +def test(net): + import numpy as np + total_params = 0 + + for x in filter(lambda p: p.requires_grad, net.parameters()): + total_params += np.prod(x.data.numpy().shape) + print("Total number of params", total_params) + print( + "Total layers", + len( + list( + filter(lambda p: p.requires_grad and len(p.data.size()) > 1, + net.parameters())))) + + +if __name__ == "__main__": + for net_name in __all__: + if net_name.startswith('qresnet'): + print(net_name) + test(globals()[net_name]()) + print() diff --git a/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py b/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py new file mode 100644 index 0000000..6f3077e --- /dev/null +++ b/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py @@ -0,0 +1,282 @@ +''' +Bayesian ResNet for CIFAR10. + +ResNet architecture ref: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 +''' + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from bayesian_torch.layers import QuantizedConv2dReparameterization +from bayesian_torch.layers import QuantizedLinearReparameterization +from torch.nn.quantized import BatchNorm2d as QuantizedBatchNorm2d +from torch.nn import Identity + +__all__ = [ + 'QResNet', 'qresnet18', 'qresnet34', 'qresnet50', 'qresnet101', 'qresnet152' +] + +def _weights_init(m): + classname = m.__class__.__name__ + if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight) + + +class LambdaLayer(nn.Module): + def __init__(self, lambd): + super(LambdaLayer, self).__init__() + self.lambd = lambd + + def forward(self, x): + return self.lambd(x) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1, option='A', bias=False): + super(BasicBlock, self).__init__() + self.conv1 = QuantizedConv2dReparameterization( + in_channels=in_planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + bias=bias) + self.bn1 = QuantizedBatchNorm2d(planes) + self.conv2 = QuantizedConv2dReparameterization( + in_channels=planes, + out_channels=planes, + kernel_size=3, + stride=1, + padding=1, + bias=bias) + self.bn2 = QuantizedBatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != planes: + if option == 'A': + """ + For CIFAR10 ResNet paper uses option A. + """ + self.shortcut = LambdaLayer(lambda x: F.pad( + x[:, :, ::2, ::2], + (0, 0, 0, 0, planes // 4, planes // 4), "constant", 0)) + elif option == 'B': + self.shortcut = nn.Sequential( + QuantizedConv2dReparameterization( + in_channels=in_planes, + out_channels=self.expansion * planes, + kernel_size=1, + stride=stride, + bias=bias), QuantizedBatchNorm2d(self.expansion * planes)) + + def forward(self, x): + out, _ = self.conv1(x) + out = self.bn1(out) + out = F.relu(out) + out, _ = self.conv2(out) + out = self.bn2(out) + sh = self.shortcut(x.contiguous()).contiguous() + new_scale = max(out.q_scale(), sh.q_scale()) + out = torch.ops.quantized.add(out, sh, new_scale, 0) + # out += self.shortcut(x) + out = F.relu(out) + return out, 0 # kl=0 + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, bias=False): + super(Bottleneck, self).__init__() + self.conv1 = QuantizedConv2dReparameterization( + in_channels=inplanes, + out_channels=planes, + kernel_size=1, + bias=bias) + self.bn1 =QuantizedBatchNorm2d(planes) + self.conv2 = QuantizedConv2dReparameterization( + in_channels=planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + bias=bias) + self.bn2 = QuantizedBatchNorm2d(planes) + self.conv3 = QuantizedConv2dReparameterization( + in_channels=planes, + out_channels=planes * 4, + kernel_size=1, + bias=bias) + self.bn3 = QuantizedBatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + kl_sum = 0 + out, kl = self.conv1(x) + kl_sum += kl + out = self.bn1(out) + out = self.relu(out) + + out, kl = self.conv2(out) + kl_sum += kl + out = self.bn2(out) + out = self.relu(out) + + out, kl = self.conv3(out) + kl_sum += kl + out = self.bn3(out) + + if self.downsample is not None: + residual, kl = self.downsample(x) + kl_sum += kl + + # out += residual + new_scale = max(out.q_scale(), residual.q_scale()) + out = torch.ops.quantized.add(out, residual, new_scale, 0) + out 
= self.relu(out) + + return out, kl_sum + +class QResNet(nn.Module): + def __init__(self, block, layers, num_classes=1000, bias=False): + super(QResNet, self).__init__() + self.inplanes = 64 + self.conv1 = QuantizedConv2dReparameterization( + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3, + bias=bias) + self.bn1 = QuantizedBatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0], bias=bias) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, bias=bias) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, bias=bias) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, bias=bias) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc = QuantizedLinearReparameterization( + in_features=512 * block.expansion, + out_features=num_classes, + ) + + self.apply(_weights_init) + + def _make_layer(self, block, planes, blocks, stride=1, bias=False): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + QuantizedConv2dReparameterization(in_channels=self.inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=stride, + bias=bias), + QuantizedBatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, bias=bias)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, bias=bias)) + + return nn.Sequential(*layers) + + def quant_then_dequant(self, m, fuse_conv_bn=False): ## quantize only; need to rename this function + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + self.quant_then_dequant(m._modules[name], fuse_conv_bn=fuse_conv_bn) + + if "QuantizedConv" in m._modules[name].__class__.__name__: + m._modules[name].quantize() + m._modules[name].quantized_sigma_bias = None ### work around + m._modules[name].dnn_to_bnn_flag = True ## since we don't compute kl in quantized models, this flag will be removed after refactoring + + if "QuantizedLinear" in m._modules[name].__class__.__name__: + m._modules[name].quantize() + m._modules[name].dnn_to_bnn_flag = True ## since we don't compute kl in quantized models, this flag will be removed after refactoring + + if fuse_conv_bn and "BatchNorm2d" in m._modules[name].__class__.__name__: # quite confusing, should be quantizedbatchnorm2d + setattr(m, name, Identity()) + + def forward(self, x): + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + for layer in self.layer1: + x = layer(x) + + for layer in self.layer2: + x = layer(x) + + for layer in self.layer3: + x = layer(x) + + for layer in self.layer4: + x = layer(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def qresnet18(pretrained=False, **kwargs): + model = QResNet(BasicBlock, [2, 2, 2, 2], **kwargs) + return model + + +def qresnet34(pretrained=False, **kwargs): + model = QResNet(BasicBlock, [3, 4, 6, 3], **kwargs) + return model + + +def qresnet50(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + return model + + +def qresnet101(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + return model + + +def qresnet152(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 8, 36, 3], **kwargs) + return model + + + +def test(net): + import numpy as np + 
total_params = 0 + + for x in filter(lambda p: p.requires_grad, net.parameters()): + total_params += np.prod(x.data.numpy().shape) + print("Total number of params", total_params) + print( + "Total layers", + len( + list( + filter(lambda p: p.requires_grad and len(p.data.size()) > 1, + net.parameters())))) + + +if __name__ == "__main__": + for net_name in __all__: + if net_name.startswith('qresnet'): + print(net_name) + test(globals()[net_name]()) + print() From 51385b39c23ae92815a8845d4cf95243f8c99e52 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Feb 2023 02:48:32 -0500 Subject: [PATCH 52/69] remove kl computations --- .../main_bayesian_imagenet_bnn2qbnn.py | 2 +- .../quantized_resnet_flipout_large.py | 21 +++++++------------ .../quantized_resnet_variational_large.py | 21 +++++++------------ 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py index 687c1d0..1577651 100644 --- a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py +++ b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py @@ -151,7 +151,7 @@ def evaluate(args, model, val_loader): data, target = data.cpu(), target.cpu() output_mc = [] for mc_run in range(args.num_monte_carlo): - output, _ = model.forward(data) + output = model.forward(data) output_mc.append(output) output_ = torch.stack(output_mc) output_list.append(output_) diff --git a/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py b/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py index 8b18234..61c0dd0 100644 --- a/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py +++ b/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py @@ -75,17 +75,17 @@ def __init__(self, in_planes, planes, stride=1, option='A', bias=False): bias=bias), QuantizedBatchNorm2d(self.expansion * planes)) def forward(self, x): - out, _ = self.conv1(x) + out = self.conv1(x) out = self.bn1(out) out = F.relu(out) - out, _ = self.conv2(out) + out = self.conv2(out) out = self.bn2(out) sh = self.shortcut(x.contiguous()).contiguous() new_scale = max(out.q_scale(), sh.q_scale()) out = torch.ops.quantized.add(out, sh, new_scale, 0) # out += self.shortcut(x) out = F.relu(out) - return out, 0 # kl=0 + return out class Bottleneck(nn.Module): expansion = 4 @@ -118,31 +118,26 @@ def __init__(self, inplanes, planes, stride=1, downsample=None, bias=False): def forward(self, x): residual = x - kl_sum = 0 - out, kl = self.conv1(x) - kl_sum += kl + out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - out, kl = self.conv2(out) - kl_sum += kl + out = self.conv2(out) out = self.bn2(out) out = self.relu(out) - out, kl = self.conv3(out) - kl_sum += kl + out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: - residual, kl = self.downsample(x) - kl_sum += kl + residual = self.downsample(x) # out += residual new_scale = max(out.q_scale(), residual.q_scale()) out = torch.ops.quantized.add(out, residual, new_scale, 0) out = self.relu(out) - return out, kl_sum + return out class QResNet(nn.Module): def __init__(self, block, layers, num_classes=1000, bias=False): diff --git a/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py b/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py index 6f3077e..6d0a57e 100644 --- a/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py +++ b/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py @@ 
-75,17 +75,17 @@ def __init__(self, in_planes, planes, stride=1, option='A', bias=False): bias=bias), QuantizedBatchNorm2d(self.expansion * planes)) def forward(self, x): - out, _ = self.conv1(x) + out = self.conv1(x) out = self.bn1(out) out = F.relu(out) - out, _ = self.conv2(out) + out = self.conv2(out) out = self.bn2(out) sh = self.shortcut(x.contiguous()).contiguous() new_scale = max(out.q_scale(), sh.q_scale()) out = torch.ops.quantized.add(out, sh, new_scale, 0) # out += self.shortcut(x) out = F.relu(out) - return out, 0 # kl=0 + return out class Bottleneck(nn.Module): expansion = 4 @@ -118,31 +118,26 @@ def __init__(self, inplanes, planes, stride=1, downsample=None, bias=False): def forward(self, x): residual = x - kl_sum = 0 - out, kl = self.conv1(x) - kl_sum += kl + out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - out, kl = self.conv2(out) - kl_sum += kl + out = self.conv2(out) out = self.bn2(out) out = self.relu(out) - out, kl = self.conv3(out) - kl_sum += kl + out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: - residual, kl = self.downsample(x) - kl_sum += kl + residual = self.downsample(x) # out += residual new_scale = max(out.q_scale(), residual.q_scale()) out = torch.ops.quantized.add(out, residual, new_scale, 0) out = self.relu(out) - return out, kl_sum + return out class QResNet(nn.Module): def __init__(self, block, layers, num_classes=1000, bias=False): From 525c4625c3cf5e971d8ae15eaf3624ba97904aaa Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Feb 2023 19:42:53 -0500 Subject: [PATCH 53/69] quantization directory structure --- bayesian_torch/ao/__init__.py | 0 bayesian_torch/ao/nn/__init__.py | 0 bayesian_torch/ao/nn/quantized/__init__.py | 0 .../modules/quantize_conv_variational.py | 1428 +++++++++++++++++ .../modules/quantize_linear_variational.py | 204 +++ .../modules/quantized_conv_flipout.py | 1303 +++++++++++++++ .../modules/quantized_linear_flipout.py | 206 +++ bayesian_torch/ao/quantization/__init__.py | 2 + bayesian_torch/ao/quantization/quantize.py | 9 + 9 files changed, 3152 insertions(+) create mode 100644 bayesian_torch/ao/__init__.py create mode 100644 bayesian_torch/ao/nn/__init__.py create mode 100644 bayesian_torch/ao/nn/quantized/__init__.py create mode 100644 bayesian_torch/ao/nn/quantized/modules/quantize_conv_variational.py create mode 100644 bayesian_torch/ao/nn/quantized/modules/quantize_linear_variational.py create mode 100644 bayesian_torch/ao/nn/quantized/modules/quantized_conv_flipout.py create mode 100644 bayesian_torch/ao/nn/quantized/modules/quantized_linear_flipout.py create mode 100644 bayesian_torch/ao/quantization/__init__.py create mode 100644 bayesian_torch/ao/quantization/quantize.py diff --git a/bayesian_torch/ao/__init__.py b/bayesian_torch/ao/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bayesian_torch/ao/nn/__init__.py b/bayesian_torch/ao/nn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bayesian_torch/ao/nn/quantized/__init__.py b/bayesian_torch/ao/nn/quantized/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bayesian_torch/ao/nn/quantized/modules/quantize_conv_variational.py b/bayesian_torch/ao/nn/quantized/modules/quantize_conv_variational.py new file mode 100644 index 0000000..a8b25dc --- /dev/null +++ b/bayesian_torch/ao/nn/quantized/modules/quantize_conv_variational.py @@ -0,0 +1,1428 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or 
without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# @authors: Jun-Liang Lin +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter +from ..base_variational_layer import BaseVariationalLayer_ +from .conv_variational import * +import math + +__all__ = [ + 'QuantizedConv1dReparameterization', + 'QuantizedConv2dReparameterization', + 'QuantizedConv3dReparameterization', + 'QuantizedConvTranspose1dReparameterization', + 'QuantizedConvTranspose2dReparameterization', + 'QuantizedConvTranspose3dReparameterization', +] + + +class QuantizedConv1dReparameterization(Conv1dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(QuantizedConv1dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. 
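        # Restores fp32 tensors -- mu_kernel / sigma_weight and, when a bias is present,
        # mu_bias / sigma_bias -- from their quantized counterparts, so the deprecated
        # enable_int8_compute=False path in forward() can sample weights as mu + sigma * eps
        # with ordinary floating-point ops.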
+ self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
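            # The bias is deliberately kept in fp32: the torch.nn.quantized functional conv ops
            # take a floating-point bias tensor, so only the weight statistics above are held as
            # int8 quantized tensors.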
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + + + +class QuantizedConv2dReparameterization(Conv2dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + + """ + + super(QuantizedConv2dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + + +class QuantizedConv3dReparameterization(Conv3dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(QuantizedConv3dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. 
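        # Same fp32-restore helper as in the 1-D and 2-D layers above; it exists only so the
        # deprecated enable_int8_compute=False branch of forward() can run with F.conv3d.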
+ self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
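            # If the bias came from conv+bn fusion (quantized_sigma_bias is None) it is used
            # deterministically; otherwise it is sampled below as mu_bias + sigma_bias * eps.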
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + +class QuantizedConvTranspose1dReparameterization(ConvTranspose1dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose1dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose1d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + +class QuantizedConvTranspose2dReparameterization(ConvTranspose2dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose2dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. 
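+        # Rebuilds fp32 mu/sigma tensors from the stored int8 parameters so that
+        # the fp32 sampling path (forward with enable_int8_compute=False) can run
+        # without int8 kernels.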
+ self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose2d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
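+            # Bias stays in fp32. quantized_sigma_bias is None when the bias comes
+            # from conv+bn fusion (see quantize()), in which case the deterministic
+            # fused bias is used as-is; otherwise the bias is sampled as usual.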
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + +class QuantizedConvTranspose3dReparameterization(ConvTranspose3dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose3dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose3d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + + if return_kl: + return out, 0 # disable kl divergence computing + + return out \ No newline at end of file diff --git a/bayesian_torch/ao/nn/quantized/modules/quantize_linear_variational.py b/bayesian_torch/ao/nn/quantized/modules/quantize_linear_variational.py new file mode 100644 index 0000000..e666f9b --- /dev/null +++ b/bayesian_torch/ao/nn/quantized/modules/quantize_linear_variational.py @@ -0,0 +1,204 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Module, Parameter +from ..base_variational_layer import BaseVariationalLayer_ +import math +from .linear_variational import LinearReparameterization + + + +class QuantizedLinearReparameterization(LinearReparameterization): + def __init__(self, + in_features, + out_features): + """ + + """ + super(QuantizedLinearReparameterization, self).__init__( + in_features, + out_features) + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_weight), requires_grad=False) + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_weight))), requires_grad=False) + delattr(self, "mu_weight") + delattr(self, "rho_weight") + + self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + delattr(self, "mu_bias") + delattr(self, "rho_bias") + + def dequantize(self): # Deprecated + self.mu_weight = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
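+            # fp32 fallback: dequantize the stored int8 parameters once, then apply
+            # the standard reparameterization sampling (mu + sigma * eps) in float.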
+ if not self.is_dequant: + self.dequantize() + self.is_dequant = True + weight = self.mu_weight + (self.sigma_weight * self.eps_weight.data.normal_()) + bias = None + if self.sigma_bias is not None: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.linear(input, weight, bias) + + else: + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + if self.quantized_sigma_bias is not None: + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + if input.dtype!=torch.quint8: + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) + + out = torch.nn.quantized.functional.linear(input, weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + out = out.dequantize() + + if return_kl: + return out, 0 # disable kl divergence computing + + return out diff --git a/bayesian_torch/ao/nn/quantized/modules/quantized_conv_flipout.py b/bayesian_torch/ao/nn/quantized/modules/quantized_conv_flipout.py new file mode 100644 index 0000000..cf771c7 --- /dev/null +++ b/bayesian_torch/ao/nn/quantized/modules/quantized_conv_flipout.py @@ -0,0 +1,1303 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Convolutional layers with flipout Monte Carlo weight estimator to perform +# variational inference in Bayesian neural networks. 
+# Variational layers enable Monte Carlo approximation of the distribution over the kernel
+#
+#
+# ======================================================================================
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Parameter
+from ..base_variational_layer import BaseVariationalLayer_
+from .conv_flipout import *
+
+from torch.distributions.normal import Normal
+from torch.distributions.uniform import Uniform
+
+__all__ = [
+    'QuantizedConv1dFlipout',
+    'QuantizedConv2dFlipout',
+    'QuantizedConv3dFlipout',
+    'QuantizedConvTranspose1dFlipout',
+    'QuantizedConvTranspose2dFlipout',
+    'QuantizedConvTranspose3dFlipout',
+]
+
+
+class QuantizedConv1dFlipout(Conv1dFlipout):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=False):
+        """
+        """
+        super(QuantizedConv1dFlipout, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias=bias)
+
+        # for conv bn fusion
+        self.bn_weight = None
+        self.bn_bias = None
+        self.bn_running_mean = None
+        self.bn_running_var = None
+        self.bn_eps = None
+
+        self.is_dequant = False
+
+    def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255):
+        """ An implementation for symmetric quantization
+
+        Parameters
+        ----------
+        x: tensor
+           Input tensor.
+        upper_bound: int, optional
+           Restrict the maximum value of the original tensor (select 100 empirically).
+        target_range: int, optional
+           The range of target data type (255 for int8)
+
+        Returns
+        ----------
+        scale: float
+
+        zero_point: int
+
+        """
+        scale = torch.zeros(1).to(x.device) # initialize
+        zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization
+        xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative)
+        scale = xmax*2/target_range # original range divided by target range
+        return scale, zero_point
+
+    def get_quantized_tensor(self, x, default_scale=0.1):
+        """ Quantize tensors
+
+        Parameters
+        ----------
+        x: tensors
+           Input tensor.
+
+        default_scale: float, optional
+           Default scale for the case that the computed scale is zero.
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
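+
+        Example
+        ----------
+        A minimal usage sketch (illustrative only; assumes `qconv` was obtained by
+        running quantize() on a trained Conv1dFlipout layer):
+
+        >>> x = torch.randn(2, 3, 32)      # fp32 input, quantized internally
+        >>> out, kl = qconv(x)             # out is a quint8 tensor, kl == 0
+        >>> out_fp32 = out.dequantize()    # convert back to fp32 when needed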
+ + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv1d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv1d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + + +class QuantizedConv2dFlipout(Conv2dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): # be aware of bias + """ + + """ + super(QuantizedConv2dFlipout, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. 
+ + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv2d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + + +class QuantizedConv3dFlipout(Conv3dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConv3dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. 
+ + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv3d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv3d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + +class QuantizedConvTranspose1dFlipout(ConvTranspose1dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose1dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. 
+ + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + +class QuantizedConvTranspose2dFlipout(ConvTranspose2dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose2dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
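For orientation, these quantized forwards all share the same Flipout structure: a mean path through mu plus a sign-modulated path through sigma*eps. A plain float conv2d sketch of the linearity they rely on (no quantization, arbitrary shapes chosen for illustration):

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)
mu = torch.randn(6, 3, 3, 3)
sigma, eps = torch.rand(6, 3, 3, 3) * 0.1, torch.randn(6, 3, 3, 3)

s_in = torch.empty_like(x).uniform_(-1, 1).sign()
s_out = torch.empty(F.conv2d(x, mu).shape).uniform_(-1, 1).sign()

# Flipout forward: mean path plus sign-modulated perturbation path
out = F.conv2d(x, mu) + s_out * F.conv2d(x * s_in, sigma * eps)

# because conv is linear in the weight, dropping the signs collapses this to an
# ordinary reparameterized forward with the sampled weight mu + sigma * eps
ref = F.conv2d(x, mu + sigma * eps)
print(torch.allclose(ref, F.conv2d(x, mu) + F.conv2d(x, sigma * eps), atol=1e-5))  # True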
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. 
+ + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose2d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose2d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + +class QuantizedConvTranspose3dFlipout(ConvTranspose3dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose3dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
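A side note on the sign sampling used in these forwards: uniform_(-1, 1).sign() draws (almost surely) +/-1 values, and storing them as quint8 with the default scale 0.1 and zero_point 128 keeps both signs exactly representable:

import torch

s = torch.zeros(10000).uniform_(-1, 1).sign()
print(s.unique(), s.mean())                  # tensor([-1., 1.]) and a mean close to 0

sq = torch.quantize_per_tensor(s, 0.1, 128, torch.quint8)
print(sq.int_repr().unique())                # tensor([118, 138], dtype=torch.uint8)
print(sq.dequantize().unique())              # tensor([-1., 1.]) recovered exactly, since 1 is a multiple of 0.1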
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. 
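On the new_scale used for the sigma*eps product in these forwards: with zero_point 0 on both operands, the integer product is exact under an output scale equal to the product of the input scales, as long as it fits in int8. A tiny sketch, assuming the quantized mul kernel accepts qint8 operands as the code above does:

import torch

a = torch.quantize_per_tensor(torch.tensor([0.5, -0.3]), 0.1, 0, torch.qint8)    # int repr  5, -3
b = torch.quantize_per_tensor(torch.tensor([0.2,  0.4]), 0.05, 0, torch.qint8)   # int repr  4,  8
out_scale = a.q_scale() * b.q_scale()                                            # 0.005
c = torch.ops.quantized.mul(a, b, out_scale, 0)
print(c.dequantize())   # tensor([ 0.1000, -0.1200]), the exact elementwise products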
+ + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose3d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose3d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out \ No newline at end of file diff --git a/bayesian_torch/ao/nn/quantized/modules/quantized_linear_flipout.py b/bayesian_torch/ao/nn/quantized/modules/quantized_linear_flipout.py new file mode 100644 index 0000000..289da98 --- /dev/null +++ b/bayesian_torch/ao/nn/quantized/modules/quantized_linear_flipout.py @@ -0,0 +1,206 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. 
Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Linear Flipout Layers with flipout weight estimator to perform +# variational inference in Bayesian neural networks. Variational layers +# enables Monte Carlo approximation of the distribution over the weights +# +# @authors: Jun-Liang Lin +# +# ====================================================================================== +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Module, Parameter +from torch.distributions.normal import Normal +from torch.distributions.uniform import Uniform + +from .linear_flipout import LinearFlipout + +__all__ = ["QuantizedLinearFlipout"] + +class QuantizedLinearFlipout(LinearFlipout): + def __init__(self, + in_features, + out_features): + + super(QuantizedLinearFlipout, self).__init__( + in_features, + out_features) + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
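Before the rest of this class, a minimal sketch of the quint8-activation / qint8-weight convention its forward builds on, with standalone tensors rather than the layer's own parameters (assumes a build where the fbgemm or qnnpack quantized backend is available):

import torch

x = torch.randn(2, 8)
w = torch.randn(4, 8)

xq = torch.quantize_per_tensor(x, 0.1, 128, torch.quint8)   # activations: quint8, zero_point 128
wq = torch.quantize_per_tensor(w, 0.05, 0, torch.qint8)     # weights: qint8, symmetric

yq = torch.nn.quantized.functional.linear(xq, wq, bias=None, scale=0.1, zero_point=128)
print(yq.dtype, yq.dequantize().shape)                      # torch.quint8 torch.Size([2, 4])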
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_weight), requires_grad=False) + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_weight))), requires_grad=False) + delattr(self, "mu_weight") + delattr(self, "rho_weight") + + self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + delattr(self, "mu_bias") + delattr(self, "rho_bias") + + def dequantize(self): + self.mu_weight = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. Already dequantized. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
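A quick numeric check of the normal_scale = 6/255 choice documented here: that scale gives qint8 a span of roughly +/-3, so about 99.7% of standard-normal draws survive the round trip and the remainder are clipped.

import torch

eps = torch.randn(100000)
q = torch.quantize_per_tensor(eps, 6 / 255, 0, torch.qint8)

print(127 * 6 / 255)                                          # ~2.99, about three standard deviations
print((eps.abs() > 3).float().mean())                         # ~0.003, the fraction that gets clipped
print((q.dequantize() - eps).abs()[eps.abs() < 2.9].max())    # within half a step (~0.012) for in-range values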
+ + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + bias = None + if self.quantized_mu_bias is not None: + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + bias = self.mu_bias + + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + + bias = None + if self.quantized_sigma_bias is not None: + eps_bias = self.eps_bias.data.normal_() + bias = (self.sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.linear(x, + weight=delta_weight, bias=bias, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + out = out.dequantize() + + if return_kl: + return out, 0 + + return out diff --git a/bayesian_torch/ao/quantization/__init__.py b/bayesian_torch/ao/quantization/__init__.py new file mode 100644 index 0000000..5d672c7 --- /dev/null +++ b/bayesian_torch/ao/quantization/__init__.py @@ -0,0 +1,2 @@ +## bayesian_torch.quantization.prepare +## bayesian_torch.quantization.convert \ No newline at end of file diff --git a/bayesian_torch/ao/quantization/quantize.py b/bayesian_torch/ao/quantization/quantize.py new file mode 100644 index 0000000..fc7975a --- /dev/null +++ b/bayesian_torch/ao/quantization/quantize.py @@ -0,0 +1,9 @@ +""" +define prepare and convert function +""" + +def prepare(): + return + +def convert(): + return \ No newline at end of file From 3360bcf06a9cb46ea9e610521fbecae3bd4d5252 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Feb 2023 22:40:12 -0500 Subject: [PATCH 54/69] example for prepare function --- .../variational_layers/conv_variational2.py | 245 ++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 bayesian_torch/layers/variational_layers/conv_variational2.py diff --git a/bayesian_torch/layers/variational_layers/conv_variational2.py b/bayesian_torch/layers/variational_layers/conv_variational2.py new file mode 100644 index 0000000..8ec18d3 --- /dev/null +++ b/bayesian_torch/layers/variational_layers/conv_variational2.py @@ -0,0 +1,245 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Convolutional Layers with reparameterization estimator to perform variational +# inference in Bayesian neural networks. Reparameterization layers +# enables Monte Carlo approximation of the distribution over 'kernel' and 'bias'. +# +# Kullback-Leibler divergence between the surrogate posterior and prior is computed +# and returned along with the tensors of outputs after convolution operation, which is +# required to compute Evidence Lower Bound (ELBO). +# +# @authors: Ranganath Krishnan +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter +from bayesian_torch.layers.base_variational_layer import BaseVariationalLayer_, get_kernel_size +import math +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver +from torch.quantization.qconfig import QConfig + +__all__ = [ + # 'Conv1dReparameterization', + # 'Conv2dReparameterization', + # 'Conv3dReparameterization', + # 'ConvTranspose1dReparameterization', + # 'ConvTranspose2dReparameterization', + # 'ConvTranspose3dReparameterization', +] + + + + +class Conv2dReparameterization(BaseVariationalLayer_): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + prior_mean=0, + prior_variance=1, + posterior_mu_init=0, + posterior_rho_init=-3.0, + bias=True): + """ + Implements Conv2d layer with reparameterization trick. + + Inherits from bayesian_torch.layers.BaseVariationalLayer_ + + Parameters: + in_channels: int -> number of channels in the input image, + out_channels: int -> number of channels produced by the convolution, + kernel_size: int -> size of the convolving kernel, + stride: int -> stride of the convolution. Default: 1, + padding: int -> zero-padding added to both sides of the input. Default: 0, + dilation: int -> spacing between kernel elements. 
Default: 1, + groups: int -> number of blocked connections from input channels to output channels, + prior_mean: float -> mean of the prior arbitrary distribution to be used on the complexity cost, + prior_variance: float -> variance of the prior arbitrary distribution to be used on the complexity cost, + posterior_mu_init: float -> init trainable mu parameter representing mean of the approximate posterior, + posterior_rho_init: float -> init trainable rho parameter representing the sigma of the approximate posterior through softplus function, + bias: bool -> if set to False, the layer will not learn an additive bias. Default: True, + """ + + super(Conv2dReparameterization, self).__init__() + if in_channels % groups != 0: + raise ValueError('invalid in_channels size') + if out_channels % groups != 0: + raise ValueError('invalid in_channels size') + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.prior_mean = prior_mean + self.prior_variance = prior_variance + self.posterior_mu_init = posterior_mu_init, # mean of weight + # variance of weight --> sigma = log (1 + exp(rho)) + self.posterior_rho_init = posterior_rho_init, + self.bias = bias + + kernel_size = get_kernel_size(kernel_size, 2) + + self.mu_kernel = Parameter( + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) + self.rho_kernel = Parameter( + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) + self.register_buffer( + 'eps_kernel', + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), + persistent=False) + self.register_buffer( + 'prior_weight_mu', + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), + persistent=False) + self.register_buffer( + 'prior_weight_sigma', + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), + persistent=False) + + if self.bias: + self.mu_bias = Parameter(torch.Tensor(out_channels)) + self.rho_bias = Parameter(torch.Tensor(out_channels)) + self.register_buffer('eps_bias', torch.Tensor(out_channels), persistent=False) + self.register_buffer('prior_bias_mu', torch.Tensor(out_channels), persistent=False) + self.register_buffer('prior_bias_sigma', + torch.Tensor(out_channels), + persistent=False) + else: + self.register_parameter('mu_bias', None) + self.register_parameter('rho_bias', None) + self.register_buffer('eps_bias', None, persistent=False) + self.register_buffer('prior_bias_mu', None, persistent=False) + self.register_buffer('prior_bias_sigma', None, persistent=False) + + self.init_parameters() + + def prepare(self): + myconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8), + weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8)) + self.quant = nn.ModuleList([torch.quantization.QuantStub(myconfig) for _ in range(10)]) + self.dequant = torch.quantization.DeQuantStub() + + def init_parameters(self): + self.prior_weight_mu.fill_(self.prior_mean) + self.prior_weight_sigma.fill_(self.prior_variance) + + self.mu_kernel.data.normal_(mean=self.posterior_mu_init[0], std=0.1) + self.rho_kernel.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + if self.bias: + self.prior_bias_mu.fill_(self.prior_mean) + self.prior_bias_sigma.fill_(self.prior_variance) + + self.mu_bias.data.normal_(mean=self.posterior_mu_init[0], std=0.1) + 
self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], + std=0.1) + + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + + def forward(self, input, return_kl=True): + + input = self.quant[0](input) ### + + if self.dnn_to_bnn_flag: + return_kl = False + + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + eps_kernel = self.eps_kernel.data.normal_() + + sigma_weight = self.quant[1](sigma_weight) #### + eps_kernel = self.quant[2](eps_kernel) #### + mu_kernel = self.quant[3](self.mu_kernel) #### + + tmp_result = sigma_weight * eps_kernel + tmp_result = self.quant[4](tmp_result) #### + + weight = mu_kernel + tmp_result + + weight = self.quant[5](weight) #### + + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) + bias = None + + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + eps_bias = self.eps_bias.data.normal_() + bias = self.mu_bias + (sigma_bias * eps_bias) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) + + out = F.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + out = self.quant[6](out) #### + + if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight + return out, kl + + return out + +if __name__=="__main__": + m = Conv2dReparameterization(3,3,3) + m.eval() + m.qconfig = torch.quantization.get_default_qconfig("fbgemm") + mp = torch.quantization.prepare(m) + input = torch.randn(3,3,4,4) + mp(input) + mq = torch.quantization.convert(mp) From 51bab43b90757f6404b148d41037feb580d47700 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 19 Feb 2023 20:36:21 -0500 Subject: [PATCH 55/69] quantization prepare function --- .../variational_layers/conv_variational.py | 35 ++- .../variational_layers/conv_variational2.py | 245 ------------------ 2 files changed, 34 insertions(+), 246 deletions(-) delete mode 100644 bayesian_torch/layers/variational_layers/conv_variational2.py diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 0d2ebfd..0fd065f 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -295,6 +295,15 @@ def __init__(self, self.register_buffer('prior_bias_sigma', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): self.prior_weight_mu.fill_(self.prior_mean) @@ -325,7 +334,8 @@ def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() - weight = self.mu_kernel + (sigma_weight * eps_kernel) + tmp_result = sigma_weight * eps_kernel + weight = mu_kernel + tmp_result 
if return_kl: kl_weight = self.kl_div(self.mu_kernel, sigma_weight, @@ -342,6 +352,20 @@ def forward(self, input, return_kl=True): out = F.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) + + if self.quant_prepare: + # quint8 quantstub + input = self.quint_quant[0](input) # input + out = self.quint_quant[1](out) # output + + # qint8 quantstub + sigma_weight = self.qint_quant[0](sigma_weight) # weight + mu_kernel = self.qint_quant[1](self.mu_kernel) # weight + eps_kernel = self.qint_quant[2](eps_kernel) # random variable + tmp_result =self.qint_quant[3](tmp_result) # multiply activation + weight = self.qint_quant[4](weight) # add activatation + + if return_kl: if self.bias: kl = kl_weight + kl_bias @@ -946,3 +970,12 @@ def forward(self, input, return_kl=True): return out, kl return out + +if __name__=="__main__": + m = Conv2dReparameterization(3,3,3) + m.eval() + m.qconfig = torch.quantization.get_default_qconfig("fbgemm") + mp = torch.quantization.prepare(m) + input = torch.randn(3,3,4,4) + mp(input) + mq = torch.quantization.convert(mp) \ No newline at end of file diff --git a/bayesian_torch/layers/variational_layers/conv_variational2.py b/bayesian_torch/layers/variational_layers/conv_variational2.py deleted file mode 100644 index 8ec18d3..0000000 --- a/bayesian_torch/layers/variational_layers/conv_variational2.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (C) 2021 Intel Labs -# -# BSD-3-Clause License -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS -# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT -# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE -# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# -# Convolutional Layers with reparameterization estimator to perform variational -# inference in Bayesian neural networks. Reparameterization layers -# enables Monte Carlo approximation of the distribution over 'kernel' and 'bias'. -# -# Kullback-Leibler divergence between the surrogate posterior and prior is computed -# and returned along with the tensors of outputs after convolution operation, which is -# required to compute Evidence Lower Bound (ELBO). 
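The QuantStub modules registered in prepare() above only hold observers until conversion; a small standalone sketch of what one such observer does during calibration, using the same torch.quantization API:

import torch
from torch.quantization.observer import HistogramObserver

obs = HistogramObserver(dtype=torch.quint8)    # same observer type the activation qconfig uses
for _ in range(4):                             # calibration: feed representative activations
    obs(torch.randn(32, 16))
scale, zero_point = obs.calculate_qparams()
print(scale.item(), zero_point.item())         # the qparams a later convert step can bake in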
-# -# @authors: Ranganath Krishnan -# -# ====================================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn import Parameter -from bayesian_torch.layers.base_variational_layer import BaseVariationalLayer_, get_kernel_size -import math -from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver -from torch.quantization.qconfig import QConfig - -__all__ = [ - # 'Conv1dReparameterization', - # 'Conv2dReparameterization', - # 'Conv3dReparameterization', - # 'ConvTranspose1dReparameterization', - # 'ConvTranspose2dReparameterization', - # 'ConvTranspose3dReparameterization', -] - - - - -class Conv2dReparameterization(BaseVariationalLayer_): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - prior_mean=0, - prior_variance=1, - posterior_mu_init=0, - posterior_rho_init=-3.0, - bias=True): - """ - Implements Conv2d layer with reparameterization trick. - - Inherits from bayesian_torch.layers.BaseVariationalLayer_ - - Parameters: - in_channels: int -> number of channels in the input image, - out_channels: int -> number of channels produced by the convolution, - kernel_size: int -> size of the convolving kernel, - stride: int -> stride of the convolution. Default: 1, - padding: int -> zero-padding added to both sides of the input. Default: 0, - dilation: int -> spacing between kernel elements. Default: 1, - groups: int -> number of blocked connections from input channels to output channels, - prior_mean: float -> mean of the prior arbitrary distribution to be used on the complexity cost, - prior_variance: float -> variance of the prior arbitrary distribution to be used on the complexity cost, - posterior_mu_init: float -> init trainable mu parameter representing mean of the approximate posterior, - posterior_rho_init: float -> init trainable rho parameter representing the sigma of the approximate posterior through softplus function, - bias: bool -> if set to False, the layer will not learn an additive bias. 
Default: True, - """ - - super(Conv2dReparameterization, self).__init__() - if in_channels % groups != 0: - raise ValueError('invalid in_channels size') - if out_channels % groups != 0: - raise ValueError('invalid in_channels size') - - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - self.dilation = dilation - self.groups = groups - self.prior_mean = prior_mean - self.prior_variance = prior_variance - self.posterior_mu_init = posterior_mu_init, # mean of weight - # variance of weight --> sigma = log (1 + exp(rho)) - self.posterior_rho_init = posterior_rho_init, - self.bias = bias - - kernel_size = get_kernel_size(kernel_size, 2) - - self.mu_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1])) - self.rho_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1])) - self.register_buffer( - 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1]), - persistent=False) - self.register_buffer( - 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1]), - persistent=False) - self.register_buffer( - 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1]), - persistent=False) - - if self.bias: - self.mu_bias = Parameter(torch.Tensor(out_channels)) - self.rho_bias = Parameter(torch.Tensor(out_channels)) - self.register_buffer('eps_bias', torch.Tensor(out_channels), persistent=False) - self.register_buffer('prior_bias_mu', torch.Tensor(out_channels), persistent=False) - self.register_buffer('prior_bias_sigma', - torch.Tensor(out_channels), - persistent=False) - else: - self.register_parameter('mu_bias', None) - self.register_parameter('rho_bias', None) - self.register_buffer('eps_bias', None, persistent=False) - self.register_buffer('prior_bias_mu', None, persistent=False) - self.register_buffer('prior_bias_sigma', None, persistent=False) - - self.init_parameters() - - def prepare(self): - myconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8), - weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8)) - self.quant = nn.ModuleList([torch.quantization.QuantStub(myconfig) for _ in range(10)]) - self.dequant = torch.quantization.DeQuantStub() - - def init_parameters(self): - self.prior_weight_mu.fill_(self.prior_mean) - self.prior_weight_sigma.fill_(self.prior_variance) - - self.mu_kernel.data.normal_(mean=self.posterior_mu_init[0], std=0.1) - self.rho_kernel.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - if self.bias: - self.prior_bias_mu.fill_(self.prior_mean) - self.prior_bias_sigma.fill_(self.prior_variance) - - self.mu_bias.data.normal_(mean=self.posterior_mu_init[0], std=0.1) - self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], - std=0.1) - - def kl_loss(self): - sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) - if self.bias: - sigma_bias = torch.log1p(torch.exp(self.rho_bias)) - kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) - - return kl - - def forward(self, input, return_kl=True): - - input = self.quant[0](input) ### - - if self.dnn_to_bnn_flag: - return_kl = False - - sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) - eps_kernel = self.eps_kernel.data.normal_() - 
- sigma_weight = self.quant[1](sigma_weight) #### - eps_kernel = self.quant[2](eps_kernel) #### - mu_kernel = self.quant[3](self.mu_kernel) #### - - tmp_result = sigma_weight * eps_kernel - tmp_result = self.quant[4](tmp_result) #### - - weight = mu_kernel + tmp_result - - weight = self.quant[5](weight) #### - - if return_kl: - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) - bias = None - - if self.bias: - sigma_bias = torch.log1p(torch.exp(self.rho_bias)) - eps_bias = self.eps_bias.data.normal_() - bias = self.mu_bias + (sigma_bias * eps_bias) - if return_kl: - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) - - out = F.conv2d(input, weight, bias, self.stride, self.padding, - self.dilation, self.groups) - - out = self.quant[6](out) #### - - if return_kl: - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - return out, kl - - return out - -if __name__=="__main__": - m = Conv2dReparameterization(3,3,3) - m.eval() - m.qconfig = torch.quantization.get_default_qconfig("fbgemm") - mp = torch.quantization.prepare(m) - input = torch.randn(3,3,4,4) - mp(input) - mq = torch.quantization.convert(mp) From df094087e4c05cb1d411bae669947d6685c21650 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 19 Feb 2023 20:43:07 -0500 Subject: [PATCH 56/69] import quantization module --- bayesian_torch/layers/variational_layers/conv_variational.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 0fd065f..a9e33ba 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -48,6 +48,8 @@ from torch.nn import Parameter from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size import math +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver +from torch.quantization.qconfig import QConfig __all__ = [ 'Conv1dReparameterization', From b4ce3f5e3d042432a63a0c17d32453165370b839 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 19 Feb 2023 23:08:39 -0500 Subject: [PATCH 57/69] finish quantization function --- .../layers/variational_layers/conv_variational.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index a9e33ba..bb7d1a7 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -301,9 +301,9 @@ def __init__(self, def prepare(self): self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + QConfig(weight=HistogramObserver.with_args(dtype=torch.qint8), activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + QConfig(weight=HistogramObserver.with_args(dtype=torch.quint8), activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) self.dequant = torch.quantization.DeQuantStub() self.quant_prepare=True @@ -337,7 +337,7 @@ def forward(self, input, return_kl=True): sigma_weight = 
torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() tmp_result = sigma_weight * eps_kernel - weight = mu_kernel + tmp_result + weight = self.mu_kernel + tmp_result if return_kl: kl_weight = self.kl_div(self.mu_kernel, sigma_weight, @@ -976,6 +976,7 @@ def forward(self, input, return_kl=True): if __name__=="__main__": m = Conv2dReparameterization(3,3,3) m.eval() + m.prepare() m.qconfig = torch.quantization.get_default_qconfig("fbgemm") mp = torch.quantization.prepare(m) input = torch.randn(3,3,4,4) From 9b5a9dca7a62091ec586784b409b3990cf01142c Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 1 Mar 2023 23:07:58 -0500 Subject: [PATCH 58/69] quantization module prototype --- bayesian_torch/__init__.py | 1 + bayesian_torch/ao/quantization/__init__.py | 3 +- bayesian_torch/ao/quantization/quantize.py | 168 +++++++++++++++++- bayesian_torch/examples/quantization_test.py | 34 ++++ .../quantize_conv_variational.py | 70 +++++++- .../bayesian/resnet_variational_large.py | 4 +- bayesian_torch/quantization/__init__.py | 3 + bayesian_torch/quantization/quantize.py | 2 + 8 files changed, 272 insertions(+), 13 deletions(-) create mode 100644 bayesian_torch/examples/quantization_test.py create mode 100644 bayesian_torch/quantization/__init__.py create mode 100644 bayesian_torch/quantization/quantize.py diff --git a/bayesian_torch/__init__.py b/bayesian_torch/__init__.py index e69de29..da64647 100644 --- a/bayesian_torch/__init__.py +++ b/bayesian_torch/__init__.py @@ -0,0 +1 @@ +from bayesian_torch import quantization as quantization \ No newline at end of file diff --git a/bayesian_torch/ao/quantization/__init__.py b/bayesian_torch/ao/quantization/__init__.py index 5d672c7..dab2378 100644 --- a/bayesian_torch/ao/quantization/__init__.py +++ b/bayesian_torch/ao/quantization/__init__.py @@ -1,2 +1,3 @@ ## bayesian_torch.quantization.prepare -## bayesian_torch.quantization.convert \ No newline at end of file +## bayesian_torch.quantization.convert +from .quantize import * \ No newline at end of file diff --git a/bayesian_torch/ao/quantization/quantize.py b/bayesian_torch/ao/quantization/quantize.py index fc7975a..06fa99f 100644 --- a/bayesian_torch/ao/quantization/quantize.py +++ b/bayesian_torch/ao/quantization/quantize.py @@ -1,9 +1,163 @@ -""" -define prepare and convert function -""" +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
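Worth noting on the QConfig change in this patch: QConfig is a namedtuple pairing an activation observer factory with a weight observer factory, so both fields are normally supplied, as the updated prepare() now does. For example:

import torch
from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver
from torch.quantization.qconfig import QConfig

qconfig = QConfig(
    activation=HistogramObserver.with_args(dtype=torch.quint8),
    weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8,
                                              qscheme=torch.per_channel_symmetric),
)
stub = torch.quantization.QuantStub(qconfig)   # a stub carrying this observer pair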
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Define prepare and convert function +# -def prepare(): - return +import torch +import torch.nn as nn +from bayesian_torch.models.bayesian.resnet_variational_large import ( + BasicBlock, + Bottleneck, + ResNet, +) +from typing import Any, List, Optional, Type, Union +from torch import Tensor +from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn +# import copy -def convert(): - return \ No newline at end of file +__all__ = [ + "prepare", + "convert", +] + +class QuantizableBasicBlock(BasicBlock): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.add_relu = torch.nn.quantized.FloatFunctional() + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = self.add_relu.add_relu(out, identity) + + return out + + +class QuantizableBottleneck(Bottleneck): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.skip_add_relu = nn.quantized.FloatFunctional() + self.relu1 = nn.ReLU(inplace=False) + self.relu2 = nn.ReLU(inplace=False) + + def forward(self, x: Tensor) -> Tensor: + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu1(out) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu2(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + out = self.skip_add_relu.add_relu(out, identity) + + return out + + +class QuantizableResNet(ResNet): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + self.quant = torch.ao.quantization.QuantStub() + self.dequant = torch.ao.quantization.DeQuantStub() + + def forward(self, x: Tensor) -> Tensor: + x = self.quant(x) + + x= self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + for layer in self.layer1: + x=layer(x) + + for layer in self.layer2: + x = layer(x) + + for layer in self.layer3: + x = layer(x) + + for layer in self.layer4: + x = layer(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + + + # x = self.dequant(x) + return x + + + +def enable_prepare(m): + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + enable_prepare(m._modules[name]) + elif "Reparameterization" in m._modules[name].__class__.__name__ or "Flipout" in m._modules[name].__class__.__name__: + prepare = getattr(m._modules[name], "prepare", None) + if callable(prepare): + m._modules[name].prepare() + m._modules[name].dnn_to_bnn_flag=True + + +def prepare(model): + """ + 1. construct quantizable model + 2. traverse the model to enable the prepare function in each layer + 3. 
run torch.quantize.prepare() + """ + qmodel = QuantizableResNet(QuantizableBottleneck, [3, 4, 6, 3]) + qmodel.load_state_dict(model.state_dict()) + qmodel.eval() + enable_prepare(qmodel) + qmodel.qconfig = torch.quantization.get_default_qconfig("fbgemm") + qmodel = torch.quantization.prepare(qmodel) + + return qmodel + +def convert(model): + qmodel = torch.quantization.convert(model) # torch layers + bnn_to_qbnn(qmodel) # bayesian layers + return qmodel \ No newline at end of file diff --git a/bayesian_torch/examples/quantization_test.py b/bayesian_torch/examples/quantization_test.py new file mode 100644 index 0000000..bc18c25 --- /dev/null +++ b/bayesian_torch/examples/quantization_test.py @@ -0,0 +1,34 @@ +# import torch +# import bayesian_torch +# from bayesian_torch.ao.quantization import prepare, convert +# import bayesian_torch.models.bayesian.resnet_variational_large as resnet +# from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn + +# model = resnet.__dict__['resnet50']() + +# input = torch.randn(1,3,224,224) +# mp = prepare(model) +# mp(input) # haven't replaced the batchnorm layer +# qmodel = torch.quantization.convert(mp) +# bnn_to_qbnn(qmodel) + + +import torch +import bayesian_torch +import bayesian_torch.models.bayesian.resnet_variational_large as resnet + +m = resnet.__dict__['resnet50']() +# alternative way to construct a bnn model +# from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn +# m = torchvision.models.resnet50(weights="IMAGENET1K_V1") +# dnn_to_bnn(m) + + + +mp = bayesian_torch.quantization.prepare(m) +input = torch.randn(1,3,224,224) +mp(input) # calibration +mq = bayesian_torch.quantization.convert(mp) + + + diff --git a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py index a8b25dc..31ed9e7 100644 --- a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py @@ -93,6 +93,7 @@ def __init__(self, self.bn_eps = None self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -237,7 +238,26 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.dnn_to_bnn_flag: return_kl = False - if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if self.quant_dict is not None: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) + bias = None + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, self.quant_dict[3]['scale'], self.quant_dict[3]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=self.quant_dict[4]['scale'], zero_point=self.quant_dict[4]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + elif not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -323,6 +343,7 @@ def __init__(self, self.bn_eps = None self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -419,6 +440,10 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") + delattr(self, "qint_quant") + delattr(self, "quint_quant") + delattr(self, "dequant") + def dequantize(self): # Deprecated. Only for forward mode #1. self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) @@ -466,7 +491,26 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.dnn_to_bnn_flag: return_kl = False - if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if self.quant_dict is not None: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) + bias = None + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, self.quant_dict[3]['scale'], self.quant_dict[3]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=self.quant_dict[4]['scale'], zero_point=self.quant_dict[4]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + elif not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -550,6 +594,7 @@ def __init__(self, self.bn_eps = None self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -693,7 +738,26 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.dnn_to_bnn_flag: return_kl = False - if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if self.quant_dict is not None: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) + bias = None + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, self.quant_dict[3]['scale'], self.quant_dict[3]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=self.quant_dict[4]['scale'], zero_point=self.quant_dict[4]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + elif not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() self.is_dequant = True diff --git a/bayesian_torch/models/bayesian/resnet_variational_large.py b/bayesian_torch/models/bayesian/resnet_variational_large.py index bc641d6..6fdf561 100644 --- a/bayesian_torch/models/bayesian/resnet_variational_large.py +++ b/bayesian_torch/models/bayesian/resnet_variational_large.py @@ -14,7 +14,7 @@ from bayesian_torch.layers import BatchNorm2dLayer __all__ = [ - 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152' + 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'BasicBlock', 'Bottleneck' ] prior_mu = 0.0 @@ -200,7 +200,7 @@ def _make_layer(self, block, planes, blocks, stride=1): posterior_mu_init=posterior_mu_init, posterior_rho_init=posterior_rho_init, bias=False), - BatchNorm2dLayer(planes * block.expansion), + nn.BatchNorm2d(planes * block.expansion), ) layers = [] diff --git a/bayesian_torch/quantization/__init__.py b/bayesian_torch/quantization/__init__.py new file mode 100644 index 0000000..91a6e8b --- /dev/null +++ b/bayesian_torch/quantization/__init__.py @@ -0,0 +1,3 @@ +from .quantize import * + +# __all__ = ['prepare', 'convert'] \ No newline at end of file diff --git a/bayesian_torch/quantization/quantize.py b/bayesian_torch/quantization/quantize.py new file mode 100644 index 0000000..967f79a --- /dev/null +++ b/bayesian_torch/quantization/quantize.py @@ -0,0 +1,2 @@ +from bayesian_torch.ao.quantization.quantize import prepare +from bayesian_torch.ao.quantization.quantize import convert \ No newline at end of file From 9b0118f11f72e214a66f5b8b721dbce3b1eb9f2a Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Fri, 3 Mar 2023 09:30:46 -0500 Subject: [PATCH 59/69] bnn to qbnn --- bayesian_torch/models/bnn_to_qbnn.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index d689465..37441b5 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -119,6 +119,15 @@ def qbnn_conv_layer(d): groups=d.groups, ) qbnn_layer.__dict__.update(d.__dict__) + + if d.quant_prepare: + qbnn_layer.quant_dict = [] + for qstub in d.qint_quant: + qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict = qbnn_layer.quant_dict[2:] + for qstub in d.quint_quant: + qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quantize() if d.dnn_to_bnn_flag: qbnn_layer.dnn_to_bnn_flag = True @@ -180,7 +189,10 @@ def batch_norm_folding(conv, bn): def bnn_to_qbnn(m, fuse_conv_bn=False): for name, value in list(m._modules.items()): if m._modules[name]._modules: - bnn_to_qbnn(m._modules[name], fuse_conv_bn=fuse_conv_bn) + if "Conv" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_conv_layer(m._modules[name])) + else: + bnn_to_qbnn(m._modules[name], fuse_conv_bn=fuse_conv_bn) elif "Linear" in m._modules[name].__class__.__name__: setattr(m, name, qbnn_linear_layer(m._modules[name])) elif "LSTM" in m._modules[name].__class__.__name__: From b780aad54cb3c6baba568b53c794c38844933455 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 6 Mar 2023 02:11:38 -0500 Subject: [PATCH 60/69] qbnn example --- .../main_bayesian_imagenet_bnn2qbnn.py | 23 ++++++++++----- .../variational_layers/linear_variational.py | 29 +++++++++++++++++-- .../quantize_linear_variational.py | 22 +++++++++++++- bayesian_torch/models/bnn_to_qbnn.py | 9 ++++++ 4 
files changed, 72 insertions(+), 11 deletions(-) diff --git a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py index 1577651..2de3604 100644 --- a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py +++ b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py @@ -16,8 +16,8 @@ import bayesian_torch.models.bayesian.resnet_variational_large as resnet import numpy as np from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn -# import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet -import bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet +import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet +# import bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet torch.cuda.is_available = lambda : False os.environ["CUDA_VISIBLE_DEVICES"] = "-1" @@ -262,9 +262,16 @@ def main(): model.load_state_dict(checkpoint["state_dict"]) model.module = model.module.cpu() - bnn_to_qbnn(model, fuse_conv_bn=False) # only replaces linear and conv layers + mp = bayesian_torch.quantization.prepare(model) + evaluate(args, mp, val_loader) # calibration + qmodel = bayesian_torch.quantization.convert(mp) + evaluate(args, qmodel, val_loader) + + - model = model.cpu() + # bnn_to_qbnn(model, fuse_conv_bn=False) # only replaces linear and conv layers + + # model = model.cpu() # save weights # save_checkpoint( @@ -278,16 +285,16 @@ def main(): # args.save_dir, # 'quantized_bayesian_q{}_imagenet.pth'.format(args.arch))) - qmodel = torch.nn.DataParallel(qresnet.__dict__['q'+args.arch](bias=False)) # set bias=True to make qconv has bias - qmodel.module.quant_then_dequant(qmodel, fuse_conv_bn=False) + # qmodel = torch.nn.DataParallel(qresnet.__dict__['q'+args.arch](bias=False)) # set bias=True to make qconv has bias + # qmodel.module.quant_then_dequant(qmodel, fuse_conv_bn=False) # load weights # checkpoint_file = args.save_dir + "/quantized_bayesian_q{}_imagenet.pth".format(args.arch) # checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) # qmodel.load_state_dict(checkpoint["state_dict"]) - qmodel.load_state_dict(model.state_dict()) - evaluate(args, qmodel, val_loader) + # qmodel.load_state_dict(model.state_dict()) + # evaluate(args, qmodel, val_loader) if __name__ == "__main__": main() diff --git a/bayesian_torch/layers/variational_layers/linear_variational.py b/bayesian_torch/layers/variational_layers/linear_variational.py index 7efb667..d69bfff 100644 --- a/bayesian_torch/layers/variational_layers/linear_variational.py +++ b/bayesian_torch/layers/variational_layers/linear_variational.py @@ -116,6 +116,15 @@ def __init__(self, self.register_buffer('eps_bias', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=HistogramObserver.with_args(dtype=torch.qint8), activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=HistogramObserver.with_args(dtype=torch.quint8), activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): self.prior_weight_mu.fill_(self.prior_mean) @@ -147,8 +156,10 @@ def forward(self, input, return_kl=True): if self.dnn_to_bnn_flag: return_kl = False 
sigma_weight = torch.log1p(torch.exp(self.rho_weight)) - weight = self.mu_weight + \ - (sigma_weight * self.eps_weight.data.normal_()) + eps_weight = self.eps_weight.data.normal_() + tmp_result = sigma_weight * eps_kernel + weight = self.mu_weight + tmp_result + if return_kl: kl_weight = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) @@ -162,6 +173,20 @@ def forward(self, input, return_kl=True): self.prior_bias_sigma) out = F.linear(input, weight, bias) + + if self.quant_prepare: + # quint8 quantstub + input = self.quint_quant[0](input) # input + out = self.quint_quant[1](out) # output + + # qint8 quantstub + sigma_weight = self.qint_quant[0](sigma_weight) # weight + mu_weight = self.qint_quant[1](self.mu_weight) # weight + eps_weight = self.qint_quant[2](eps_weight) # random variable + tmp_result =self.qint_quant[3](tmp_result) # multiply activation + weight = self.qint_quant[4](weight) # add activatation + + if return_kl: if self.mu_bias is not None: kl = kl_weight + kl_bias diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py index e666f9b..d2e48bc 100644 --- a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -53,6 +53,7 @@ def __init__(self, out_features) self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -168,7 +169,26 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.dnn_to_bnn_flag: return_kl = False - if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if self.quant_dict is not None: + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) + bias = None + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, self.quant_dict[3]['scale'], self.quant_dict[3]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.linear(input, weight, bias, scale=self.quant_dict[4]['scale'], zero_point=self.quant_dict[4]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + out = out.dequantize() + + elif not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() self.is_dequant = True diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index 37441b5..7660aa0 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -101,6 +101,15 @@ def qbnn_linear_layer(d): out_features=d.out_features, ) qbnn_layer.__dict__.update(d.__dict__) + + if d.quant_prepare: + qbnn_layer.quant_dict = [] + for qstub in d.qint_quant: + qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict = qbnn_layer.quant_dict[2:] + for qstub in d.quint_quant: + qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quantize() if d.dnn_to_bnn_flag: qbnn_layer.dnn_to_bnn_flag = True From 87488e2887be351060b46e8048e7521994a0e41d Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 6 Mar 2023 02:54:34 -0500 Subject: [PATCH 61/69] qbnn performance test --- .../layers/variational_layers/conv_variational.py | 6 +++--- .../layers/variational_layers/linear_variational.py | 8 +++++--- .../variational_layers/quantize_linear_variational.py | 6 +++--- bayesian_torch/models/bnn_to_qbnn.py | 2 ++ 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index bb7d1a7..0d85f09 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -48,7 +48,7 @@ from torch.nn import Parameter from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size import math -from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver, MinMaxObserver from torch.quantization.qconfig import QConfig __all__ = [ @@ -301,9 +301,9 @@ def __init__(self, def prepare(self): self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(weight=HistogramObserver.with_args(dtype=torch.qint8), activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(5)]) self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(weight=HistogramObserver.with_args(dtype=torch.quint8), activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) self.dequant = torch.quantization.DeQuantStub() self.quant_prepare=True diff --git a/bayesian_torch/layers/variational_layers/linear_variational.py b/bayesian_torch/layers/variational_layers/linear_variational.py index d69bfff..8bdf644 100644 --- a/bayesian_torch/layers/variational_layers/linear_variational.py +++ b/bayesian_torch/layers/variational_layers/linear_variational.py @@ -47,6 +47,8 @@ from torch.nn import Module, Parameter from ..base_variational_layer import BaseVariationalLayer_ import math +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver, MinMaxObserver +from torch.quantization.qconfig import QConfig class LinearReparameterization(BaseVariationalLayer_): @@ -120,9 +122,9 @@ def __init__(self, 
def prepare(self): self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(weight=HistogramObserver.with_args(dtype=torch.qint8), activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(5)]) self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(weight=HistogramObserver.with_args(dtype=torch.quint8), activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) self.dequant = torch.quantization.DeQuantStub() self.quant_prepare=True @@ -157,7 +159,7 @@ def forward(self, input, return_kl=True): return_kl = False sigma_weight = torch.log1p(torch.exp(self.rho_weight)) eps_weight = self.eps_weight.data.normal_() - tmp_result = sigma_weight * eps_kernel + tmp_result = sigma_weight * eps_weight weight = self.mu_weight + tmp_result if return_kl: diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py index d2e48bc..34a970f 100644 --- a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -118,8 +118,8 @@ def quantize(self): delattr(self, "mu_weight") delattr(self, "rho_weight") - self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) - self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + self.quantized_mu_bias = self.mu_bias#Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = torch.log1p(torch.exp(self.rho_bias))#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) delattr(self, "mu_bias") delattr(self, "rho_bias") @@ -171,7 +171,7 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.quant_dict is not None: eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. 
- weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) bias = None diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index 7660aa0..d201e75 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -200,6 +200,8 @@ def bnn_to_qbnn(m, fuse_conv_bn=False): if m._modules[name]._modules: if "Conv" in m._modules[name].__class__.__name__: setattr(m, name, qbnn_conv_layer(m._modules[name])) + elif "Linear" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_linear_layer(m._modules[name])) else: bnn_to_qbnn(m._modules[name], fuse_conv_bn=fuse_conv_bn) elif "Linear" in m._modules[name].__class__.__name__: From 1e8bd696aaefff4281a56d987b944a0f89e16626 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Tue, 7 Mar 2023 18:32:22 -0500 Subject: [PATCH 62/69] fix accuracy drop --- bayesian_torch/models/bnn_to_qbnn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index d201e75..09dcaed 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -165,8 +165,8 @@ def qbnn_batchnorm2d_layer(d): # qbnn_layer.bias = Parameter(get_quantized_tensor(d.bias), requires_grad=False) # qbnn_layer.running_mean = Parameter(get_quantized_tensor(d.running_mean), requires_grad=False) # qbnn_layer.running_var = Parameter(get_quantized_tensor(d.running_var), requires_grad=False) - qbnn_layer.scale = Parameter(torch.tensor([0.1]), requires_grad=False) - qbnn_layer.zero_point = Parameter(torch.tensor([128]), requires_grad=False) + # qbnn_layer.scale = Parameter(torch.tensor([0.1]), requires_grad=False) + # qbnn_layer.zero_point = Parameter(torch.tensor([128]), requires_grad=False) return qbnn_layer From b3d998094238164405c2ba0a73860b88eadf1b18 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Mar 2023 20:22:45 -0400 Subject: [PATCH 63/69] support load and store quantized models --- .../main_bayesian_imagenet_bnn2qbnn.py | 83 ++++++++++++------- bayesian_torch/models/bnn_to_qbnn.py | 12 +-- 2 files changed, 57 insertions(+), 38 deletions(-) diff --git a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py index 2de3604..73dea9b 100644 --- a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py +++ b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py @@ -10,14 +10,17 @@ import torch.optim import torch.utils.data from torch.utils.tensorboard import SummaryWriter +import torchvision import torchvision.transforms as transforms import torchvision.datasets as datasets +import bayesian_torch import bayesian_torch.models.bayesian.resnet_variational_large as resnet import numpy as np from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn -import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet -# import bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet +from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn +# import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet +import 
bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet torch.cuda.is_available = lambda : False os.environ["CUDA_VISIBLE_DEVICES"] = "-1" @@ -68,7 +71,7 @@ "--save-dir", dest="save_dir", help="The directory used to save the trained models", - default="./checkpoint/bayesian", + default="../../bayesian-torch-20221214/bayesian_torch/checkpoint/bayesian", type=str, ) parser.add_argument( @@ -134,7 +137,7 @@ help="use tensorboard for logging and visualization of training progress", ) -def evaluate(args, model, val_loader): +def evaluate(args, model, val_loader, calibration=False): pred_probs_mc = [] test_loss = 0 correct = 0 @@ -159,6 +162,9 @@ def evaluate(args, model, val_loader): i+=1 end = time.time() print("inference throughput: ", i*args.val_batch_size / (end - begin), " images/s") + # break + if calibration and i==3: + break output = torch.cat(output_list, 1) output = torch.nn.functional.softmax(output, dim=2) @@ -232,7 +238,7 @@ def main(): tb_writer = None - valdir = os.path.join(args.data, 'Imagenet_2012Val') + valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) val_dataset = datasets.ImageFolder( @@ -256,6 +262,20 @@ def main(): os.makedirs(args.save_dir) if args.mode == "test": + const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": args.bnn_rho_init, + "type": "Flipout" if args.use_flipout_layers else "Reparameterization", # Flipout or Reparameterization + "moped_enable": moped_enable, # initialize mu/sigma from the dnn weights + "moped_delta": args.moped_delta_factor, + } + quantizable_model = torchvision.models.quantization.resnet50() + dnn_to_bnn(quantizable_model, const_bnn_prior_parameters) + model = torch.nn.DataParallel(quantizable_model) + + checkpoint_file = args.save_dir + "/bayesian_{}_imagenet.pth".format(args.arch) checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) @@ -263,38 +283,37 @@ def main(): model.module = model.module.cpu() mp = bayesian_torch.quantization.prepare(model) - evaluate(args, mp, val_loader) # calibration + evaluate(args, mp, val_loader, calibration=True) # calibration qmodel = bayesian_torch.quantization.convert(mp) evaluate(args, qmodel, val_loader) + # save weights + save_checkpoint( + { + 'epoch': None, + 'state_dict': qmodel.state_dict(), + 'best_prec1': None, + }, + True, + filename=os.path.join( + args.save_dir, + 'quantized_bayesian_{}_imagenetv2.pth'.format(args.arch))) + + # reconstruct (no calibration) + quantizable_model = torchvision.models.quantization.resnet50() + dnn_to_bnn(quantizable_model, const_bnn_prior_parameters) + model = torch.nn.DataParallel(quantizable_model) + mp = bayesian_torch.quantization.prepare(model) + qmodel1 = bayesian_torch.quantization.convert(mp) + # load + checkpoint_file = args.save_dir + "/quantized_bayesian_{}_imagenetv2.pth".format(args.arch) + checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + qmodel1.load_state_dict(checkpoint["state_dict"]) + evaluate(args, qmodel1, val_loader) - # bnn_to_qbnn(model, fuse_conv_bn=False) # only replaces linear and conv layers - - # model = model.cpu() - # save weights - # save_checkpoint( - # { - # 'epoch': None, - # 'state_dict': model.state_dict(), - # 'best_prec1': None, - # }, - # True, - # filename=os.path.join( - # args.save_dir, - # 'quantized_bayesian_q{}_imagenet.pth'.format(args.arch))) - - # qmodel = 
torch.nn.DataParallel(qresnet.__dict__['q'+args.arch](bias=False)) # set bias=True to make qconv has bias - # qmodel.module.quant_then_dequant(qmodel, fuse_conv_bn=False) - - # load weights - # checkpoint_file = args.save_dir + "/quantized_bayesian_q{}_imagenet.pth".format(args.arch) - # checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) - # qmodel.load_state_dict(checkpoint["state_dict"]) - - # qmodel.load_state_dict(model.state_dict()) - # evaluate(args, qmodel, val_loader) + return mp, qmodel, qmodel1 if __name__ == "__main__": - main() + mp, qmodel, qmodel1 = main() diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index 09dcaed..85953cf 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -103,12 +103,12 @@ def qbnn_linear_layer(d): qbnn_layer.__dict__.update(d.__dict__) if d.quant_prepare: - qbnn_layer.quant_dict = [] + qbnn_layer.quant_dict = nn.ModuleList() for qstub in d.qint_quant: - qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict.append(nn.ParameterDict({'scale': torch.nn.Parameter(qstub.scale.float()), 'zero_point': torch.nn.Parameter(qstub.zero_point.float())})) qbnn_layer.quant_dict = qbnn_layer.quant_dict[2:] for qstub in d.quint_quant: - qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict.append(nn.ParameterDict({'scale': torch.nn.Parameter(qstub.scale.float()), 'zero_point': torch.nn.Parameter(qstub.zero_point.float())})) qbnn_layer.quantize() if d.dnn_to_bnn_flag: @@ -130,12 +130,12 @@ def qbnn_conv_layer(d): qbnn_layer.__dict__.update(d.__dict__) if d.quant_prepare: - qbnn_layer.quant_dict = [] + qbnn_layer.quant_dict = nn.ModuleList() for qstub in d.qint_quant: - qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict.append(nn.ParameterDict({'scale': torch.nn.Parameter(qstub.scale.float()), 'zero_point': torch.nn.Parameter(qstub.zero_point.float())})) qbnn_layer.quant_dict = qbnn_layer.quant_dict[2:] for qstub in d.quint_quant: - qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict.append(nn.ParameterDict({'scale': torch.nn.Parameter(qstub.scale.float()), 'zero_point': torch.nn.Parameter(qstub.zero_point.float())})) qbnn_layer.quantize() if d.dnn_to_bnn_flag: From ccc52ee3cf9740c6b3dce0f859dd4533ed3093d6 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Mar 2023 22:21:43 -0400 Subject: [PATCH 64/69] calibration support for quantized flipout layers --- .../layers/flipout_layers/conv_flipout.py | 48 +++++++++- .../layers/flipout_layers/linear_flipout.py | 45 ++++++++-- .../flipout_layers/quantized_conv_flipout.py | 90 ++++++++++++------- .../quantized_linear_flipout.py | 89 ++++++++++++------ 4 files changed, 202 insertions(+), 70 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index c92d24b..1bf0405 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -37,6 +37,8 @@ import torch.nn as nn import torch.nn.functional as F from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver, MinMaxObserver +from torch.quantization.qconfig 
import QConfig from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform @@ -136,6 +138,15 @@ def __init__(self, self.register_buffer('prior_bias_sigma', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(4)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(8)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): # prior values @@ -303,6 +314,15 @@ def __init__(self, self.register_buffer('prior_bias_sigma', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(4)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(8)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): # prior values @@ -365,18 +385,38 @@ def forward(self, x, return_kl=True): self.prior_bias_sigma) # perturbed feedforward - perturbed_outputs = F.conv2d(x * sign_input, + x_tmp = x * sign_input + perturbed_outputs_tmp = F.conv2d(x * sign_input, weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, dilation=self.dilation, - groups=self.groups) * sign_output + groups=self.groups) + perturbed_outputs = perturbed_outputs_tmp * sign_output + out = outputs + perturbed_outputs + + if self.quant_prepare: + # quint8 quantstub + input = self.quint_quant[0](input) # input + outputs = self.quint_quant[1](outputs) # output + sign_input = self.quint_quant[2](sign_input) + sign_output = self.quint_quant[3](sign_output) + x_tmp = self.quint_quant[4](x_tmp) + perturbed_outputs_tmp = self.quint_quant[5](perturbed_outputs_tmp) # output + perturbed_outputs = self.quint_quant[6](perturbed_outputs) # output + out = self.quint_quant[7](out) # output + + # qint8 quantstub + sigma_weight = self.qint_quant[0](sigma_weight) # weight + mu_kernel = self.qint_quant[1](self.mu_kernel) # weight + eps_kernel = self.qint_quant[2](eps_kernel) # random variable + delta_kernel =self.qint_quant[3](delta_kernel) # multiply activation # returning outputs + perturbations if return_kl: - return outputs + perturbed_outputs, kl - return outputs + perturbed_outputs + return out, kl + return out class Conv3dFlipout(BaseVariationalLayer_): diff --git a/bayesian_torch/layers/flipout_layers/linear_flipout.py b/bayesian_torch/layers/flipout_layers/linear_flipout.py index af34d5d..3555290 100644 --- a/bayesian_torch/layers/flipout_layers/linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/linear_flipout.py @@ -40,6 +40,8 @@ from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform from ..base_variational_layer import BaseVariationalLayer_ +from 
torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver, MinMaxObserver +from torch.quantization.qconfig import QConfig __all__ = ["LinearFlipout"] @@ -107,6 +109,15 @@ def __init__(self, self.register_buffer('eps_bias', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(4)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(8)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): # init prior mu @@ -136,7 +147,9 @@ def forward(self, x, return_kl=True): return_kl = False # sampling delta_W sigma_weight = torch.log1p(torch.exp(self.rho_weight)) - delta_weight = (sigma_weight * self.eps_weight.data.normal_()) + eps_weight = self.eps_weight.data.normal_() + delta_weight = sigma_weight * eps_weight + # delta_weight = (sigma_weight * self.eps_weight.data.normal_()) # get kl divergence if return_kl: @@ -153,14 +166,32 @@ def forward(self, x, return_kl=True): # linear outputs outputs = F.linear(x, self.mu_weight, self.mu_bias) - sign_input = x.clone().uniform_(-1, 1).sign() sign_output = outputs.clone().uniform_(-1, 1).sign() - - perturbed_outputs = F.linear(x * sign_input, delta_weight, - bias) * sign_output + x_tmp = x * sign_input + perturbed_outputs_tmp = F.linear(x_tmp, delta_weight, bias) + perturbed_outputs = perturbed_outputs_tmp * sign_output + out = outputs + perturbed_outputs + + if self.quant_prepare: + # quint8 quantstub + input = self.quint_quant[0](input) # input + outputs = self.quint_quant[1](outputs) # output + sign_input = self.quint_quant[2](sign_input) + sign_output = self.quint_quant[3](sign_output) + x_tmp = self.quint_quant[4](x_tmp) + perturbed_outputs_tmp = self.quint_quant[5](perturbed_outputs_tmp) # output + perturbed_outputs = self.quint_quant[6](perturbed_outputs) # output + out = self.quint_quant[7](out) # output + + # qint8 quantstub + sigma_weight = self.qint_quant[0](sigma_weight) # weight + mu_weight = self.qint_quant[1](self.mu_weight) # weight + eps_weight = self.qint_quant[2](eps_weight) # random variable + delta_weight =self.qint_quant[3](delta_weight) # multiply activation + # returning outputs + perturbations if return_kl: - return outputs + perturbed_outputs, kl - return outputs + perturbed_outputs + return out, kl + return out diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index cf771c7..55acd67 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -284,6 +284,7 @@ def __init__(self, self.bn_eps = None self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -425,40 +426,67 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 if self.dnn_to_bnn_flag: return_kl = False - if x.dtype!=torch.quint8: - x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) - - bias = 
None - if self.bias: - bias = self.quantized_mu_bias - - outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - - # sampling perturbation signs - sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() - sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() - sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) - sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) - - # getting perturbation weights - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) - new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) - delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) - bias = None if self.bias: - eps_bias = self.eps_bias.data.normal_() - bias = (self.quantized_sigma_bias * eps_bias) + bias = self.quantized_mu_bias # TODO: check correctness + + if self.quant_dict is not None: + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + + if x.dtype!=torch.quint8: # check if input has been quantized + x = torch.quantize_per_tensor(x, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=self.quant_dict[3]['scale'], zero_point=self.quant_dict[3]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, self.quant_dict[4]['scale'], self.quant_dict[4]['zero_point'], torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, self.quant_dict[5]['scale'], self.quant_dict[5]['zero_point'], torch.quint8) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, self.quant_dict[6]['scale'], self.quant_dict[6]['zero_point']) + perturbed_outputs = torch.nn.quantized.functional.conv2d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=self.quant_dict[7]['scale'], zero_point=self.quant_dict[7]['zero_point']) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, self.quant_dict[8]['scale'], self.quant_dict[8]['zero_point']) + out = torch.ops.quantized.add(outputs, perturbed_outputs, self.quant_dict[9]['scale'], self.quant_dict[9]['zero_point']) + out = out.dequantize() - # perturbed feedforward - x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) - - perturbed_outputs = torch.nn.quantized.functional.conv2d(x, - weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, - dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) - perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, 
default_scale, default_zero_point) - out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + else: + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv2d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) if return_kl: return out, 0 diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py index 289da98..388817d 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -54,6 +54,7 @@ def __init__(self, out_features) self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -118,8 +119,8 @@ def quantize(self): delattr(self, "mu_weight") delattr(self, "rho_weight") - self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) - self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + self.quantized_mu_bias = self.mu_bias#Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = torch.log1p(torch.exp(self.rho_bias))#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) delattr(self, "mu_bias") delattr(self, "rho_bias") @@ -173,32 +174,64 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 self.is_dequant = True bias = self.mu_bias - outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - - # sampling perturbation signs - sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() - sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() - sign_input 
= torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) - sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) - - # getting perturbation weights - eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) - new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) - delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) - - bias = None - if self.quantized_sigma_bias is not None: - eps_bias = self.eps_bias.data.normal_() - bias = (self.sigma_bias * eps_bias) - - # perturbed feedforward - x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) - - perturbed_outputs = torch.nn.quantized.functional.linear(x, - weight=delta_weight, bias=bias, scale=default_scale, zero_point=default_zero_point) - perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) - out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - out = out.dequantize() + if self.quant_dict is not None: + + # getting perturbation weights + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) + delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + + bias = None + if self.quantized_sigma_bias is not None: + eps_bias = self.eps_bias.data.normal_() + bias = (self.sigma_bias * eps_bias) + + if x.dtype!=torch.quint8: # check if input has been quantized + x = torch.quantize_per_tensor(x, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=self.quant_dict[3]['scale'], zero_point=self.quant_dict[3]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, self.quant_dict[4]['scale'], self.quant_dict[4]['zero_point'], torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, self.quant_dict[5]['scale'], self.quant_dict[5]['zero_point'], torch.quint8) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, self.quant_dict[6]['scale'], self.quant_dict[6]['zero_point']) + perturbed_outputs = torch.nn.quantized.functional.linear(x, + weight=delta_weight, bias=bias, scale=self.quant_dict[7]['scale'], zero_point=self.quant_dict[7]['zero_point']) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, self.quant_dict[8]['scale'], self.quant_dict[8]['zero_point']) + out = torch.ops.quantized.add(outputs, perturbed_outputs, self.quant_dict[9]['scale'], self.quant_dict[9]['zero_point']) + out = out.dequantize() + + else: + + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, 
torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + + bias = None + if self.quantized_sigma_bias is not None: + eps_bias = self.eps_bias.data.normal_() + bias = (self.sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.linear(x, + weight=delta_weight, bias=bias, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + out = out.dequantize() if return_kl: return out, 0 From 17480e67f3af59703356aada07fe126900c35543 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 19 Mar 2023 18:46:02 -0400 Subject: [PATCH 65/69] fix qconv2d flipout layers --- bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 55acd67..18fd2ce 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -454,7 +454,7 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 dilation=self.dilation, groups=self.groups, scale=self.quant_dict[7]['scale'], zero_point=self.quant_dict[7]['zero_point']) perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, self.quant_dict[8]['scale'], self.quant_dict[8]['zero_point']) out = torch.ops.quantized.add(outputs, perturbed_outputs, self.quant_dict[9]['scale'], self.quant_dict[9]['zero_point']) - out = out.dequantize() + # out = out.dequantize() else: if x.dtype!=torch.quint8: From 18b296fa981fde4ca0cf9d510d97a0e08e927219 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 29 Mar 2023 14:27:44 -0400 Subject: [PATCH 66/69] modify the bias to a torch.Parameter to allow for JIT tracing --- .../layers/variational_layers/quantize_linear_variational.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py index 34a970f..a12a569 100644 --- a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -119,7 +119,7 @@ def quantize(self): delattr(self, "rho_weight") self.quantized_mu_bias = self.mu_bias#Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) - self.quantized_sigma_bias = torch.log1p(torch.exp(self.rho_bias))#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False)#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) delattr(self, "mu_bias") delattr(self, "rho_bias") @@ -131,7 +131,7 @@ 
def dequantize(self): # Deprecated self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.2, default_zero_point=128, return_kl=True): """ Forward pass Parameters From 69dc4db534cc81ad031a2c4263a3733b5b546dc7 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 29 Mar 2023 14:32:23 -0400 Subject: [PATCH 67/69] pre-sampling for flipout layers --- .../flipout_layers/quantized_conv_flipout.py | 24 ++++++++++++++-- .../quantized_linear_flipout.py | 28 +++++++++++++++++-- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 18fd2ce..4be011a 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -38,6 +38,7 @@ from torch.nn import Parameter from ..base_variational_layer import BaseVariationalLayer_ from .conv_flipout import * +import random from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform @@ -285,6 +286,9 @@ def __init__(self, self.is_dequant = False self.quant_dict = None + self.presampled_input_perturb = None + self.presampled_output_perturb = None + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -442,8 +446,24 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 self.dilation, self.groups, scale=self.quant_dict[3]['scale'], zero_point=self.quant_dict[3]['zero_point']) # input: quint8, weight: qint8, bias: fp32 # sampling perturbation signs - sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() - sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + input_tsize = torch.prod(torch.tensor(x.shape))*1 + output_tsize = torch.prod(torch.tensor(outputs.shape))*1 + + if self.presampled_input_perturb is None: + self.presampled_input_perturb = torch.randint(0, 1, (input_tsize + torch.prod(torch.tensor(x.shape)),)).float() + self.presampled_input_perturb[self.presampled_input_perturb==0] = -1 + + if self.presampled_output_perturb is None: + self.presampled_output_perturb = torch.randint(0, 1, (output_tsize + torch.prod(torch.tensor(outputs.shape)),)).float() + self.presampled_output_perturb[self.presampled_output_perturb==0] = -1 + + st = random.randint(0, input_tsize) + sign_input = self.presampled_input_perturb[st:st+torch.prod(torch.tensor(x.shape))].reshape(x.shape) + + st = random.randint(0, output_tsize) + sign_output = self.presampled_output_perturb[st:st+torch.prod(torch.tensor(outputs.shape))].reshape(outputs.shape) + # sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + # sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() sign_input = torch.quantize_per_tensor(sign_input, self.quant_dict[4]['scale'], self.quant_dict[4]['zero_point'], torch.quint8) sign_output = torch.quantize_per_tensor(sign_output, self.quant_dict[5]['scale'], self.quant_dict[5]['zero_point'], torch.quint8) diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py index 388817d..3cce873 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py +++ 
b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -39,6 +39,7 @@ from torch.nn import Module, Parameter from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform +import random from .linear_flipout import LinearFlipout @@ -55,6 +56,8 @@ def __init__(self, self.is_dequant = False self.quant_dict = None + self.presampled_input_perturb = None + self.presampled_output_perturb = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -120,7 +123,7 @@ def quantize(self): delattr(self, "rho_weight") self.quantized_mu_bias = self.mu_bias#Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) - self.quantized_sigma_bias = torch.log1p(torch.exp(self.rho_bias))#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False)#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) delattr(self, "mu_bias") delattr(self, "rho_bias") @@ -191,8 +194,27 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=self.quant_dict[3]['scale'], zero_point=self.quant_dict[3]['zero_point']) # input: quint8, weight: qint8, bias: fp32 # sampling perturbation signs - sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() - sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + # sampling perturbation signs + input_tsize = torch.prod(torch.tensor(x.shape))*1 + output_tsize = torch.prod(torch.tensor(outputs.shape))*1 + + if self.presampled_input_perturb is None: + self.presampled_input_perturb = torch.randint(0, 1, (input_tsize + torch.prod(torch.tensor(x.shape)),)).float() + self.presampled_input_perturb[self.presampled_input_perturb==0] = -1 + + if self.presampled_output_perturb is None: + self.presampled_output_perturb = torch.randint(0, 1, (output_tsize + torch.prod(torch.tensor(outputs.shape)),)).float() + self.presampled_output_perturb[self.presampled_output_perturb==0] = -1 + + st = random.randint(0, input_tsize) + sign_input = self.presampled_input_perturb[st:st+torch.prod(torch.tensor(x.shape))].reshape(x.shape) + + st = random.randint(0, output_tsize) + sign_output = self.presampled_output_perturb[st:st+torch.prod(torch.tensor(outputs.shape))].reshape(outputs.shape) + + + # sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + # sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() sign_input = torch.quantize_per_tensor(sign_input, self.quant_dict[4]['scale'], self.quant_dict[4]['zero_point'], torch.quint8) sign_output = torch.quantize_per_tensor(sign_output, self.quant_dict[5]['scale'], self.quant_dict[5]['zero_point'], torch.quint8) From c3e47ed1c9005776580678e3d47f3bed4495431d Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 19 Apr 2023 12:24:50 -0400 Subject: [PATCH 68/69] fix input --- bayesian_torch/layers/flipout_layers/conv_flipout.py | 2 +- bayesian_torch/layers/flipout_layers/linear_flipout.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index 1bf0405..6463028 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -398,7 +398,7 @@ def 
forward(self, x, return_kl=True): if self.quant_prepare: # quint8 quantstub - input = self.quint_quant[0](input) # input + x = self.quint_quant[0](x) # input outputs = self.quint_quant[1](outputs) # output sign_input = self.quint_quant[2](sign_input) sign_output = self.quint_quant[3](sign_output) diff --git a/bayesian_torch/layers/flipout_layers/linear_flipout.py b/bayesian_torch/layers/flipout_layers/linear_flipout.py index 3555290..a3de14e 100644 --- a/bayesian_torch/layers/flipout_layers/linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/linear_flipout.py @@ -175,7 +175,7 @@ def forward(self, x, return_kl=True): if self.quant_prepare: # quint8 quantstub - input = self.quint_quant[0](input) # input + x = self.quint_quant[0](x) # input outputs = self.quint_quant[1](outputs) # output sign_input = self.quint_quant[2](sign_input) sign_output = self.quint_quant[3](sign_output) From 86adb6d6fa7dced490f9a24b95e54ebc1c43ea0a Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 19 Apr 2023 12:25:31 -0400 Subject: [PATCH 69/69] fix batchnorm --- bayesian_torch/layers/batchnorm.py | 23 ++++++++++++------- .../bayesian/resnet_variational_large.py | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/bayesian_torch/layers/batchnorm.py b/bayesian_torch/layers/batchnorm.py index 145997c..25ab8f3 100644 --- a/bayesian_torch/layers/batchnorm.py +++ b/bayesian_torch/layers/batchnorm.py @@ -54,7 +54,6 @@ def _check_input_dim(self, input): input.dim())) def forward(self, input): - self._check_input_dim(input[0]) exponential_average_factor = 0.0 if self.training and self.track_running_stats: self.num_batches_tracked += 1 @@ -63,13 +62,21 @@ def forward(self, input): else: # use exponential moving average exponential_average_factor = self.momentum - out = F.batch_norm(input[0], self.running_mean, self.running_var, - self.weight, self.bias, self.training - or not self.track_running_stats, - exponential_average_factor, self.eps) - kl = 0 - return out, kl - + if len(input) == 2: + self._check_input_dim(input[0]) + out = F.batch_norm(input[0], self.running_mean, self.running_var, + self.weight, self.bias, self.training + or not self.track_running_stats, + exponential_average_factor, self.eps) + kl = 0 + return out, kl + else: + out = F.batch_norm(input, self.running_mean, self.running_var, + self.weight, self.bias, self.training + or not self.track_running_stats, + exponential_average_factor, self.eps) + return out + class BatchNorm1dLayer(nn.Module): def __init__(self, diff --git a/bayesian_torch/models/bayesian/resnet_variational_large.py b/bayesian_torch/models/bayesian/resnet_variational_large.py index 6fdf561..e5fb9fd 100644 --- a/bayesian_torch/models/bayesian/resnet_variational_large.py +++ b/bayesian_torch/models/bayesian/resnet_variational_large.py @@ -200,7 +200,7 @@ def _make_layer(self, block, planes, blocks, stride=1): posterior_mu_init=posterior_mu_init, posterior_rho_init=posterior_rho_init, bias=False), - nn.BatchNorm2d(planes * block.expansion), + BatchNorm2dLayer(planes * block.expansion), ) layers = []
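
Taken together, the quantized flipout forward passes patched above implement the standard flipout estimator with int8 kernels: torch.ops.quantized.mul for the sign flips, the torch.nn.quantized.functional ops for the mean and perturbation paths, and torch.ops.quantized.add to combine them, each carrying its own output scale and zero point from quant_dict. For reference, the same computation in floating point is the short sketch below; mu_weight, sigma_weight and mu_bias are placeholder arguments rather than the exact attributes of the quantized layers, and the bias perturbation (sigma_bias * eps_bias) is omitted for brevity.

    import torch
    import torch.nn.functional as F

    def flipout_linear_fp32(x, mu_weight, sigma_weight, mu_bias=None):
        # deterministic mean path: y = x @ mu_weight.T + mu_bias
        outputs = F.linear(x, mu_weight, mu_bias)
        # per-example random sign flips on the input and the output
        sign_input = torch.empty_like(x).uniform_(-1, 1).sign()
        sign_output = torch.empty_like(outputs).uniform_(-1, 1).sign()
        # sampled weight perturbation: delta_W = sigma * eps, eps ~ N(0, 1)
        delta_weight = sigma_weight * torch.randn_like(sigma_weight)
        # flipout: push the sign-flipped input through delta_W, then flip the result
        perturbed_outputs = F.linear(x * sign_input, delta_weight) * sign_output
        return outputs + perturbed_outputs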
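
PATCH 66/69 stores the derived bias sigma as a torch.nn.Parameter with requires_grad=False instead of a plain tensor attribute, so that it is registered on the module and therefore visible to torch.jit.trace and to state_dict. Reduced to its essentials, the pattern looks like the sketch below; freeze_bias_sigma and the attribute names are illustrative, not an API of the repository.

    import torch
    from torch.nn import Parameter

    @torch.no_grad()
    def freeze_bias_sigma(layer):
        # softplus reparameterization used by the layers: sigma = log(1 + exp(rho))
        sigma_bias = torch.log1p(torch.exp(layer.rho_bias))
        # a non-trainable Parameter becomes part of the module's registered state,
        # so tracing and checkpointing see a fixed tensor rather than a loose attribute
        layer.quantized_sigma_bias = Parameter(sigma_bias, requires_grad=False)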
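
PATCH 67/69 replaces the per-forward torch.zeros(...).uniform_(-1, 1).sign() calls with sign pools that are sampled once and then sliced at a random offset on every call, trading a little memory for lower sampling overhead. One detail to keep in mind when reusing this trick: the upper bound of torch.randint is exclusive, so torch.randint(0, 1, ...) yields only zeros and, after the == 0 remap, every entry of the pool becomes -1; drawing from torch.randint(0, 2, ...) is what produces a genuine mix of +1 and -1 signs. A self-contained sketch of the idea, with illustrative names (SignPool, draw) rather than the attributes used in the layers:

    import random
    import torch

    class SignPool:
        """Pre-sample a pool of +/-1 signs and slice a random window per forward."""

        def __init__(self, numel, pool_factor=2):
            pool_size = int(numel) * pool_factor
            # randint's upper bound is exclusive: (0, 2) gives a mix of 0s and 1s
            signs = torch.randint(0, 2, (pool_size,)).float()
            signs[signs == 0] = -1.0
            self.pool = signs

        def draw(self, shape):
            n = 1
            for d in shape:
                n *= int(d)
            start = random.randint(0, self.pool.numel() - n)
            return self.pool[start:start + n].reshape(shape)

Inside a forward pass, sign_input = pool_in.draw(x.shape) and sign_output = pool_out.draw(outputs.shape) then stand in for the two uniform_().sign() tensors.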
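
PATCH 69/69 makes the batch-norm wrapper accept either a plain tensor or the (activation, kl) pair returned by the Bayesian layers, which is what the downsample Sequential in resnet_variational_large.py feeds it since the Bayesian conv layer ahead of it returns an (output, kl) tuple, and it swaps nn.BatchNorm2d for BatchNorm2dLayer there. Note that len(input) == 2 is also true for a plain tensor whose batch dimension happens to be 2, so an isinstance check is a slightly more robust way to make the same dispatch; the sketch below uses that check (TupleAwareBatchNorm2d is an illustrative name, not the class shipped by the repository).

    import torch
    import torch.nn as nn

    class TupleAwareBatchNorm2d(nn.BatchNorm2d):
        """Batch norm that passes a KL term through when fed (activation, kl) pairs."""

        def forward(self, input):
            if isinstance(input, (tuple, list)):
                x, _kl = input
                # normalize the activation and forward a zero KL contribution
                return super().forward(x), 0
            return super().forward(input)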